diff --git a/.clang-tidy b/.clang-tidy
index 5a2b0fed5dc..07b79a9504f 100644
--- a/.clang-tidy
+++ b/.clang-tidy
@@ -3,6 +3,7 @@ Checks: |
    clang-diagnostic-*,
    clang-analyzer-*,
    -clang-analyzer-core.NullDereference,
+   -clang-analyzer-core.uninitialized.UndefReturn,
    -clang-analyzer-optin.mpi.MPI-Checker,
    -clang-analyzer-security.FloatLoopCounter,
    bugprone-*,
diff --git a/.github/actions/build_and_check/action.yml b/.github/actions/build_and_check/action.yml
new file mode 100644
index 00000000000..e9ea1f140b5
--- /dev/null
+++ b/.github/actions/build_and_check/action.yml
@@ -0,0 +1,31 @@
+name: 'Build and check'
+description: 'Build espresso and run checks'
+inputs:
+  asan:  # exported as with_asan for maintainer/CI/build_cmake.sh
+    description: 'Whether to build with address sanitizer'
+    required: true
+    default: 'false'
+  ubsan:  # exported as with_ubsan for maintainer/CI/build_cmake.sh
+    description: 'Whether to build with undefined behavior sanitizer'
+    required: true
+    default: 'false'
+  check_skip_long:  # exported as check_skip_long for maintainer/CI/build_cmake.sh
+    description: 'Whether to skip long python tests'
+    required: true
+    default: 'false'
+runs:
+  using: "composite"
+  steps:
+    - run: |
+        brew install boost boost-mpi fftw
+        brew install hdf5-mpi
+        pip3 install numpy cython h5py scipy
+      shell: bash
+    - run: |
+        export myconfig=maxset with_cuda=false test_timeout=600 with_asan=${{ inputs.asan }} with_ubsan=${{ inputs.ubsan }} check_skip_long=${{ inputs.check_skip_long }}
+        bash maintainer/CI/build_cmake.sh
+      shell: bash
+      # Workaround for a problematic interaction between macOS and Open MPI 4:
+      # see https://github.com/open-mpi/ompi/issues/6518
+      env:
+        OMPI_MCA_btl: "self,tcp"
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
deleted file mode 100644
index 8378f07f207..00000000000
--- a/.github/workflows/build.yml
+++ /dev/null
@@ -1,21 +0,0 @@
-name: run tests on mac
-
-on:
-  push:
-  pull_request:
-
-jobs:
-  github_mactest:
-    runs-on: macos-latest
-    steps:
-    - uses: actions/checkout@main
-    - uses: actions/setup-python@v2
-      with:
-          python-version: '3.7'
-    - run: |
-       brew install boost boost-mpi fftw
-       brew install hdf5-mpi
-       pip3 install numpy cython h5py scipy
-    - run: |
-        export myconfig=maxset with_cuda=false test_timeout=600
-        bash maintainer/CI/build_cmake.sh
diff --git a/.github/workflows/push_pull.yml b/.github/workflows/push_pull.yml
new file mode 100644
index 00000000000..b75a93e92d4
--- /dev/null
+++ b/.github/workflows/push_pull.yml
@@ -0,0 +1,52 @@
+name: run tests on mac
+
+on:
+  push:
+  pull_request:
+  schedule:
+    - cron: '0 3 * * *'  # daily at 03:00 (GitHub evaluates cron schedules in UTC)
+
+jobs:
+  regular_check:
+    runs-on: macos-latest
+    if: github.event_name != 'schedule'  # push and pull_request events only
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v2  # pinned to a release tag, like setup-python below
+      - name: Setup Python environment
+        uses: actions/setup-python@v2
+        with:
+          python-version: '3.7'
+      - name: Check without sanitizer
+        uses: ./.github/actions/build_and_check
+        with:
+          asan: 'false'
+          ubsan: 'false'
+          check_skip_long: 'false'
+
+  sanitizer_check:
+    runs-on: macos-latest
+    if: github.event_name == 'schedule'  # scheduled runs only
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v2  # pinned to a release tag, like setup-python below
+      - name: Setup Python environment
+        uses: actions/setup-python@v2
+        with:
+          python-version: '3.7'
+      - name: Check with sanitizer
+        uses: ./.github/actions/build_and_check
+        with:
+          asan: 'true'
+          ubsan: 'true'
+          check_skip_long: 'true'
+      - name: Setting job link variable
+        if: ${{ failure() }}
+        run: |
+          echo "job_link=${GITHUB_SERVER_URL}/${GITHUB_REPOSITORY}/actions/runs/${GITHUB_RUN_ID}" >> "$GITHUB_ENV"
+      - uses: alialaa/issue-action@v1
+        if: ${{ failure() }}
+        with:
+          token: ${{ secrets.GITHUB_TOKEN }}
+          title: Scheduled CI job has failed
+          body: ${{ env.job_link }}
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 03340740354..3683684bf2e 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -1,4 +1,4 @@
-image: docker.pkg.github.com/espressomd/docker/ubuntu-20.04:e583d4b2eb8eedd10068957f952bd67008475ee5
+image: docker.pkg.github.com/espressomd/docker/ubuntu-20.04:063f945eb434f6900402fd412f28a4486288c82b
 
 stages:
   - prepare
@@ -100,6 +100,7 @@ maxset:
      with_scafacos: 'true'
      with_stokesian_dynamics: 'true'
      check_skip_long: 'true'
+     cmake_params: '-DTEST_NP=8'
   script:
     - bash maintainer/CI/build_cmake.sh
   tags:
@@ -130,6 +131,7 @@ ubuntu:wo-dependencies:
   variables:
      myconfig: 'maxset'
      with_cuda: 'false'
+     with_hdf5: 'false'
      make_check_unit_tests: 'false'
      make_check_python: 'false'
   script:
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 45ec3dffa95..489aee2b2b9 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -41,7 +41,6 @@ include(FeatureSummary)
 include(GNUInstallDirs)
 project(ESPResSo)
 include(option_enum)
-include(option_if_available)
 if(POLICY CMP0074)
   # make find_package() use <PackageName>_ROOT variables
   cmake_policy(SET CMP0074 NEW)
@@ -69,12 +68,12 @@ set(CMAKE_FIND_FRAMEWORK LAST)
 # ##############################################################################
 
 option(WITH_PYTHON "Build with Python bindings" ON)
-option_if_available(WITH_GSL "Build with GSL support" ON)
+option(WITH_GSL "Build with GSL support" OFF)
 option(WITH_CUDA "Build with GPU support" OFF)
-option_if_available(WITH_HDF5 "Build with HDF5 support" ON)
+option(WITH_HDF5 "Build with HDF5 support" OFF)
 option(WITH_TESTS "Enable tests" ON)
-option_if_available(WITH_SCAFACOS "Build with ScaFaCoS support" OFF)
-option_if_available(WITH_STOKESIAN_DYNAMICS "Build with Stokesian Dynamics" ON)
+option(WITH_SCAFACOS "Build with ScaFaCoS support" OFF)
+option(WITH_STOKESIAN_DYNAMICS "Build with Stokesian Dynamics" OFF)
 option(WITH_BENCHMARKS "Enable benchmarks" OFF)
 option(WITH_VALGRIND_INSTRUMENTATION
        "Build with valgrind instrumentation markers" OFF)
@@ -192,7 +191,7 @@ if(WITH_HDF5)
   # who are not familiar with the way hdf5 is distributed in Linux package
   # repositories (libhdf5-dev is the serial version).
   set(HDF5_PREFER_PARALLEL 1)
-  find_package(HDF5 "1.8" COMPONENTS C)
+  find_package(HDF5 "1.8" REQUIRED COMPONENTS C)
   if(HDF5_FOUND)
     if(HDF5_IS_PARALLEL)
       set(H5MD 1)
@@ -200,17 +199,8 @@ if(WITH_HDF5)
     else()
       unset(H5MD)
       set(HDF5_FOUND FALSE)
-      if(NOT WITH_HDF5_IS_DEFAULT_VALUE)
-        message(
-          FATAL_ERROR
-            "Optional dependency HDF5 explicitly requested, but parallel version not found."
-        )
-      endif()
+      message(FATAL_ERROR "HDF5 parallel version not found.")
     endif(HDF5_IS_PARALLEL)
-  elseif(NOT WITH_HDF5_IS_DEFAULT_VALUE)
-    message(
-      FATAL_ERROR
-        "Optional dependency HDF5 explicitly requested, but not found.")
   endif(HDF5_FOUND)
 endif(WITH_HDF5)
 
@@ -231,50 +221,24 @@ if(WITH_HDF5 AND EXISTS "${CMAKE_SOURCE_DIR}/.git")
 endif()
 
 if(WITH_SCAFACOS)
-  find_package(PkgConfig)
-  pkg_check_modules(SCAFACOS scafacos)
+  find_package(PkgConfig REQUIRED)
+  pkg_check_modules(SCAFACOS REQUIRED scafacos)
   if(SCAFACOS_FOUND)
     set(SCAFACOS 1)
-  elseif(NOT WITH_SCAFACOS_IS_DEFAULT_VALUE)
-    message(
-      FATAL_ERROR
-        "Optional dependency ScaFaCoS explicitly requested, but not found.")
   endif(SCAFACOS_FOUND)
 endif(WITH_SCAFACOS)
 
 if(WITH_GSL)
+  find_package(GSL REQUIRED)
+else()
   find_package(GSL)
-  if(GSL_FOUND)
-    set(GSL 1)
-  elseif(NOT WITH_GSL_IS_DEFAULT_VALUE)
-    message(
-      FATAL_ERROR "Optional dependency GSL explicitly requested, but not found."
-    )
-  endif(GSL_FOUND)
 endif(WITH_GSL)
 
-find_package(BLAS)
-if(BLAS_FOUND)
-  set(BLAS 1)
-endif()
-find_package(LAPACK)
-if(LAPACK_FOUND)
-  set(LAPACK 1)
-endif()
+if(GSL_FOUND)
+  set(GSL 1)
+endif(GSL_FOUND)
 
 if(WITH_STOKESIAN_DYNAMICS)
-  if(BLAS AND LAPACK)
-    set(STOKESIAN_DYNAMICS 1)
-  endif()
-  if(NOT STOKESIAN_DYNAMICS AND NOT WITH_STOKESIAN_DYNAMICS_IS_DEFAULT_VALUE)
-    message(
-      FATAL_ERROR
-        "Optional feature Stokesian Dynamics explicitly requested, but dependencies not found."
-    )
-  endif()
-endif(WITH_STOKESIAN_DYNAMICS)
-
-if(STOKESIAN_DYNAMICS)
   set(CMAKE_INSTALL_LIBDIR
       "${CMAKE_INSTALL_PREFIX}/${PYTHON_INSTDIR}/espressomd")
   include(FetchContent)
@@ -283,16 +247,17 @@ if(STOKESIAN_DYNAMICS)
     GIT_REPOSITORY https://github.com/hmenke/espresso-stokesian-dynamics.git
     GIT_TAG c14e57655e929)
   FetchContent_GetProperties(stokesian_dynamics)
+  set(STOKESIAN_DYNAMICS 1)
   if(NOT stokesian_dynamics_POPULATED)
     FetchContent_Populate(stokesian_dynamics)
     add_subdirectory(${stokesian_dynamics_SOURCE_DIR}
                      ${stokesian_dynamics_BINARY_DIR})
   endif()
-endif(STOKESIAN_DYNAMICS)
+endif(WITH_STOKESIAN_DYNAMICS)
 
 if(WITH_VALGRIND_INSTRUMENTATION)
-  find_package(PkgConfig)
-  pkg_check_modules(VALGRIND valgrind)
+  find_package(PkgConfig REQUIRED)
+  pkg_check_modules(VALGRIND valgrind REQUIRED)
   if(VALGRIND_FOUND)
     set(VALGRIND_INSTRUMENTATION 1)
     message(STATUS ${VALGRIND_INCLUDE_DIRS})
@@ -306,9 +271,9 @@ endif(WITH_VALGRIND_INSTRUMENTATION)
 
 find_package(MPI 3.0 REQUIRED)
 
-# ##############################################################################
+#
 # Boost
-# ##############################################################################
+#
 
 list(APPEND BOOST_COMPONENTS mpi serialization filesystem system)
 
@@ -423,6 +388,10 @@ target_compile_options(
 
 set(CMAKE_MACOSX_RPATH TRUE)
 
+#
+# Sanitizers
+#
+
 if(WITH_ASAN AND WITH_MSAN)
   message(
     FATAL_ERROR
@@ -448,14 +417,18 @@ endif()
 target_link_libraries(cxx_interface INTERFACE coverage_interface)
 
 #
-# Testing
-# ##############################################################################
+# Static analysis
+#
 
 if(WITH_CLANG_TIDY)
   find_package(ClangTidy "${CMAKE_CXX_COMPILER_VERSION}" EXACT REQUIRED)
   set(CMAKE_CXX_CLANG_TIDY "${CLANG_TIDY_EXE};--extra-arg=--cuda-host-only")
 endif()
 
+#
+# Testing
+#
+
 if(WITH_TESTS)
   enable_testing()
   add_custom_target(check)
@@ -480,6 +453,7 @@ endif(WITH_BENCHMARKS)
 add_subdirectory(doc)
 add_subdirectory(src)
 add_subdirectory(libs)
+
 #
 # Feature summary
 #
diff --git a/cmake/FindCUDACompilerClang.cmake b/cmake/FindCUDACompilerClang.cmake
index 79df6fad7d0..ea8714d99a0 100644
--- a/cmake/FindCUDACompilerClang.cmake
+++ b/cmake/FindCUDACompilerClang.cmake
@@ -77,7 +77,6 @@ function(find_gpu_library)
   endif()
 endfunction(find_gpu_library)
 
-find_gpu_library(VARNAME CUDA_LIBRARY NAMES cuda REQUIRED)
 find_gpu_library(VARNAME CUDART_LIBRARY NAMES cudart REQUIRED)
 find_gpu_library(VARNAME CUFFT_LIBRARY NAMES cufft REQUIRED)
 
diff --git a/cmake/FindCUDACompilerNVCC.cmake b/cmake/FindCUDACompilerNVCC.cmake
index 930518e3dcb..62674892ec1 100644
--- a/cmake/FindCUDACompilerNVCC.cmake
+++ b/cmake/FindCUDACompilerNVCC.cmake
@@ -71,7 +71,6 @@ function(find_gpu_library)
   endif()
 endfunction(find_gpu_library)
 
-find_gpu_library(VARNAME CUDA_LIBRARY NAMES cuda REQUIRED)
 find_gpu_library(VARNAME CUDART_LIBRARY NAMES cudart REQUIRED)
 find_gpu_library(VARNAME CUDA_CUFFT_LIBRARIES NAMES cufft REQUIRED)
 
diff --git a/cmake/option_if_available.cmake b/cmake/option_if_available.cmake
deleted file mode 100644
index f8864511d05..00000000000
--- a/cmake/option_if_available.cmake
+++ /dev/null
@@ -1,41 +0,0 @@
-# Copyright (C) 2020 The ESPResSo project
-#
-# This file is part of ESPResSo.
-#
-# ESPResSo is free software: you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-#
-# ESPResSo is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program.  If not, see <http://www.gnu.org/licenses/>.
-#
-
-# Like `option()`, but create an extra boolean variable to store whether the
-# option was set to its default value or to a user-provided value. With this
-# command, the project can be installed with optional dependencies without
-# the need to provide a list of CMake flags. Unavailable dependencies will be
-# silently ignored. However, if the user specifically requested an optional
-# dependency by passing the corresponding CMake flag, the build system has
-# the possibility to throw an error if the dependency is unavailable.
-#
-# Note that when calling CMake again without clearing the build folder,
-# variables from the previous CMake call are loaded in memory. For example,
-# if the user passed a value to an `option_if_available()` the first time but
-# not the second time, the variable will still be flagged as a user-provided
-# value in the second CMake call.
-macro(option_if_available varname help_text default_value)
-  if(NOT DEFINED ${varname}_IS_DEFAULT_VALUE)
-    if("${${varname}}" STREQUAL "")
-      set(${varname}_IS_DEFAULT_VALUE TRUE CACHE INTERNAL "does ${varname} contain the default value?")
-    else()
-      set(${varname}_IS_DEFAULT_VALUE FALSE CACHE INTERNAL "does ${varname} contain the default value?")
-    endif()
-  endif()
-  option(${varname} ${help_text} ${default_value})
-endmacro()
diff --git a/doc/sphinx/analysis.rst b/doc/sphinx/analysis.rst
index d15f3cf7707..a346007fd20 100644
--- a/doc/sphinx/analysis.rst
+++ b/doc/sphinx/analysis.rst
@@ -411,10 +411,21 @@ or bin edges for the axes. Example::
                        density_profile.min_y, density_profile.max_y])
     plt.show()
 
+Observables based on cylindrical coordinates are also available.
+They require special parameters if the cylindrical coordinate system is non-standard, e.g. if you want the origin of the cylindrical coordinates to be at a specific location in the box, or if you want to make use of symmetries along an axis that is not parallel to the z-axis.
+For this purpose, use :class:`espressomd.math.CylindricalTransformationParameters` to create a consistent set of the required parameters. Example::
+
+    import espressomd.math
+    
+    # shifted and rotated cylindrical coordinates
+    cyl_transform_params = espressomd.math.CylindricalTransformationParameters(
+        center=[5.0, 5.0, 0.0], axis=[0, 1, 0], orientation=[0, 0, 1])
+
     # histogram in cylindrical coordinates
     density_profile = espressomd.observables.CylindricalDensityProfile(
-        ids=[0, 1], center=[5.0, 5.0, 0.0], axis=[0, 0, 1],
-        n_r_bins=8, min_r=0.0, max_r=4.0,
+        ids=[0, 1],
+        transform_params=cyl_transform_params,
+        n_r_bins=8, min_r=1.0, max_r=4.0,
         n_phi_bins=16, min_phi=-np.pi, max_phi=np.pi,
         n_z_bins=4, min_z=4.0, max_z=8.0)
     obs_data = density_profile.calculate()
@@ -779,4 +790,3 @@ Note that the cluster objects do not contain copies of the particles, but refer
 
 
 
-
diff --git a/doc/sphinx/conf.py.in b/doc/sphinx/conf.py.in
index aa3f9fd2d61..d85f456e91e 100644
--- a/doc/sphinx/conf.py.in
+++ b/doc/sphinx/conf.py.in
@@ -115,6 +115,9 @@ pygments_style = 'sphinx'
 # If true, `todo` and `todoList` produce output, else they produce nothing.
 todo_include_todos = True
 
+# sphinxcontrib.bibtex options
+bibtex_bibfiles = ['zrefs.bib']
+
 
 # -- Options for HTML output ----------------------------------------------
 
diff --git a/doc/sphinx/electrostatics.rst b/doc/sphinx/electrostatics.rst
index d39e7db170e..7c6061ab50c 100644
--- a/doc/sphinx/electrostatics.rst
+++ b/doc/sphinx/electrostatics.rst
@@ -234,7 +234,7 @@ using it.
 Electrostatic Layer Correction (ELC)
 ------------------------------------
 
-:class:`espressomd.electrostatic_extensions.ELC`
+:class:`espressomd.electrostatics.ELC`
 
 *ELC* is an extension of the P3M electrostatics solver for explicit 2D periodic
 systems. It can account for different dielectric jumps on both sides of the
@@ -260,8 +260,9 @@ Usage notes:
 
 *ELC* is an |es| actor and is used with::
 
-    import espressomd.electrostatic_extensions
-    elc = electrostatic_extensions.ELC(gap_size=box_l * 0.2, maxPWerror=1e-3)
+    import espressomd.electrostatics
+    p3m = espressomd.electrostatics.P3M(prefactor=1, accuracy=1e-4)
+    elc = espressomd.electrostatics.ELC(p3m_actor=p3m, gap_size=box_l * 0.2, maxPWerror=1e-3)
     system.actors.add(elc)
 
 *ELC* can also be used to simulate 2D periodic systems with image charges,
@@ -273,8 +274,8 @@ simulation region (*middle*) to *bottom* (at :math:`z=0`) and from *middle* to
 are :math:`\Delta_t=\frac{\varepsilon_m-\varepsilon_t}{\varepsilon_m+\varepsilon_t}`
 and :math:`\Delta_b=\frac{\varepsilon_m-\varepsilon_b}{\varepsilon_m+\varepsilon_b}`::
 
-    elc = electrostatic_extensions.ELC(gap_size=box_l * 0.2, maxPWerror=1e-3,
-                                       delta_mid_top=0.9, delta_mid_bot=0.1)
+    elc = espressomd.electrostatics.ELC(p3m_actor=p3m, gap_size=box_l * 0.2, maxPWerror=1e-3,
+                                        delta_mid_top=0.9, delta_mid_bot=0.1)
 
 The fully metallic case :math:`\Delta_t=\Delta_b=-1` would lead to divergence
 of the forces/energies in *ELC* and is therefore only possible with the
@@ -283,8 +284,8 @@ of the forces/energies in *ELC* and is therefore only possible with the
 Toggle ``const_pot`` on to maintain a constant electric potential difference
 ``pot_diff`` between the xy-planes at :math:`z=0` and :math:`z = L_z - h`::
 
-    elc = electrostatic_extensions.ELC(gap_size=box_l * 0.2, maxPWerror=1e-3,
-                                       const_pot=True, delta_mid_bot=100.0)
+    elc = espressomd.electrostatics.ELC(p3m_actor=p3m, gap_size=box_l * 0.2, maxPWerror=1e-3,
+                                        const_pot=True, delta_mid_bot=100.0)
 
 This is done by countering the total dipole moment of the system with the
 electric field :math:`E_{\textrm{induced}}` and superposing a homogeneous
diff --git a/doc/sphinx/installation.rst b/doc/sphinx/installation.rst
index 3d4949fe3f1..799894bc8a0 100644
--- a/doc/sphinx/installation.rst
+++ b/doc/sphinx/installation.rst
@@ -86,8 +86,8 @@ are required:
 
 .. code-block:: bash
 
-    sudo apt install python3-matplotlib python3-scipy ipython3 jupyter-notebook
-    pip3 install --user 'pint>=0.9' 'jupyter_contrib_nbextensions==0.5.1' \
+    sudo apt install python3-matplotlib python3-scipy python3-pint ipython3 jupyter-notebook
+    pip3 install --user 'jupyter_contrib_nbextensions==0.5.1' \
                         'sphinx>=1.6.7,!=2.1.0,!=3.0.0' 'sphinxcontrib-bibtex>=0.3.5'
     jupyter contrib nbextension install --user
     jupyter nbextension enable rubberband/main
diff --git a/doc/sphinx/system_setup.rst b/doc/sphinx/system_setup.rst
index 59b070115c5..52fa8be2f12 100644
--- a/doc/sphinx/system_setup.rst
+++ b/doc/sphinx/system_setup.rst
@@ -231,16 +231,33 @@ For more information please check :class:`espressomd.cuda_init.CudaInitHandle`.
 List available CUDA devices
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-If you want to list available CUDA devices
-you should access :attr:`espressomd.cuda_init.CudaInitHandle.device_list`, e.g., ::
+If you want to list available CUDA devices, you should call
+:meth:`espressomd.cuda_init.CudaInitHandle.list_devices`::
 
-    system = espressomd.System(box_l=[1, 1, 1])
-
-    print(system.cuda_init_handle.device_list)
+    >>> import espressomd
+    >>> system = espressomd.System(box_l=[1, 1, 1])
+    >>> print(system.cuda_init_handle.list_devices())
+    {0: 'GeForce RTX 2080', 1: 'GeForce GT 730'}
 
-This attribute is read only and will return a dictionary containing
+This method returns a dictionary containing
 the device id as key and the device name as its value.
 
+To get more details on the CUDA devices for each MPI node, call
+:meth:`espressomd.cuda_init.CudaInitHandle.list_devices_properties`::
+
+    >>> import pprint
+    >>> import espressomd
+    >>> system = espressomd.System(box_l=[1, 1, 1])
+    >>> pprint.pprint(system.cuda_init_handle.list_devices_properties())
+    {'seraue': {0: {'name': 'GeForce RTX 2080',
+                    'compute_capability': (7, 5),
+                    'cores': 46,
+                    'total_memory': 8370061312},
+                1: {'name': 'GeForce GT 730',
+                    'compute_capability': (3, 5),
+                    'cores': 2,
+                    'total_memory': 1014104064}}}
+
 .. _Selection of CUDA device:
 
 Selection of CUDA device
@@ -250,9 +267,9 @@ When you start ``pypresso`` your first GPU should be selected.
 If you wanted to use the second GPU, this can be done
 by setting :attr:`espressomd.cuda_init.CudaInitHandle.device` as follows::
 
-    system = espressomd.System(box_l=[1, 1, 1])
-
-    system.cuda_init_handle.device = 1
+    >>> import espressomd
+    >>> system = espressomd.System(box_l=[1, 1, 1])
+    >>> system.cuda_init_handle.device = 1
 
 Setting a device id outside the valid range or a device
 which does not meet the minimum requirements will raise
diff --git a/doc/tutorials/charged_system/charged_system-1.ipynb b/doc/tutorials/charged_system/charged_system-1.ipynb
index d743b8731db..7f223216e8e 100644
--- a/doc/tutorials/charged_system/charged_system-1.ipynb
+++ b/doc/tutorials/charged_system/charged_system-1.ipynb
@@ -36,7 +36,7 @@
     "import espressomd\n",
     "espressomd.assert_features(['WCA', 'ELECTROSTATICS'])\n",
     "\n",
-    "from espressomd import System, interactions, electrostatics, observables, accumulators\n",
+    "from espressomd import System, interactions, electrostatics, observables, accumulators, math\n",
     "\n",
     "import numpy as np\n",
     "from scipy import optimize\n",
@@ -438,20 +438,18 @@
     "```python\n",
     "def setup_profile_calculation(system, delta_N, ion_types, r_min, n_radial_bins):\n",
     "    radial_profile_accumulators = {}\n",
+    "    ctp = math.CylindricalTransformationParameters(center = np.array(system.box_l) / 2.,\n",
+    "                                                   axis = [0, 0, 1],\n",
+    "                                                   orientation = [1, 0, 0])\n",
     "    for ion_type in ion_types:\n",
     "        ion_ids = system.part.select(type=ion_type).id\n",
     "        radial_profile_obs = observables.CylindricalDensityProfile(\n",
     "            ids=ion_ids,\n",
-    "            center=np.array(system.box_l) / 2.,\n",
-    "            axis=[0, 0, 1, ],\n",
+    "            transform_params = ctp,\n",
     "            n_r_bins=n_radial_bins,\n",
-    "            n_phi_bins=1,\n",
-    "            n_z_bins=1,\n",
     "            min_r=r_min,\n",
-    "            min_phi=-np.pi,\n",
     "            min_z=-system.box_l[2] / 2.,\n",
     "            max_r=system.box_l[0] / 2.,\n",
-    "            max_phi=np.pi,\n",
     "            max_z=system.box_l[2] / 2.)\n",
     "\n",
     "        bin_edges = radial_profile_obs.bin_edges()\n",
@@ -945,7 +943,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.6.9"
+   "version": "3.8.5"
   }
  },
  "nbformat": 4,
diff --git a/doc/tutorials/constant_pH/constant_pH.ipynb b/doc/tutorials/constant_pH/constant_pH.ipynb
index 5fce1d8475c..f535722405a 100644
--- a/doc/tutorials/constant_pH/constant_pH.ipynb
+++ b/doc/tutorials/constant_pH/constant_pH.ipynb
@@ -117,7 +117,10 @@
    "source": [
     "import matplotlib.pyplot as plt\n",
     "import numpy as np\n",
+    "import setuptools\n",
     "import pint  # module for working with units and dimensions\n",
+    "assert setuptools.version.pkg_resources.packaging.specifiers.SpecifierSet('>=0.10.1').contains(pint.__version__), \\\n",
+    "  f'pint version {pint.__version__} is too old: several numpy operations can cast away the unit'\n",
     "\n",
     "import espressomd\n",
     "espressomd.assert_features(['WCA', 'ELECTROSTATICS'])\n",
diff --git a/maintainer/CI/build_cmake.sh b/maintainer/CI/build_cmake.sh
index bdd690b296d..75083e666c3 100755
--- a/maintainer/CI/build_cmake.sh
+++ b/maintainer/CI/build_cmake.sh
@@ -96,6 +96,7 @@ set_default_value with_cuda false
 set_default_value with_cuda_compiler "nvcc"
 set_default_value build_type "RelWithAssert"
 set_default_value with_ccache false
+set_default_value with_hdf5 true
 set_default_value with_scafacos false
 set_default_value with_stokesian_dynamics false
 set_default_value test_timeout 300
@@ -121,11 +122,19 @@ cmake_params="${cmake_params} -DTEST_TIMEOUT=${test_timeout}"
 if [ "${with_ccache}" = true ]; then
     cmake_params="${cmake_params} -DWITH_CCACHE=ON"
 fi
+
+if [ "${with_hdf5}" = true ]; then
+    cmake_params="${cmake_params} -DWITH_HDF5=ON"
+else
+    cmake_params="${cmake_params} -DWITH_HDF5=OFF"
+fi
+
 if [ "${with_scafacos}" = true ]; then
     cmake_params="${cmake_params} -DWITH_SCAFACOS=ON"
 else
     cmake_params="${cmake_params} -DWITH_SCAFACOS=OFF"
 fi
+
 if [ "${with_stokesian_dynamics}" = true ]; then
     cmake_params="${cmake_params} -DWITH_STOKESIAN_DYNAMICS=ON"
 else
@@ -228,7 +237,7 @@ if [ "${run_checks}" = true ]; then
 
     # fail if built with CUDA but no compatible GPU was found
     if [ "${with_cuda}" = true ] && [ "${hide_gpu}" != true ]; then
-        ./pypresso -c "import espressomd;assert espressomd.gpu_available(), 'No GPU available'" || exit 1
+        ./pypresso -c "import espressomd.cuda_init as gpu;gpu.CudaInitHandle().device = 0" || exit 1
     fi
 
     # unit tests
diff --git a/maintainer/benchmarks/CMakeLists.txt b/maintainer/benchmarks/CMakeLists.txt
index 5a6c5907652..6d2fc5e4228 100644
--- a/maintainer/benchmarks/CMakeLists.txt
+++ b/maintainer/benchmarks/CMakeLists.txt
@@ -1,5 +1,5 @@
 include(ProcessorCount)
-processorcount(NP)
+ProcessorCount(NP)
 
 if(EXISTS ${MPIEXEC})
   # OpenMPI 3.0 and higher checks the number of processes against the number of
diff --git a/maintainer/configs/maxset.hpp b/maintainer/configs/maxset.hpp
index 74368803f10..a0171117848 100644
--- a/maintainer/configs/maxset.hpp
+++ b/maintainer/configs/maxset.hpp
@@ -47,6 +47,7 @@
 #define LB_BOUNDARIES_GPU
 #define ELECTROKINETICS
 #define EK_BOUNDARIES
+#define EK_DEBUG
 #define MMM1D_GPU
 #endif
 
diff --git a/maintainer/format/autopep8.sh b/maintainer/format/autopep8.sh
index 3e2ca8ab8ac..d05e8756a5a 100755
--- a/maintainer/format/autopep8.sh
+++ b/maintainer/format/autopep8.sh
@@ -17,8 +17,8 @@
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 
 
-AUTOPEP8_VER=1.3.4
-PYCODESTYLE_VER=2.3.1
+AUTOPEP8_VER=1.5
+PYCODESTYLE_VER=2.5.0
 
 python3 -m autopep8 --help 2>&1 > /dev/null
 if [ "$?" = "0" ]; then
diff --git a/maintainer/format/cmake-format.sh b/maintainer/format/cmake-format.sh
index 6c3e69f5f29..d1dab5cb365 100755
--- a/maintainer/format/cmake-format.sh
+++ b/maintainer/format/cmake-format.sh
@@ -16,7 +16,7 @@
 # You should have received a copy of the GNU General Public License
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 
-CMAKE_FORMAT_VER=0.6.9
+CMAKE_FORMAT_VER=0.6.11
 python3 -m cmake_format 2>&1 > /dev/null
 if [ "$?" = "0" ]; then
     CMAKE_FORMAT="python3 -m cmake_format"
diff --git a/requirements.txt b/requirements.txt
index 86d78b471d9..be665e5cb59 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,8 +3,8 @@ numpy>=1.14.0
 h5py>=2.7.1
 # optional scientific packages
 scipy>=0.19.0
-MDAnalysis>=0.18
-pint>=0.9
+MDAnalysis>=1.0.0
+pint>=0.10.1
 # optional packages for graphics and external devices
 matplotlib>=2.1.1
 vtk>=7.1.1
@@ -23,12 +23,12 @@ sphinxcontrib-bibtex>=0.3.5
 # jupyter dependencies
 jupyter_contrib_nbextensions==0.5.1
 # pep8 and its dependencies
-autopep8==1.3.4
-pycodestyle==2.3.1
+autopep8==1.5.0
+pycodestyle==2.5.0
 # pylint and its dependencies
-pylint>=2.2.2
-astroid>=2.1.0
+pylint>=2.4.4
+astroid>=2.3.3
 isort>=4.3.4
 setuptools>=39.0.1
 pre-commit>=2.2.0
-cmake-format==0.6.9
+cmake-format==0.6.11
diff --git a/samples/lb_profile.py b/samples/lb_profile.py
index 7f4f6235dd6..06e6e05f516 100644
--- a/samples/lb_profile.py
+++ b/samples/lb_profile.py
@@ -32,6 +32,7 @@
 import espressomd.shapes
 import espressomd.lbboundaries
 import espressomd.accumulators
+import espressomd.math
 
 system = espressomd.System(box_l=[10.0, 10.0, 5.0])
 system.time_step = 0.01
@@ -42,16 +43,14 @@
     agrid=1.0, dens=1.0, visc=1.0, tau=0.01, ext_force_density=[0, 0, 0.15], kT=1.0, seed=32)
 system.actors.add(lb_fluid)
 system.thermostat.set_lb(LB_fluid=lb_fluid, seed=23)
-fluid_obs = espressomd.observables.CylindricalLBVelocityProfile(
+ctp = espressomd.math.CylindricalTransformationParameters(
     center=[5.0, 5.0, 0.0],
     axis=[0, 0, 1],
+    orientation=[1, 0, 0])
+fluid_obs = espressomd.observables.CylindricalLBVelocityProfile(
+    transform_params=ctp,
     n_r_bins=100,
-    n_phi_bins=1,
-    n_z_bins=1,
-    min_r=0.0,
     max_r=4.0,
-    min_phi=-np.pi,
-    max_phi=np.pi,
     min_z=0.0,
     max_z=10.0,
     sampling_density=0.1)
diff --git a/samples/load_checkpoint.py b/samples/load_checkpoint.py
index f00107ce4f4..807768e3d3d 100644
--- a/samples/load_checkpoint.py
+++ b/samples/load_checkpoint.py
@@ -30,7 +30,6 @@
 espressomd.assert_features(required_features)
 
 from espressomd import checkpointing
-import numpy as np
 
 checkpoint = checkpointing.Checkpoint(checkpoint_id="mycheckpoint")
 checkpoint.load()
@@ -77,10 +76,7 @@
     checkpoint.get_registered_objects()))
 
 
-# integrate system and finally save checkpoint
-print("\n### Integrate until user presses ctrl+c ###")
+# integrate system
 print("Integrating...")
 
-np.random.seed(seed=42)
-while True:
-    system.integrator.run(1000)
+system.integrator.run(1000)
diff --git a/samples/save_checkpoint.py b/samples/save_checkpoint.py
index 462945805b0..5c51c57faf4 100644
--- a/samples/save_checkpoint.py
+++ b/samples/save_checkpoint.py
@@ -43,7 +43,7 @@
 # test for user data
 myvar = "some script variable"
 checkpoint.register("myvar")
-myvar = "updated value"  # demo of how the register function works
+myvar = myvar + " (updated value)"  # demo of how the register function works
 
 # test for "system"
 box_l = 10.7437
diff --git a/samples/visualization_elc.py b/samples/visualization_elc.py
index c8c67c73aea..950bc8b3708 100644
--- a/samples/visualization_elc.py
+++ b/samples/visualization_elc.py
@@ -27,7 +27,6 @@
 import espressomd
 import espressomd.shapes
 from espressomd import electrostatics
-from espressomd import electrostatic_extensions
 from espressomd import visualization
 
 required_features = ["P3M", "WCA"]
@@ -75,11 +74,8 @@
 system.thermostat.set_langevin(kT=0.1, gamma=1.0, seed=42)
 
 p3m = electrostatics.P3M(prefactor=1.0, accuracy=1e-2)
-
-system.actors.add(p3m)
-
-elc = electrostatic_extensions.ELC(maxPWerror=1.0, gap_size=elc_gap,
-                                   const_pot=True, pot_diff=potential_diff)
+elc = electrostatics.ELC(p3m_actor=p3m, maxPWerror=1.0, gap_size=elc_gap,
+                         const_pot=True, pot_diff=potential_diff)
 system.actors.add(elc)
 
 visualizer.run(1)
diff --git a/src/config/features.def b/src/config/features.def
index 6a203625f25..73c6cd3a7d7 100644
--- a/src/config/features.def
+++ b/src/config/features.def
@@ -23,7 +23,6 @@ COLLISION_DETECTION
 NPT
 ENGINE                          implies ROTATION, EXTERNAL_FORCES
 PARTICLE_ANISOTROPY             implies ROTATION
-STOKESIAN_DYNAMICS              requires BLAS and LAPACK
 STOKESIAN_DYNAMICS              implies ROTATION
 
 /* Rotation */
@@ -102,6 +101,4 @@ FFTW external
 H5MD external
 SCAFACOS external
 GSL external
-BLAS external
-LAPACK external
 STOKESIAN_DYNAMICS external
diff --git a/src/core/CellStructure.hpp b/src/core/CellStructure.hpp
index b3688a956fe..c8c091fc323 100644
--- a/src/core/CellStructure.hpp
+++ b/src/core/CellStructure.hpp
@@ -459,22 +459,21 @@ struct CellStructure {
 
 public:
   /**
-   * @brief Set the particle decomposition to
-   *        AtomDecomposition.
+   * @brief Set the particle decomposition to AtomDecomposition.
    *
-   *        @param comm Communicator to use.
-   *        @param box Box Geometry
+   * @param comm Communicator to use.
+   * @param box Box Geometry
    */
   void set_atom_decomposition(boost::mpi::communicator const &comm,
                               BoxGeometry const &box);
 
   /**
-   * @brief Set the particle decomposition to
-   *        DomainDecomposition.
+   * @brief Set the particle decomposition to DomainDecomposition.
    *
-   *        @param comm Cartesian communicator to use.
-   *        @param box Box Geometry
-   *        @param local_geo Geometry of the local box.
+   * @param comm Cartesian communicator to use.
+   * @param range Interaction range.
+   * @param box Box Geometry
+   * @param local_geo Geometry of the local box.
    */
   void set_domain_decomposition(boost::mpi::communicator const &comm,
                                 double range, BoxGeometry const &box,
diff --git a/src/core/DomainDecomposition.hpp b/src/core/DomainDecomposition.hpp
index 0c6fe81252e..747f19af010 100644
--- a/src/core/DomainDecomposition.hpp
+++ b/src/core/DomainDecomposition.hpp
@@ -66,12 +66,10 @@
  *
  */
 struct DomainDecomposition : public ParticleDecomposition {
-  /** Grind dimensions per node. */
+  /** Grid dimensions per node. */
   Utils::Vector3i cell_grid = {};
-  /** cell size. */
+  /** Cell size. */
   Utils::Vector3d cell_size = {};
-
-private:
   /** Offset in global grid */
   Utils::Vector3i cell_offset = {};
   /** linked cell grid with ghost frame. */
@@ -120,7 +118,7 @@ struct DomainDecomposition : public ParticleDecomposition {
   }
 
 private:
-  /** Fill local_cells list and ghost_cells list for use with domain
+  /** Fill @c m_local_cells list and @c m_ghost_cells list for use with domain
    *  decomposition.
    */
   void mark_cells();
@@ -128,8 +126,8 @@ struct DomainDecomposition : public ParticleDecomposition {
   /** Fill a communication cell pointer list. Fill the cell pointers of
    *  all cells which are inside a rectangular subgrid of the 3D cell
    *  grid starting from the
-   *  lower left corner lc up to the high top corner hc. The cell
-   *  pointer list part_lists must already be large enough.
+   *  lower left corner @p lc up to the high top corner @p hc. The cell
+   *  pointer list @p part_lists must already be large enough.
    *  \param part_lists  List of cell pointers to store the result.
    *  \param lc          lower left corner of the subgrid.
    *  \param hc          high up corner of the subgrid.
@@ -159,10 +157,10 @@ struct DomainDecomposition : public ParticleDecomposition {
   /**
    * @brief Split particle list by direction.
    *
-   * Moves all particles from src into left
-   * and right depending if they belong to
-   * the left or right side from local node
-   * in direction dir.
+   * Moves all particles from @p src into @p left
+   * or @p right depending on whether they belong
+   * to the left or right side of the local node
+   * in direction @p dir.
    *
    * @param src Particles to sort.
    * @param left Particles that should go to the left
@@ -185,36 +183,36 @@ struct DomainDecomposition : public ParticleDecomposition {
    *  @brief Calculate cell grid dimensions, cell sizes and number of cells.
    *
    *  Calculates the cell grid, based on the local box size and the range.
-   *  If the number of cells is larger than max_num_cells,
-   *  it increases max_range until the number of cells is
-   *  smaller or equal max_num_cells. It sets:
-   *  cell_grid,
-   *  ghost_cell_grid,
-   *  cell_size, and
-   *  inv_cell_size.
+   *  If the number of cells is larger than @c max_num_cells,
+   *  it increases @c max_range until the number of cells is
+   *  smaller than or equal to @c max_num_cells. It sets:
+   *  @c cell_grid,
+   *  @c ghost_cell_grid,
+   *  @c cell_size, and
+   *  @c inv_cell_size.
    *
-   *  @param range Required interacting range. All pairs closer
-   *         than this distance are found.
+   *  @param range interaction range. All pairs closer
+   *               than this distance are found.
    */
   void create_cell_grid(double range);
 
   /** Init cell interactions for cell system domain decomposition.
    *  Initializes the interacting neighbor cell list of a cell.
    *  This list of interacting neighbor cells is used by the Verlet
-   * algorithm.
+   *  algorithm.
    */
   void init_cell_interactions();
 
-  /** Create communicators for cell structure domain decomposition. (see \ref
-   *  GhostCommunicator)
+  /** Create communicators for cell structure domain decomposition (see \ref
+   *  GhostCommunicator).
    */
   GhostCommunicator prepare_comm();
 
   /** Maximal number of cells per node. In order to avoid memory
-   *  problems due to the cell grid one has to specify the maximal
+   *  problems due to the cell grid, one has to specify the maximal
    *  number of cells. If the number of cells is larger
-   *  than max_num_cells the cell grid is reduced.
-   *  max_num_cells has to be larger than 27, e.g. one inner cell.
+   *  than @c max_num_cells, the cell grid is reduced.
+   *  @c max_num_cells has to be larger than 27, e.g. one inner cell.
    */
   static constexpr int max_num_cells = 32768;
 };
diff --git a/src/core/EspressoSystemInterface_cuda.cu b/src/core/EspressoSystemInterface_cuda.cu
index adeb11a1491..81948ad920b 100644
--- a/src/core/EspressoSystemInterface_cuda.cu
+++ b/src/core/EspressoSystemInterface_cuda.cu
@@ -19,7 +19,7 @@
 
 #include "EspressoSystemInterface.hpp"
 #include "cuda_interface.hpp"
-#include "cuda_utils.hpp"
+#include "cuda_utils.cuh"
 #include "errorhandling.hpp"
 
 #include <cuda.h>
diff --git a/src/core/Particle.hpp b/src/core/Particle.hpp
index 82bfe54cdc3..10ee2833641 100644
--- a/src/core/Particle.hpp
+++ b/src/core/Particle.hpp
@@ -39,23 +39,36 @@ enum : uint8_t {
 };
 
 #ifdef EXTERNAL_FORCES
-/**
- *  \ref ParticleProperties::ext_flag "ext_flag" value for fixed coordinate
+/** \ref ParticleProperties::ext_flag "ext_flag" value for fixed coordinate
  *  @c coord.
  */
 #define COORD_FIXED(coord) (2u << (coord))
 /** \ref ParticleProperties::ext_flag "ext_flag" mask to check whether any of
- *  the coordinates is fixed. */
+ *  the coordinates is fixed.
+ */
 #define COORDS_FIX_MASK (COORD_FIXED(0) | COORD_FIXED(1) | COORD_FIXED(2))
-#else
+#else // EXTERNAL_FORCES
 #define COORD_FIXED(coord) (0)
-#endif
+#endif // EXTERNAL_FORCES
 
+/** Properties of a self-propelled particle. */
 struct ParticleParametersSwimming {
+  /** Whether the particle is a swimmer. */
   bool swimming = false;
+  /** Imposed constant force. */
   double f_swim = 0.;
+  /** Constant velocity to relax to. */
   double v_swim = 0.;
+  /** Flag for the swimming mode in a LB fluid.
+   *  Values:
+   *  - -1: pusher
+   *  - +1: puller
+   *  - 0: no swimming
+   */
   int push_pull = 0;
+  /** Distance of the source of propulsion from the particle
+   *  center in a LB fluid.
+   */
   double dipole_length = 0.;
 
   template <class Archive> void serialize(Archive &ar, long int /* version */) {
@@ -82,7 +95,7 @@ struct ParticleProperties {
   double mass = 1.0;
 #else
   constexpr static double mass{1.0};
-#endif /* MASS */
+#endif
 
   /** rotational inertia */
 #ifdef ROTATIONAL_INERTIA
@@ -141,48 +154,46 @@ struct ParticleProperties {
       ar &quat;
     }
   } vs_relative;
-#endif
-#else  /* VIRTUAL_SITES */
+#endif // VIRTUAL_SITES_RELATIVE
+#else  // VIRTUAL_SITES
   static constexpr bool is_virtual = false;
-#endif /* VIRTUAL_SITES */
+#endif // VIRTUAL_SITES
 
 #ifdef THERMOSTAT_PER_PARTICLE
+/** Friction coefficient for translation */
 #ifndef PARTICLE_ANISOTROPY
   double gamma = -1.;
 #else
   Utils::Vector3d gamma = {-1., -1., -1.};
 #endif // PARTICLE_ANISOTROPY
-/** Friction coefficient gamma for rotation */
 #ifdef ROTATION
+/** Friction coefficient for rotation */
 #ifndef PARTICLE_ANISOTROPY
   double gamma_rot = -1.;
 #else
   Utils::Vector3d gamma_rot = {-1., -1., -1.};
-#endif // ROTATIONAL_INERTIA
+#endif // PARTICLE_ANISOTROPY
 #endif // ROTATION
 #endif // THERMOSTAT_PER_PARTICLE
 
 #ifdef EXTERNAL_FORCES
-  /** flag whether to fix a particle in space.
-      Values:
-      <ul> <li> 0 no external influence
-           <li> 1 apply external force \ref ParticleProperties::ext_force
-           <li> 2,3,4 fix particle coordinate 0,1,2
-           <li> 5 apply external torque \ref ParticleProperties::ext_torque
-      </ul>
-  */
+  /** Flag for fixed particle coordinates.
+   *  Values:
+   *  - 0: no fixed coordinates
+   *  - 2: fix translation along the x axis
+   *  - 4: fix translation along the y axis
+   *  - 8: fix translation along the z axis
+   */
   uint8_t ext_flag = 0;
-  /** External force, apply if \ref ParticleProperties::ext_flag == 1. */
+  /** External force. */
   Utils::Vector3d ext_force = {0, 0, 0};
-
 #ifdef ROTATION
-  /** External torque, apply if \ref ParticleProperties::ext_flag == 16. */
+  /** External torque. */
   Utils::Vector3d ext_torque = {0, 0, 0};
 #endif
-#else
-  static constexpr const uint8_t ext_flag =
-      0; // no external forces and fixed coordinates
-#endif
+#else  // EXTERNAL_FORCES
+  static constexpr const uint8_t ext_flag = 0; // no fixed coordinates
+#endif // EXTERNAL_FORCES
 
 #ifdef ENGINE
   ParticleParametersSwimming swim;
@@ -194,7 +205,7 @@ struct ParticleProperties {
     ar &type;
 #ifdef MASS
     ar &mass;
-#endif /* MASS */
+#endif
 #ifdef ROTATIONAL_INERTIA
     ar &rinertia;
 #endif
@@ -217,7 +228,7 @@ struct ParticleProperties {
 #ifdef VIRTUAL_SITES_RELATIVE
     ar &vs_relative;
 #endif
-#endif /* VIRTUAL_SITES */
+#endif // VIRTUAL_SITES
 
 #ifdef THERMOSTAT_PER_PARTICLE
     ar &gamma;
@@ -231,7 +242,7 @@ struct ParticleProperties {
 #ifdef ROTATION
     ar &ext_torque;
 #endif
-#endif
+#endif // EXTERNAL_FORCES
 
 #ifdef ENGINE
     ar &swim;
@@ -256,7 +267,7 @@ struct ParticlePosition {
 #endif
 
 #ifdef BOND_CONSTRAINT
-  /** particle position at the previous time step */
+  /** particle position at the previous time step (RATTLE algorithm) */
   Utils::Vector3d p_old = {0., 0., 0.};
 #endif
 
@@ -277,6 +288,7 @@ struct ParticlePosition {
 struct ParticleForce {
   ParticleForce() = default;
   ParticleForce(ParticleForce const &) = default;
+  ParticleForce &operator=(ParticleForce const &) = default;
   ParticleForce(const Utils::Vector3d &f) : f(f) {}
 #ifdef ROTATION
   ParticleForce(const Utils::Vector3d &f, const Utils::Vector3d &torque)
@@ -300,7 +312,7 @@ struct ParticleForce {
   Utils::Vector3d f = {0., 0., 0.};
 
 #ifdef ROTATION
-  /** torque */
+  /** torque. */
   Utils::Vector3d torque = {0., 0., 0.};
 #endif
 
@@ -313,15 +325,17 @@ struct ParticleForce {
 };
 
 /** Momentum information on a particle. Information not contained in
-    communication of ghost particles so far, but a communication would
-    be necessary for velocity dependent potentials. */
+ *  communication of ghost particles so far, but a communication would
+ *  be necessary for velocity-dependent potentials.
+ */
 struct ParticleMomentum {
   /** velocity. */
   Utils::Vector3d v = {0., 0., 0.};
 
 #ifdef ROTATION
-  /** angular velocity
-      ALWAYS IN PARTICLE FIXED, I.E., CO-ROTATING COORDINATE SYSTEM */
+  /** angular velocity.
+   *  ALWAYS IN PARTICLE FIXED, I.E., CO-ROTATING COORDINATE SYSTEM.
+   */
   Utils::Vector3d omega = {0., 0., 0.};
 #endif
 
@@ -334,10 +348,10 @@ struct ParticleMomentum {
 };
 
 /** Information on a particle that is needed only on the
- *  node the particle belongs to
+ *  node the particle belongs to.
  */
 struct ParticleLocal {
-  /** check whether a particle is a ghost or not */
+  /** whether the particle is a ghost particle. */
   bool ghost = false;
   /** position in the last time step before last Verlet list update. */
   Utils::Vector3d p_old = {0, 0, 0};
@@ -387,10 +401,9 @@ struct Particle { // NOLINT(bugprone-exception-escape)
 
 private:
 #ifdef EXCLUSIONS
-  /** list of particles, with which this particle has no nonbonded
+  /** list of particles, with which this particle has no non-bonded
    *  interactions
    */
-
   std::vector<int> el;
 #endif
 
diff --git a/src/core/RuntimeErrorStream.cpp b/src/core/RuntimeErrorStream.cpp
index 654d5d39781..8bd24eb8c4f 100644
--- a/src/core/RuntimeErrorStream.cpp
+++ b/src/core/RuntimeErrorStream.cpp
@@ -22,8 +22,7 @@
 #include <utility>
 
 namespace ErrorHandling {
-/** ostringstream is not copyable, but it is fine here to copy just the content.
- */
+// ostringstream is not copyable, but it is fine here to copy just the content.
 RuntimeErrorStream::RuntimeErrorStream(const RuntimeErrorStream &rhs)
     : m_ec(rhs.m_ec), m_line(rhs.m_line), m_file(rhs.m_file),
       m_function(rhs.m_function) {
diff --git a/src/core/accumulators/Correlator.cpp b/src/core/accumulators/Correlator.cpp
index a8a5d4b24c1..7345eddd134 100644
--- a/src/core/accumulators/Correlator.cpp
+++ b/src/core/accumulators/Correlator.cpp
@@ -418,8 +418,6 @@ void Correlator::update() {
       }
     }
   }
-
-  m_last_update = sim_time;
 }
 
 int Correlator::finalize() {
@@ -543,7 +541,6 @@ std::string Correlator::get_internal_state() const {
   oa << A_accumulated_average;
   oa << B_accumulated_average;
   oa << n_data;
-  oa << m_last_update;
 
   return ss.str();
 }
@@ -565,7 +562,6 @@ void Correlator::set_internal_state(std::string const &state) {
   ia >> A_accumulated_average;
   ia >> B_accumulated_average;
   ia >> n_data;
-  ia >> m_last_update;
 }
 
 } // namespace Accumulators
diff --git a/src/core/accumulators/Correlator.hpp b/src/core/accumulators/Correlator.hpp
index 5fcf9f46c27..bd25147798c 100644
--- a/src/core/accumulators/Correlator.hpp
+++ b/src/core/accumulators/Correlator.hpp
@@ -201,7 +201,6 @@ class Correlator : public AccumulatorBase {
 
   int tau_lin() const { return m_tau_lin; }
   double tau_max() const { return m_tau_max; }
-  double last_update() const { return m_last_update; }
   double dt() const { return m_dt; }
 
   Utils::Vector3d const &correlation_args() const { return m_correlation_args; }
@@ -259,8 +258,6 @@ class Correlator : public AccumulatorBase {
   std::vector<double> B_accumulated_average; ///< all B values are added up here
   size_t n_data; ///< a counter for calculated averages and variances
 
-  double m_last_update;
-
   size_t dim_A;                ///< dimensionality of A
   size_t dim_B;                ///< dimensionality of B
   std::vector<size_t> m_shape; ///< dimensionality of the correlation
diff --git a/src/core/actor/DipolarBarnesHut.hpp b/src/core/actor/DipolarBarnesHut.hpp
index 443a13a06a1..b010ebe3804 100644
--- a/src/core/actor/DipolarBarnesHut.hpp
+++ b/src/core/actor/DipolarBarnesHut.hpp
@@ -27,6 +27,7 @@
 #include "DipolarBarnesHut_cuda.cuh"
 #include "SystemInterface.hpp"
 #include "cuda_interface.hpp"
+#include "cuda_utils.hpp"
 #include "electrostatics_magnetostatics/dipole.hpp"
 #include "errorhandling.hpp"
 
@@ -38,7 +39,7 @@ typedef float dds_float;
 class DipolarBarnesHut : public Actor {
 public:
   DipolarBarnesHut(SystemInterface &s, float epssq, float itolsq) {
-    k = static_cast<float>(dipole.prefactor);
+    m_k = static_cast<float>(dipole.prefactor);
     m_epssq = epssq;
     m_itolsq = itolsq;
     setBHPrecision(&m_epssq, &m_itolsq);
@@ -53,7 +54,12 @@ class DipolarBarnesHut : public Actor {
   };
 
   void computeForces(SystemInterface &s) override {
-    allocBHmemCopy(static_cast<int>(s.npart_gpu()), &m_bh_data);
+    try {
+      allocBHmemCopy(static_cast<int>(s.npart_gpu()), &m_bh_data);
+    } catch (cuda_runtime_error const &err) {
+      runtimeErrorMsg() << "DipolarBarnesHut: " << err.what();
+      return;
+    }
 
     fillConstantPointers(s.rGpuBegin(), s.dipGpuBegin(), m_bh_data);
     initBHgpu(m_bh_data.blocks);
@@ -61,12 +67,17 @@ class DipolarBarnesHut : public Actor {
     buildTreeBH(m_bh_data.blocks);
     summarizeBH(m_bh_data.blocks);
     sortBH(m_bh_data.blocks);
-    if (forceBH(&m_bh_data, k, s.fGpuBegin(), s.torqueGpuBegin())) {
+    if (forceBH(&m_bh_data, m_k, s.fGpuBegin(), s.torqueGpuBegin())) {
       runtimeErrorMsg() << "kernels encountered a functional error";
     }
   };
   void computeEnergy(SystemInterface &s) override {
-    allocBHmemCopy(static_cast<int>(s.npart_gpu()), &m_bh_data);
+    try {
+      allocBHmemCopy(static_cast<int>(s.npart_gpu()), &m_bh_data);
+    } catch (cuda_runtime_error const &err) {
+      runtimeErrorMsg() << "DipolarBarnesHut: " << err.what();
+      return;
+    }
 
     fillConstantPointers(s.rGpuBegin(), s.dipGpuBegin(), m_bh_data);
     initBHgpu(m_bh_data.blocks);
@@ -74,13 +85,13 @@ class DipolarBarnesHut : public Actor {
     buildTreeBH(m_bh_data.blocks);
     summarizeBH(m_bh_data.blocks);
     sortBH(m_bh_data.blocks);
-    if (energyBH(&m_bh_data, k, (&(((CUDA_energy *)s.eGpu())->dipolar)))) {
+    if (energyBH(&m_bh_data, m_k, (&(((CUDA_energy *)s.eGpu())->dipolar)))) {
       runtimeErrorMsg() << "kernels encountered a functional error";
     }
   };
 
-protected:
-  float k;
+private:
+  float m_k;
   float m_epssq;
   float m_itolsq;
   BHData m_bh_data = {0,       0,       0,       nullptr, nullptr,
diff --git a/src/core/actor/DipolarBarnesHut_cuda.cu b/src/core/actor/DipolarBarnesHut_cuda.cu
index 761a21098a2..1b6b72c4c61 100644
--- a/src/core/actor/DipolarBarnesHut_cuda.cu
+++ b/src/core/actor/DipolarBarnesHut_cuda.cu
@@ -28,7 +28,8 @@
 #include "DipolarBarnesHut_cuda.cuh"
 
 #include "cuda_init.hpp"
-#include "cuda_utils.hpp"
+#include "cuda_utils.cuh"
+#include "errorhandling.hpp"
 
 #include <thrust/device_ptr.h>
 #include <thrust/reduce.h>
@@ -1181,11 +1182,8 @@ void allocBHmemCopy(int nbodies, BHData *bh_data) {
 
   bh_data->nbodies = nbodies;
 
-  int devID = -1;
-  EspressoGpuDevice dev;
-
-  devID = cuda_get_device();
-  cuda_get_device_props(devID, dev);
+  auto const devID = cuda_get_device();
+  EspressoGpuDevice const dev = cuda_get_device_props(devID);
 
   bh_data->blocks = dev.n_cores;
   // Each node corresponds to a split of the cubic box in 3D space to equal
diff --git a/src/core/actor/DipolarBarnesHut_cuda.cuh b/src/core/actor/DipolarBarnesHut_cuda.cuh
index 3a09682bd30..95079830c2e 100644
--- a/src/core/actor/DipolarBarnesHut_cuda.cuh
+++ b/src/core/actor/DipolarBarnesHut_cuda.cuh
@@ -28,94 +28,98 @@
 typedef float dds_float;
 
 typedef struct {
-  // CUDA blocks
+  /// CUDA blocks
   int blocks;
-  // each node corresponds to a split of the cubic box in 3D space to equal
-  // cubic boxes hence, 8 octant nodes per particle is a theoretical octree
-  // limit: a maximal number of octree nodes is "nnodesd" and a number of
-  // particles "nbodiesd" respectively.
+  /// each node corresponds to a split of the cubic box in 3D space to equal
+  /// cubic boxes hence, 8 octant nodes per particle is a theoretical octree
+  /// limit: a maximal number of octree nodes is "nnodesd" and a number of
+  /// particles "nbodiesd" respectively.
   int nbodies;
   int nnodes;
-  // particle positions on the device:
+  /// particle positions on the device:
   float *r;
-  // particle dipole moments on the device:
+  /// particle dipole moments on the device:
   float *u;
-  // Not a real mass. Just a node weight coefficient.
+  /// Not a real mass. Just a node weight coefficient.
   float *mass;
-  // min positions' coordinates of the BH box.
+  /// min positions' coordinates of the Barnes-Hut box.
   float *minp;
-  // max positions' coordinates of the BH box.
+  /// max positions' coordinates of the Barnes-Hut box.
   float *maxp;
-  // Error report.
+  /// Error report.
   int *err;
-  // Indices of particles sorted according to the tree linear representation.
+  /// Indices of particles sorted according to the tree linear representation.
   int *sort;
-  // The tree linear representation.
+  /// The tree linear representation.
   int *child;
-  // Supplementary array: a tree nodes (division octant cells/particles inside)
-  // counting.
+  /// Supplementary array: a tree nodes (division octant cells/particles inside)
+  /// counting.
   int *count;
-  // Start indices for the per-cell sorting.
+  /// Start indices for the per-cell sorting.
   int *start;
-  // trace the max loops for a threads' sync
+  /// trace the max loops for a threads' sync
   int *max_lps;
 } BHData;
 
-// thread count for different kernels (see kernel calls from below functions).
+/// @name Barnes-Hut thread count for different kernels.
+/// @{
 #define THREADS1 512
 #define THREADS2 1024
 #define THREADS3 1024
 #define THREADS4 1024
 #define THREADS5 256
+/// @}
 
-// block count = factor * #SMs
-// for different kernels (see kernel calls from below functions).
+/// @name Barnes-Hut block factor for different kernels.
+/// block count = factor * number of streaming multiprocessors (SMs)
+/// @{
 #define FACTOR1 2
 #define FACTOR2 1
 #define FACTOR3 1 /* must all be resident at the same time */
 #define FACTOR4 1 /* must all be resident at the same time */
 #define FACTOR5 4
+/// @}
 
-// Warp size.
+/// Barnes-Hut warp size.
 #define WARPSIZE 32
-// Max possible depth of the Barnes-Hut tree branching.
+/// Maximal depth of the Barnes-Hut tree branching.
 #define MAXDEPTH 32
 
-// Function to set the BH method parameters.
+/// Function to set the Barnes-Hut parameters.
 void setBHPrecision(float *epssq, float *itolsq);
 
-// An allocation of the GPU device memory and an initialization where it is
-// needed.
+/// An allocation of the GPU device memory and an initialization where it is
+/// needed.
 void allocBHmemCopy(int nbodies, BHData *bh_data);
 
-// Populating of array pointers allocated in GPU device before.
-// Copy the particle data to the Barnes-Hut related arrays.
+/// Populating of array pointers allocated in GPU device before.
+/// Copy the particle data to the Barnes-Hut related arrays.
 void fillConstantPointers(float *r, float *dip, BHData bh_data);
 
-// Required BH CUDA init.
+/// Barnes-Hut CUDA initialization.
 void initBHgpu(int blocks);
 
-// Building Barnes-Hut spatial min/max position box
+/// Building Barnes-Hut spatial min/max position box
 void buildBoxBH(int blocks);
 
-// Building Barnes-Hut tree in a linear child array representation
-// of octant cells and particles inside.
+/// Building Barnes-Hut tree in a linear child array representation
+/// of octant cells and particles inside.
 void buildTreeBH(int blocks);
 
-// Calculate octant cells masses and cell index counts.
-// Determine cells centers of mass and total dipole moments
-// on all possible levels of the BH tree.
+/// Calculate octant cells masses and cell index counts.
+/// Determine cells centers of mass and total dipole moments
+/// on all possible levels of the Barnes-Hut tree.
 void summarizeBH(int blocks);
 
-// Sort particle indexes according to the BH tree representation.
-// Crucial for the per-warp performance tuning of forceCalculationKernel and
-// energyCalculationKernel.
+/// Sort particle indexes according to the Barnes-Hut tree representation.
+/// Crucial for the per-warp performance tuning of @c forceCalculationKernel
+/// and @c energyCalculationKernel.
 void sortBH(int blocks);
 
-// Force calculation.
+/// Barnes-Hut force calculation.
 int forceBH(BHData *bh_data, dds_float k, float *f, float *torque);
 
-// Energy calculation.
+/// Barnes-Hut energy calculation.
 int energyBH(BHData *bh_data, dds_float k, float *E);
 
 #endif // DIPOLAR_BARNES_HUT
diff --git a/src/core/actor/DipolarDirectSum_cuda.cu b/src/core/actor/DipolarDirectSum_cuda.cu
index 91462241478..278bf4e9eb4 100644
--- a/src/core/actor/DipolarDirectSum_cuda.cu
+++ b/src/core/actor/DipolarDirectSum_cuda.cu
@@ -21,7 +21,7 @@
 
 #ifdef DIPOLAR_DIRECT_SUM
 
-#include "cuda_utils.hpp"
+#include "cuda_utils.cuh"
 
 #include <thrust/device_ptr.h>
 #include <thrust/reduce.h>
diff --git a/src/core/actor/Mmm1dgpuForce_cuda.cu b/src/core/actor/Mmm1dgpuForce_cuda.cu
index 89f81c67844..e217e652e5e 100644
--- a/src/core/actor/Mmm1dgpuForce_cuda.cu
+++ b/src/core/actor/Mmm1dgpuForce_cuda.cu
@@ -28,7 +28,7 @@
 #include "EspressoSystemInterface.hpp"
 #include "actor/Mmm1dgpuForce.hpp"
 #include "actor/specfunc_cuda.hpp"
-#include "cuda_utils.hpp"
+#include "cuda_utils.cuh"
 #include "electrostatics_magnetostatics/coulomb.hpp"
 #include "electrostatics_magnetostatics/mmm-modpsi.hpp"
 #include "electrostatics_magnetostatics/mmm1d.hpp"
diff --git a/src/core/cuda_common_cuda.cu b/src/core/cuda_common_cuda.cu
index c35f3858279..0e1bcba4556 100644
--- a/src/core/cuda_common_cuda.cu
+++ b/src/core/cuda_common_cuda.cu
@@ -21,7 +21,7 @@
 #include "ParticleRange.hpp"
 #include "cuda_init.hpp"
 #include "cuda_interface.hpp"
-#include "cuda_utils.hpp"
+#include "cuda_utils.cuh"
 #include "errorhandling.hpp"
 
 #include "CudaDeviceAllocator.hpp"
@@ -34,6 +34,7 @@
 #include <cuda.h>
 
 #include <cstddef>
+#include <cstdio>
 
 extern int this_node;
 
@@ -66,16 +67,12 @@ CUDA_energy energy_host;
 
 pinned_vector<float> particle_torques_host;
 
-/**cuda streams for parallel computing on cpu and gpu */
 cudaStream_t stream[1];
 
-cudaError_t _err;
-cudaError_t CU_err;
-
-void _cuda_check_errors(const dim3 &block, const dim3 &grid,
-                        const char *function, const char *file,
-                        unsigned int line) {
-  CU_err = cudaGetLastError();
+void cuda_check_errors_exit(const dim3 &block, const dim3 &grid,
+                            const char *function, const char *file,
+                            unsigned int line) {
+  cudaError_t CU_err = cudaGetLastError();
   if (CU_err != cudaSuccess) {
     fprintf(stderr,
             "%d: error \"%s\" calling %s with dim %d %d %d, grid %d %d "
@@ -134,25 +131,23 @@ void resize_buffers(size_t number_of_particles) {
  */
 void gpu_init_particle_comm() {
   if (this_node == 0 && global_part_vars_host.communication_enabled == 0) {
-    if (cuda_get_n_gpus() == -1) {
-      runtimeErrorMsg()
-          << "Unable to initialize CUDA as no sufficient GPU is available.";
-      errexit();
-    }
-    if (cuda_get_n_gpus() > 1) {
-      runtimeWarningMsg() << "More than one GPU detected, please note ESPResSo "
-                             "uses device 0 by default regardless of usage or "
-                             "capability. The GPU to be used can be modified "
-                             "by setting System.cuda_init_handle.device.";
-      if (cuda_check_gpu(0) != ES_OK) {
-        runtimeWarningMsg()
-            << "CUDA device 0 is not capable of running ESPResSo but is used "
-               "by default. ESPResSo has detected a CUDA capable card but it "
-               "is not the one used by ESPResSo by default. Please set the "
-               "GPU to use by setting System.cuda_init_handle.device. A list "
-               "of available GPUs is available through "
-               "System.cuda_init_handle.device_list.";
+    try {
+      if (cuda_get_n_gpus() == 0) {
+        fprintf(stderr, "ERROR: No GPU was found.\n");
+        errexit();
       }
+      auto const devID = cuda_get_device();
+      auto const compute_capability = cuda_check_gpu_compute_capability(devID);
+      auto const communication_test = cuda_test_device_access();
+      if (compute_capability != ES_OK or communication_test != ES_OK) {
+        fprintf(stderr,
+                "ERROR: CUDA device %i is not capable of running ESPResSo.\n",
+                devID);
+        errexit();
+      }
+    } catch (cuda_runtime_error const &err) {
+      fprintf(stderr, "ERROR: %s\n", err.what());
+      errexit();
     }
   }
   global_part_vars_host.communication_enabled = 1;
@@ -230,10 +225,11 @@ CUDA_energy copy_energy_from_GPU() {
   return energy_host;
 }
 
-void _cuda_safe_mem(cudaError_t CU_err, const char *file, unsigned int line) {
-  if (cudaSuccess != CU_err) {
-    fprintf(stderr, "Cuda Memory error at %s:%u.\n", file, line);
-    printf("CUDA error: %s\n", cudaGetErrorString(CU_err));
+void cuda_safe_mem_exit(cudaError_t CU_err, const char *file,
+                        unsigned int line) {
+  if (CU_err != cudaSuccess) {
+    fprintf(stderr, "CUDA Memory error at %s:%u.\n", file, line);
+    fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(CU_err));
     if (CU_err == cudaErrorInvalidValue)
       fprintf(stderr, "You may have tried to allocate zero memory at %s:%u.\n",
               file, line);
diff --git a/src/core/cuda_init.cpp b/src/core/cuda_init.cpp
index 62a9e8d5cc5..15fec94982a 100644
--- a/src/core/cuda_init.cpp
+++ b/src/core/cuda_init.cpp
@@ -22,8 +22,10 @@
 #ifdef CUDA
 
 #include "cuda_init.hpp"
+#include "cuda_utils.hpp"
 
 #include "communication.hpp"
+#include "errorhandling.hpp"
 
 #include <utils/constants.hpp>
 
@@ -34,13 +36,13 @@
 #include <iterator>
 #include <set>
 
-/** Helper class force device set.
+/** Helper class for device sets.
  */
 struct CompareDevices {
   bool operator()(const EspressoGpuDevice &a,
                   const EspressoGpuDevice &b) const {
     const int name_comp = strncmp(a.proc_name, b.proc_name, 63);
-    /* Both devs are from the same node, order by id */
+    /* if both devices are from the same node, order by id */
     if (name_comp == 0)
       return a.id < b.id;
 
@@ -49,49 +51,50 @@ struct CompareDevices {
 };
 
 /** Gather list of CUDA devices on all nodes on the master node.
- *  It relies on MPI_Get_processor_name() to get a unique identifier of
- *  the physical node, as opposed to the logical rank of which there can
- *  be more than one on one node.
+ *  It relies on <tt>MPI_Get_processor_name()</tt> to get a unique identifier
+ *  of the physical node, as opposed to the logical rank of which there can
+ *  be more than one per node.
  */
-std::vector<EspressoGpuDevice> cuda_gather_gpus() {
-  int n_gpus = cuda_get_n_gpus();
-  char proc_name[MPI_MAX_PROCESSOR_NAME];
-  int proc_name_len;
+static std::vector<EspressoGpuDevice> mpi_cuda_gather_gpus_local() {
   /* List of local devices */
-  std::vector<EspressoGpuDevice> devices;
+  std::vector<EspressoGpuDevice> devices_local;
   /* Global unique device list (only relevant on master) */
-  std::vector<EspressoGpuDevice> g_devices;
-  int *n_gpu_array = nullptr;
+  std::vector<EspressoGpuDevice> devices_global;
 
-  MPI_Get_processor_name(proc_name, &proc_name_len);
+  int n_devices;
+  try {
+    n_devices = cuda_get_n_gpus();
+  } catch (cuda_runtime_error const &err) {
+    n_devices = 0;
+  }
 
-  /* Truncate to 63 chars to fit struct. */
-  if (strlen(proc_name) > 63)
+  int proc_name_len;
+  char proc_name[MPI_MAX_PROCESSOR_NAME];
+  MPI_Get_processor_name(proc_name, &proc_name_len);
+  if (std::strlen(proc_name) > 63)
     proc_name[63] = '\0';
 
-  for (int i = 0; i < n_gpus; ++i) {
-    /* Check if device has at least minimum compute capability */
-    if (cuda_check_gpu(i) == ES_OK) {
-      EspressoGpuDevice device;
-      if (cuda_get_device_props(i, device) == ES_OK) {
-        strncpy(device.proc_name, proc_name, 64);
-        device.proc_name[63] = '\0';
-        device.node = this_node;
-        devices.push_back(device);
-      }
+  for (int i = 0; i < n_devices; ++i) {
+    try {
+      EspressoGpuDevice device = cuda_get_device_props(i);
+      std::strncpy(device.proc_name, proc_name, 64);
+      device.proc_name[63] = '\0';
+      device.node = this_node;
+      devices_local.push_back(device);
+    } catch (cuda_runtime_error const &err) {
+      // pass
     }
   }
 
-  /* Update n_gpus to number of usable devices */
-  n_gpus = devices.size();
+  int const n_gpus = static_cast<int>(devices_local.size());
 
   if (this_node == 0) {
     std::set<EspressoGpuDevice, CompareDevices> device_set;
-    n_gpu_array = new int[n_nodes];
+    int *n_gpu_array = new int[n_nodes];
     MPI_Gather(&n_gpus, 1, MPI_INT, n_gpu_array, 1, MPI_INT, 0, MPI_COMM_WORLD);
 
     /* insert local devices */
-    std::copy(devices.begin(), devices.end(),
+    std::copy(devices_local.begin(), devices_local.end(),
               std::inserter(device_set, device_set.begin()));
 
     EspressoGpuDevice device;
@@ -106,18 +109,25 @@ std::vector<EspressoGpuDevice> cuda_gather_gpus() {
     }
     /* Copy unique devices to result, if any */
     std::copy(device_set.begin(), device_set.end(),
-              std::inserter(g_devices, g_devices.begin()));
+              std::inserter(devices_global, devices_global.begin()));
     delete[] n_gpu_array;
   } else {
     /* Send number of devices to master */
-    MPI_Gather(&n_gpus, 1, MPI_INT, n_gpu_array, 1, MPI_INT, 0, MPI_COMM_WORLD);
-    /* Send devices to maser */
-    for (auto &device : devices) {
+    MPI_Gather(&n_gpus, 1, MPI_INT, nullptr, 1, MPI_INT, 0, MPI_COMM_WORLD);
+    /* Send devices to master */
+    for (auto const &device : devices_local) {
       MPI_Send(&device, sizeof(EspressoGpuDevice), MPI_BYTE, 0, 0,
                MPI_COMM_WORLD);
     }
   }
-  return g_devices;
+  return devices_global;
+}
+
+REGISTER_CALLBACK_MASTER_RANK(mpi_cuda_gather_gpus_local)
+
+std::vector<EspressoGpuDevice> cuda_gather_gpus() {
+  return mpi_call(Communication::Result::master_rank,
+                  mpi_cuda_gather_gpus_local);
 }
 
 #endif /* CUDA */
diff --git a/src/core/cuda_init.hpp b/src/core/cuda_init.hpp
index eab42a7490f..2447c532078 100644
--- a/src/core/cuda_init.hpp
+++ b/src/core/cuda_init.hpp
@@ -30,20 +30,21 @@
  *  old datatypes, as it is intended for MPI communication.
  */
 struct EspressoGpuDevice {
-  /* Local CUDA device id */
+  /** Local CUDA device id */
   int id;
-  /* Node identification */
+  /** Local CUDA device name */
+  char name[64];
+  /** Node identification */
   char proc_name[64];
-  /* MPI process identification */
+  /** MPI process identification */
   int node;
-  /* Compute capability */
+  /** Compute capability (major) */
   int compute_capability_major;
+  /** Compute capability (minor) */
   int compute_capability_minor;
-  /* Name */
-  char name[64];
-  /* Total Memory */
-  int total_memory;
-  /* Number of cores */
+  /** Total Memory */
+  size_t total_memory;
+  /** Number of cores */
   int n_cores;
 };
 
@@ -53,62 +54,52 @@ void cuda_init();
 
 /** Get the number of CUDA devices.
  *
- *  @return the number of GPUs, or -1 if CUDA could not be
- *  initialized. The error message from CUDA can be found in \ref
- *  cuda_error.
+ *  @return the number of GPUs.
  */
 int cuda_get_n_gpus();
 
-/** Check that a given GPU is capable of what we need, that is, at
- *  least compute capability 1.1.
+/** Check that a given GPU has sufficient compute capability.
+ *  The minimal compute capability required by ESPResSo is
+ *  \ref computeCapabilityMinMajor . \ref computeCapabilityMinMinor .
  *
  *  @param dev CUDA device number
- *  @return \ref ES_OK if and only if the GPU with the given id is
- *          usable for CUDA computations. Only devices with compute
- *          capability of 1.1 or higher are ok, since atomic operations are
- *          required for CUDA-LB.
+ *  @return \ref ES_OK if the GPU meets the requirements, else \ref ES_ERROR.
  */
-int cuda_check_gpu(int dev);
+int cuda_check_gpu_compute_capability(int dev);
 
 /** Get the name of a CUDA device.
  *
- *  @param dev the CUDA device number to ask the name for
- *  @param name a buffer to write the name to, at least 64 characters
+ *  @param[in]  dev the CUDA device number to ask the name for
+ *  @param[out] name a buffer to write the name to, at least 64 characters
  */
 void cuda_get_gpu_name(int dev, char name[64]);
 
 /** Choose a device for future CUDA computations.
  *
  *  @param dev the device to use
- *  @return \ref ES_OK on success, \ref ES_ERROR else. The error
- *  message from CUDA can be found in \ref cuda_error.
  */
-int cuda_set_device(int dev);
+void cuda_set_device(int dev);
 
 /** Get the current CUDA device.
  *
- *  @return the current device's number or -1 if an error occurred. The error
- *  message from CUDA can be found in \ref cuda_error.
+ *  @return the current device's number.
  */
 int cuda_get_device();
 
 /** Test if actual CUDA device works.
  *  @return \ref ES_OK on success, \ref ES_ERROR else.
- *  The error message from CUDA can be found in \ref cuda_error.
  */
 int cuda_test_device_access();
 
-/** Gather unique list of CUDA devices on all nodes
- *  @return vector of device on master, empty vector on other nodes.
+/** Gather unique list of CUDA devices on all nodes.
+ *  @return vector of device properties.
  */
 std::vector<EspressoGpuDevice> cuda_gather_gpus();
 
 /** Get properties of a CUDA device
+ *  @param dev CUDA device number
  */
-int cuda_get_device_props(int dev, EspressoGpuDevice &d);
-
-/** Current error message of CUDA. */
-extern const char *cuda_error;
+EspressoGpuDevice cuda_get_device_props(int dev);
 
 #endif // ifdef CUDA
 #endif
diff --git a/src/core/cuda_init_cuda.cu b/src/core/cuda_init_cuda.cu
index 31292cd2d39..e434d7132f0 100644
--- a/src/core/cuda_init_cuda.cu
+++ b/src/core/cuda_init_cuda.cu
@@ -20,10 +20,12 @@
 #include <cuda.h>
 
 #include "cuda_init.hpp"
-#include "cuda_utils.hpp"
+#include "cuda_utils.cuh"
 
 #include <utils/constants.hpp>
 
+#include <cstring>
+
 #if defined(OMPI_MPI_H) || defined(_MPI_H)
 #error CU-file includes mpi.h! This should not happen!
 #endif
@@ -36,32 +38,20 @@ static const int computeCapabilityMinMajor = 3;
 static const int computeCapabilityMinMinor = 0;
 /**@}*/
 
-const char *cuda_error;
-
-void cuda_init() { cudaStreamCreate(&stream[0]); }
+void cuda_init() { CUDA_CHECK(cudaStreamCreate(&stream[0])) }
 
-/// get the number of CUDA devices.
 int cuda_get_n_gpus() {
   int deviceCount;
-  cudaError_t error = cudaGetDeviceCount(&deviceCount);
-  if (error != cudaSuccess) {
-    cuda_error = cudaGetErrorString(error);
-    return -1;
-  }
+  CUDA_CHECK(cudaGetDeviceCount(&deviceCount))
   return deviceCount;
 }
 
-int cuda_check_gpu(int dev) {
+int cuda_check_gpu_compute_capability(int dev) {
   cudaDeviceProp deviceProp;
-  cudaError_t error = cudaGetDeviceProperties(&deviceProp, dev);
-  if (error != cudaSuccess) {
-    cuda_error = cudaGetErrorString(error);
-    return ES_ERROR;
-  }
+  CUDA_CHECK(cudaGetDeviceProperties(&deviceProp, dev))
   if (deviceProp.major < computeCapabilityMinMajor ||
       (deviceProp.major == computeCapabilityMinMajor &&
        deviceProp.minor < computeCapabilityMinMinor)) {
-    cuda_error = "compute capability insufficient";
     return ES_ERROR;
   }
   return ES_OK;
@@ -69,53 +59,36 @@ int cuda_check_gpu(int dev) {
 
 void cuda_get_gpu_name(int dev, char name[64]) {
   cudaDeviceProp deviceProp;
-  cudaError_t error = cudaGetDeviceProperties(&deviceProp, dev);
-  if (error != cudaSuccess) {
-    cuda_error = cudaGetErrorString(error);
-    strncpy(name, "no GPU", 63);
-  } else {
-    strncpy(name, deviceProp.name, 63);
-  }
+  CUDA_CHECK(cudaGetDeviceProperties(&deviceProp, dev))
+  std::strncpy(name, deviceProp.name, 63);
   name[63] = 0;
 }
 
-int cuda_get_device_props(const int dev, EspressoGpuDevice &d) {
+EspressoGpuDevice cuda_get_device_props(const int dev) {
   cudaDeviceProp deviceProp;
-  cudaError_t error = cudaGetDeviceProperties(&deviceProp, dev);
-  if (error != cudaSuccess) {
-    cuda_error = cudaGetErrorString(error);
-    return ES_ERROR;
-  }
-  strncpy(d.name, deviceProp.name, 64);
-  d.id = dev;
-  d.total_memory = deviceProp.totalGlobalMem;
-  d.compute_capability_major = deviceProp.major;
-  d.compute_capability_minor = deviceProp.minor;
-  d.n_cores = deviceProp.multiProcessorCount;
-
-  return ES_OK;
+  CUDA_CHECK(cudaGetDeviceProperties(&deviceProp, dev))
+  EspressoGpuDevice device{dev,
+                           "",
+                           "",
+                           -1,
+                           deviceProp.major,
+                           deviceProp.minor,
+                           deviceProp.totalGlobalMem,
+                           deviceProp.multiProcessorCount};
+  std::strncpy(device.name, deviceProp.name, 64);
+  device.name[63] = '\0';
+  return device;
 }
 
-int cuda_set_device(int dev) {
-  cudaSetDevice(dev);
-  cudaStreamDestroy(stream[0]);
-  cudaError_t error = cudaStreamCreate(&stream[0]);
-
-  if (error != cudaSuccess) {
-    cuda_error = cudaGetErrorString(error);
-    throw std::runtime_error(cuda_error);
-  }
-
-  return ES_OK;
+void cuda_set_device(int dev) {
+  CUDA_CHECK(cudaSetDevice(dev))
+  CUDA_CHECK(cudaStreamDestroy(stream[0]))
+  CUDA_CHECK(cudaStreamCreate(&stream[0]))
 }
 
 int cuda_get_device() {
   int dev;
-  cudaError_t error = cudaGetDevice(&dev);
-  if (error != cudaSuccess) {
-    cuda_error = cudaGetErrorString(error);
-    return -1;
-  }
+  CUDA_CHECK(cudaGetDevice(&dev))
   return dev;
 }
 
@@ -126,23 +99,23 @@ int cuda_test_device_access() {
 
   err = cudaMalloc((void **)&d, sizeof(int));
   if (err != cudaSuccess) {
-    cuda_error = cudaGetErrorString(err);
-    return ES_ERROR;
+    throw cuda_runtime_error_cuda(err);
   }
   err = cudaMemcpy(d, &h, sizeof(int), cudaMemcpyHostToDevice);
   if (err != cudaSuccess) {
-    cuda_error = cudaGetErrorString(err);
-    return ES_ERROR;
+    cudaFree(d);
+    throw cuda_runtime_error_cuda(err);
   }
   h = 0;
   err = cudaMemcpy(&h, d, sizeof(int), cudaMemcpyDeviceToHost);
   cudaFree(d);
-
-  if ((h == 42) && (err == cudaSuccess)) {
-    return ES_OK;
+  if (err != cudaSuccess) {
+    throw cuda_runtime_error_cuda(err);
   }
-  cuda_error = cudaGetErrorString(err);
-  return ES_ERROR;
+  if (h != 42) {
+    return ES_ERROR;
+  }
+  return ES_OK;
 }
 
 #endif /* defined(CUDA) */
diff --git a/src/core/cuda_interface.cpp b/src/core/cuda_interface.cpp
index 3b6c07a8a27..a954a4e9ac8 100644
--- a/src/core/cuda_interface.cpp
+++ b/src/core/cuda_interface.cpp
@@ -96,7 +96,7 @@ void cuda_mpi_get_particles(
     /* pack local parts into buffer */
     pack_particles(particles, buffer.data());
 
-    Utils::Mpi::gather_buffer(buffer.data(), buffer.size(), comm_cart);
+    Utils::Mpi::gather_buffer(buffer, comm_cart);
   } else {
     particle_data_host.resize(n_part);
 
diff --git a/src/core/cuda_interface.hpp b/src/core/cuda_interface.hpp
index 6c518f649a2..0896d1bf703 100644
--- a/src/core/cuda_interface.hpp
+++ b/src/core/cuda_interface.hpp
@@ -57,7 +57,6 @@ struct CUDA_particle_data {
   /** particle position given from md part*/
   Vector3f p;
 
-#if defined(CUDA)
   /** particle id */
   int identity;
 #ifdef VIRTUAL_SITES
@@ -68,7 +67,6 @@ struct CUDA_particle_data {
 
   /** particle momentum struct velocity p.m->v*/
   Vector3f v;
-#endif
 
 #ifdef ROTATION
   Vector3f director;
diff --git a/src/core/cuda_utils.cuh b/src/core/cuda_utils.cuh
new file mode 100644
index 00000000000..5eec836394b
--- /dev/null
+++ b/src/core/cuda_utils.cuh
@@ -0,0 +1,97 @@
+/*
+ * Copyright (C) 2013-2019 The ESPResSo project
+ *
+ * This file is part of ESPResSo.
+ *
+ * ESPResSo is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * ESPResSo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+#ifndef _CUDA_UTILS_CUH
+#define _CUDA_UTILS_CUH
+
+#if !defined(__CUDACC__)
+#error Do not include CUDA headers in normal C++-code!!!
+#endif
+
+#include "cuda_utils.hpp"
+
+#include <cuda.h>
+
+#include <cassert>
+#include <string>
+
+class cuda_runtime_error_cuda : public cuda_runtime_error {
+public:
+  cuda_runtime_error_cuda(cudaError_t error)
+      : cuda_runtime_error(error_message(error)) {}
+
+private:
+  std::string error_message(cudaError_t error) {
+    const char *cuda_error = cudaGetErrorString(error);
+    return std::string("CUDA error: ") + cuda_error;
+  }
+};
+
+/** Convert CUDA error codes into runtime errors. */
+#define CUDA_CHECK(statement)                                                  \
+  {                                                                            \
+    cudaError_t const error_code = (statement);                                \
+    if (error_code != cudaSuccess) {                                           \
+      throw cuda_runtime_error_cuda(error_code);                               \
+    }                                                                          \
+  }
+
+/** CUDA streams for parallel computing on CPU and GPU */
+extern cudaStream_t stream[1];
+
+/** In case of error during CUDA memory allocation and memory copy, print
+ *  the error message and exit.
+ *  @param CU_err cuda error code
+ *  @param file  .cu file where the error took place
+ *  @param line  line of the file where the error took place
+ */
+void cuda_safe_mem_exit(cudaError_t CU_err, const char *file,
+                        unsigned int line);
+
+/** In case of error during a CUDA operation, print the error message and exit.
+ */
+void cuda_check_errors_exit(const dim3 &block, const dim3 &grid,
+                            const char *function, const char *file,
+                            unsigned int line);
+
+#define cuda_safe_mem(a) cuda_safe_mem_exit((a), __FILE__, __LINE__)
+
+/** Calculate @c dim_grid for CUDA kernel calls. */
+inline dim3 calculate_dim_grid(unsigned const threads_x,
+                               unsigned const blocks_per_grid_y,
+                               unsigned const threads_per_block) {
+  assert(threads_x >= 1);
+  assert(blocks_per_grid_y >= 1);
+  assert(threads_per_block >= 1);
+  auto const threads_y = threads_per_block * blocks_per_grid_y;
+  auto const blocks_per_grid_x = (threads_x + threads_y - 1) / threads_y;
+  return make_uint3(blocks_per_grid_x, blocks_per_grid_y, 1);
+}
+
+#define KERNELCALL_shared(_function, _grid, _block, _stream, ...)              \
+  _function<<<_grid, _block, _stream, stream[0]>>>(__VA_ARGS__);               \
+  cuda_check_errors_exit(_grid, _block, #_function, __FILE__, __LINE__);
+
+#define KERNELCALL_stream(_function, _grid, _block, _stream, ...)              \
+  _function<<<_grid, _block, 0, _stream>>>(__VA_ARGS__);                       \
+  cuda_check_errors_exit(_grid, _block, #_function, __FILE__, __LINE__);
+
+#define KERNELCALL(_function, _grid, _block, ...)                              \
+  KERNELCALL_shared(_function, _grid, _block, 0, ##__VA_ARGS__)
+
+#endif
diff --git a/src/core/cuda_utils.hpp b/src/core/cuda_utils.hpp
index 30cde8f791e..faf49dbb472 100644
--- a/src/core/cuda_utils.hpp
+++ b/src/core/cuda_utils.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2013-2019 The ESPResSo project
+ * Copyright (C) 2021 The ESPResSo project
  *
  * This file is part of ESPResSo.
  *
@@ -19,35 +19,17 @@
 #ifndef _CUDA_UTILS_HPP
 #define _CUDA_UTILS_HPP
 
-#if !defined(__CUDACC__)
-#error Do not include CUDA headers in normal C++-code!!!
-#endif
-
-/** cuda streams for parallel computing on cpu and gpu */
-extern cudaStream_t stream[1];
-
-/** Error output for memory allocation and memory copy
- *  @param err   cuda error code
- *  @param file  .cu file were the error took place
- *  @param line  line of the file were the error took place
- */
-void _cuda_safe_mem(cudaError_t err, const char *file, unsigned int line);
-
-void _cuda_check_errors(const dim3 &block, const dim3 &grid,
-                        const char *function, const char *file,
-                        unsigned int line);
-
-#define cuda_safe_mem(a) _cuda_safe_mem((a), __FILE__, __LINE__)
+#include "config.hpp"
 
-#define KERNELCALL_shared(_function, _grid, _block, _stream, ...)              \
-  _function<<<_grid, _block, _stream, stream[0]>>>(__VA_ARGS__);               \
-  _cuda_check_errors(_grid, _block, #_function, __FILE__, __LINE__);
+#ifdef CUDA
 
-#define KERNELCALL_stream(_function, _grid, _block, _stream, ...)              \
-  _function<<<_grid, _block, 0, _stream>>>(__VA_ARGS__);                       \
-  _cuda_check_errors(_grid, _block, #_function, __FILE__, __LINE__);
+#include <stdexcept>
+#include <string>
 
-#define KERNELCALL(_function, _grid, _block, ...)                              \
-  KERNELCALL_shared(_function, _grid, _block, 0, ##__VA_ARGS__)
+class cuda_runtime_error : public std::runtime_error {
+public:
+  cuda_runtime_error(std::string const &msg) : std::runtime_error(msg) {}
+};
 
+#endif // CUDA
 #endif
diff --git a/src/core/dpd.cpp b/src/core/dpd.cpp
index c79743126fe..657f00c965e 100644
--- a/src/core/dpd.cpp
+++ b/src/core/dpd.cpp
@@ -92,17 +92,6 @@ void dpd_init() {
   }
 }
 
-void dpd_update_params(double pref_scale) {
-  for (int type_a = 0; type_a < max_seen_particle_type; type_a++) {
-    for (int type_b = 0; type_b < max_seen_particle_type; type_b++) {
-      IA_parameters &ia_params = *get_ia_param(type_a, type_b);
-
-      ia_params.dpd_radial.pref *= pref_scale;
-      ia_params.dpd_trans.pref *= pref_scale;
-    }
-  }
-}
-
 static double weight(int type, double r_cut, double k, double r) {
   if (type == 0) {
     return 1.;
diff --git a/src/core/dpd.hpp b/src/core/dpd.hpp
index da7c4191dc2..0d8d8caa311 100644
--- a/src/core/dpd.hpp
+++ b/src/core/dpd.hpp
@@ -38,7 +38,6 @@ struct IA_parameters;
 int dpd_set_params(int part_type_a, int part_type_b, double gamma, double k,
                    double r_c, int wf, double tgamma, double tr_c, int twf);
 void dpd_init();
-void dpd_update_params(double pref2_scale);
 
 Utils::Vector3d dpd_pair_force(Particle const &p1, Particle const &p2,
                                IA_parameters const &ia_params,
diff --git a/src/core/electrostatics_magnetostatics/coulomb.cpp b/src/core/electrostatics_magnetostatics/coulomb.cpp
index 8ffbe210253..ce0990eb659 100644
--- a/src/core/electrostatics_magnetostatics/coulomb.cpp
+++ b/src/core/electrostatics_magnetostatics/coulomb.cpp
@@ -158,8 +158,8 @@ void deactivate() {
 }
 
 void update_dependent_particles() {
-  iccp3m_iteration(cell_structure.local_particles(),
-                   cell_structure.ghost_particles());
+  icc_iteration(cell_structure.local_particles(),
+                cell_structure.ghost_particles());
 }
 
 void on_observable_calc() {
@@ -183,8 +183,13 @@ void on_coulomb_change() {
 #ifdef P3M
 #ifdef CUDA
   case COULOMB_P3M_GPU:
-    if (this_node == 0)
-      p3m_gpu_init(p3m.params.cao, p3m.params.mesh, p3m.params.alpha);
+    if (this_node == 0) {
+      try {
+        p3m_gpu_init(p3m.params.cao, p3m.params.mesh, p3m.params.alpha);
+      } catch (std::runtime_error const &err) {
+        runtimeErrorMsg() << err.what();
+      }
+    }
     break;
 #endif
   case COULOMB_ELC_P3M:
@@ -359,23 +364,23 @@ double calc_energy_long_range(const ParticleRange &particles) {
   return energy;
 }
 
-int iccp3m_sanity_check() {
+int icc_sanity_check() {
   switch (coulomb.method) {
 #ifdef P3M
   case COULOMB_ELC_P3M: {
     if (elc_params.dielectric_contrast_on) {
-      runtimeErrorMsg() << "ICCP3M conflicts with ELC dielectric contrast";
+      runtimeErrorMsg() << "ICC conflicts with ELC dielectric contrast";
       return 1;
     }
     break;
   }
 #endif
   case COULOMB_DH: {
-    runtimeErrorMsg() << "ICCP3M does not work with Debye-Hueckel.";
+    runtimeErrorMsg() << "ICC does not work with Debye-Hueckel.";
     return 1;
   }
   case COULOMB_RF: {
-    runtimeErrorMsg() << "ICCP3M does not work with COULOMB_RF.";
+    runtimeErrorMsg() << "ICC does not work with COULOMB_RF.";
     return 1;
   }
   default:
@@ -384,7 +389,7 @@ int iccp3m_sanity_check() {
 
 #ifdef NPT
   if (integ_switch == INTEG_METHOD_NPT_ISO) {
-    runtimeErrorMsg() << "ICCP3M does not work in the NPT ensemble";
+    runtimeErrorMsg() << "ICC does not work in the NPT ensemble";
     return 1;
   }
 #endif
diff --git a/src/core/electrostatics_magnetostatics/coulomb.hpp b/src/core/electrostatics_magnetostatics/coulomb.hpp
index c6cc7015fa9..b0d4f78244c 100644
--- a/src/core/electrostatics_magnetostatics/coulomb.hpp
+++ b/src/core/electrostatics_magnetostatics/coulomb.hpp
@@ -75,7 +75,7 @@ void calc_long_range_force(const ParticleRange &particles);
 
 double calc_energy_long_range(const ParticleRange &particles);
 
-int iccp3m_sanity_check();
+int icc_sanity_check();
 
 int elc_sanity_check();
 
diff --git a/src/core/electrostatics_magnetostatics/elc.cpp b/src/core/electrostatics_magnetostatics/elc.cpp
index ac44fbe4437..41ded584289 100644
--- a/src/core/electrostatics_magnetostatics/elc.cpp
+++ b/src/core/electrostatics_magnetostatics/elc.cpp
@@ -45,6 +45,7 @@
 
 #include <mpi.h>
 
+#include <cassert>
 #include <cmath>
 #include <cstddef>
 #include <vector>
@@ -92,6 +93,9 @@ ELC_struct elc_params = {1e100, 10,    1, 0, true, true, false, 1,
 #define PQECCM 7
 /**@}*/
 
+/** ELC axes (x and y directions)*/
+enum class PoQ : int { P, Q };
+
 /** temporary buffers for product decomposition */
 static std::vector<double> partblk;
 /** collected data from the other cells */
@@ -112,27 +116,7 @@ static std::vector<SCCache> scycache;
  * LOCAL FUNCTIONS
  ****************************************/
 
-static void distribute(int size);
-/** \name q=0 per frequency code */
-/**@{*/
-static void setup_P(int p, double omega, const ParticleRange &particles);
-static void add_P_force(const ParticleRange &particles);
-static double P_energy(double omega, int n_part);
-/**@}*/
-/** \name p=0 per frequency code */
-/**@{*/
-static void setup_Q(int q, double omega, const ParticleRange &particles);
-static void add_Q_force(const ParticleRange &particles);
-static double Q_energy(double omega, int n_part);
-/**@}*/
-/** \name p,q <> 0 per frequency code */
-/**@{*/
-static void setup_PQ(int p, int q, double omega,
-                     const ParticleRange &particles);
-static void add_PQ_force(int p, int q, double omega,
-                         const ParticleRange &particles);
-static double PQ_energy(double omega, int n_part);
-/**@}*/
+static void distribute(std::size_t size);
 static void add_dipole_force(const ParticleRange &particles);
 static double dipole_energy(const ParticleRange &particles);
 static double z_energy(const ParticleRange &particles);
@@ -147,7 +131,7 @@ void ELC_setup_constants() {
 }
 
 /**
- * @brief Calculated cached sin/cos values for one direction.
+ * @brief Calculate cached sin/cos values for one direction.
  *
  * @tparam dir Index of the dimension to consider (e.g. 0 for x ...).
  *
@@ -157,13 +141,13 @@ void ELC_setup_constants() {
  * @return Calculated values.
  */
 template <size_t dir>
-static std::vector<SCCache> sc_cache(const ParticleRange &particles, int n_freq,
-                                     double u) {
+static std::vector<SCCache> calc_sc_cache(const ParticleRange &particles,
+                                          std::size_t n_freq, double u) {
   constexpr double c_2pi = 2 * Utils::pi();
   auto const n_part = particles.size();
   std::vector<SCCache> ret(n_freq * n_part);
 
-  for (size_t freq = 1; freq <= n_freq; freq++) {
+  for (std::size_t freq = 1; freq <= n_freq; freq++) {
     auto const pref = c_2pi * u * static_cast<double>(freq);
 
     size_t o = (freq - 1) * n_part;
@@ -176,51 +160,54 @@ static std::vector<SCCache> sc_cache(const ParticleRange &particles, int n_freq,
   return ret;
 }
 
-static void prepare_sc_cache(const ParticleRange &particles, int n_freq_x,
-                             double u_x, int n_freq_y, double u_y) {
-  scxcache = sc_cache<0>(particles, n_freq_x, u_x);
-  scycache = sc_cache<1>(particles, n_freq_y, u_y);
+static void prepare_sc_cache(const ParticleRange &particles,
+                             std::size_t n_freq_x, double u_x,
+                             std::size_t n_freq_y, double u_y) {
+  scxcache = calc_sc_cache<0>(particles, n_freq_x, u_x);
+  scycache = calc_sc_cache<1>(particles, n_freq_y, u_y);
 }
 
 /*****************************************************************/
 /* data distribution */
 /*****************************************************************/
 
-inline void clear_vec(double *pdc, int size) {
-  for (int i = 0; i < size; i++)
+inline void clear_vec(double *pdc, std::size_t size) {
+  for (std::size_t i = 0; i < size; i++)
     pdc[i] = 0;
 }
 
-inline void copy_vec(double *pdc_d, double const *pdc_s, int size) {
-  for (int i = 0; i < size; i++)
+inline void copy_vec(double *pdc_d, double const *pdc_s, std::size_t size) {
+  for (std::size_t i = 0; i < size; i++)
     pdc_d[i] = pdc_s[i];
 }
 
 inline void add_vec(double *pdc_d, double const *pdc_s1, double const *pdc_s2,
-                    int size) {
-  for (int i = 0; i < size; i++)
+                    std::size_t size) {
+  for (std::size_t i = 0; i < size; i++)
     pdc_d[i] = pdc_s1[i] + pdc_s2[i];
 }
 
 inline void addscale_vec(double *pdc_d, double scale, double const *pdc_s1,
-                         double const *pdc_s2, int size) {
-  for (int i = 0; i < size; i++)
+                         double const *pdc_s2, std::size_t size) {
+  for (std::size_t i = 0; i < size; i++)
     pdc_d[i] = scale * pdc_s1[i] + pdc_s2[i];
 }
 
-inline void scale_vec(double scale, double *pdc, int size) {
-  for (int i = 0; i < size; i++)
+inline void scale_vec(double scale, double *pdc, std::size_t size) {
+  for (std::size_t i = 0; i < size; i++)
     pdc[i] *= scale;
 }
 
-inline double *block(double *p, int index, int size) {
+inline double *block(double *p, std::size_t index, std::size_t size) {
   return &p[index * size];
 }
 
-void distribute(int size) {
+void distribute(std::size_t size) {
+  assert(size <= 8);
   double send_buf[8];
   copy_vec(send_buf, gblcblk, size);
-  MPI_Allreduce(send_buf, gblcblk, size, MPI_DOUBLE, MPI_SUM, comm_cart);
+  MPI_Allreduce(send_buf, gblcblk, static_cast<int>(size), MPI_DOUBLE, MPI_SUM,
+                comm_cart);
 }
 
 /** Checks if a charged particle is in the forbidden gap region
@@ -246,7 +233,7 @@ inline void check_gap_elc(const Particle &p) {
  */
 static void add_dipole_force(const ParticleRange &particles) {
   double const pref = coulomb.prefactor * 4 * Utils::pi() * ux * uy * uz;
-  int const size = 3;
+  constexpr std::size_t size = 3;
 
   auto local_particles = particles;
 
@@ -311,7 +298,7 @@ static void add_dipole_force(const ParticleRange &particles) {
  */
 static double dipole_energy(const ParticleRange &particles) {
   double const pref = coulomb.prefactor * 2 * Utils::pi() * ux * uy * uz;
-  int const size = 7;
+  constexpr std::size_t size = 7;
   /* for nonneutral systems, this shift gives the background contribution
      (rsp. for this shift, the DM of the background is zero) */
   double const shift = 0.5 * box_geo.length()[2];
@@ -354,32 +341,32 @@ static double dipole_energy(const ParticleRange &particles) {
   distribute(size);
 
   // Yeh + Berkowitz term @cite yeh99a
-  double eng = 2 * pref * (Utils::sqr(gblcblk[2]) + gblcblk[2] * gblcblk[3]);
+  double energy = 2 * pref * (Utils::sqr(gblcblk[2]) + gblcblk[2] * gblcblk[3]);
 
   if (!elc_params.neutralize) {
     // SUBTRACT the energy of the P3M homogeneous neutralizing background
-    eng += 2 * pref *
-           (-gblcblk[0] * gblcblk[4] -
-            (.25 - .5 / 3.) * Utils::sqr(gblcblk[0] * box_geo.length()[2]));
+    energy += 2 * pref *
+              (-gblcblk[0] * gblcblk[4] -
+               (.25 - .5 / 3.) * Utils::sqr(gblcblk[0] * box_geo.length()[2]));
   }
 
   if (elc_params.dielectric_contrast_on) {
     if (elc_params.const_pot) {
       // zero potential difference contribution
-      eng += pref * height_inverse / uz * Utils::sqr(gblcblk[6]);
+      energy += pref * height_inverse / uz * Utils::sqr(gblcblk[6]);
       // external potential shift contribution
-      eng -= 2 * elc_params.pot_diff * height_inverse * gblcblk[6];
+      energy -= 2 * elc_params.pot_diff * height_inverse * gblcblk[6];
     }
 
     /* counter the P3M homogeneous background contribution to the
        boundaries. We never need that, since a homogeneous background
        spanning the artificial boundary layers is aphysical. */
-    eng += pref * (-(gblcblk[1] * gblcblk[4] + gblcblk[0] * gblcblk[5]) -
-                   (1. - 2. / 3.) * gblcblk[0] * gblcblk[1] *
-                       Utils::sqr(box_geo.length()[2]));
+    energy += pref * (-(gblcblk[1] * gblcblk[4] + gblcblk[0] * gblcblk[5]) -
+                      (1. - 2. / 3.) * gblcblk[0] * gblcblk[1] *
+                          Utils::sqr(box_geo.length()[2]));
   }
 
-  return this_node == 0 ? eng : 0;
+  return this_node == 0 ? energy : 0;
 }
 
 /*****************************************************************/
@@ -405,7 +392,7 @@ inline double image_sum_t(double q, double z) {
 /*****************************************************************/
 static double z_energy(const ParticleRange &particles) {
   double const pref = coulomb.prefactor * 2 * Utils::pi() * ux * uy;
-  int const size = 4;
+  constexpr std::size_t size = 4;
 
   /* for nonneutral systems, this shift gives the background contribution
      (rsp. for this shift, the DM of the background is zero) */
@@ -473,20 +460,20 @@ static double z_energy(const ParticleRange &particles) {
   }
   distribute(size);
 
-  double eng = 0;
+  double energy = 0;
   if (this_node == 0)
-    eng -= pref * (gblcblk[1] * gblcblk[2] - gblcblk[0] * gblcblk[3]);
+    energy -= gblcblk[1] * gblcblk[2] - gblcblk[0] * gblcblk[3];
 
-  return eng;
+  return pref * energy;
 }
 
 /*****************************************************************/
 static void add_z_force(const ParticleRange &particles) {
   double const pref = coulomb.prefactor * 2 * Utils::pi() * ux * uy;
+  constexpr std::size_t size = 1;
 
   if (elc_params.dielectric_contrast_on) {
     auto local_particles = particles;
-    int const size = 1;
     if (elc_params.const_pot) {
       clear_vec(gblcblk, size);
       /* just counter the 2 pi |z| contribution stemming from P3M */
@@ -536,10 +523,15 @@ static void add_z_force(const ParticleRange &particles) {
 /* PoQ exp sum */
 /*****************************************************************/
 
-static void setup_P(int p, double omega, const ParticleRange &particles) {
+/** \name q=0 or p=0 per frequency code */
+/**@{*/
+template <PoQ axis>
+void setup_PoQ(std::size_t index, double omega,
+               const ParticleRange &particles) {
+  assert(index >= 1);
   double const pref_di = coulomb.prefactor * 4 * Utils::pi() * ux * uy;
   double const pref = -pref_di / expm1(omega * box_geo.length()[2]);
-  int const size = 4;
+  constexpr std::size_t size = 4;
   double lclimgebot[4], lclimgetop[4], lclimge[4];
   double fac_delta_mid_bot = 1, fac_delta_mid_top = 1, fac_delta = 1;
 
@@ -554,16 +546,17 @@ static void setup_P(int p, double omega, const ParticleRange &particles) {
 
   clear_vec(lclimge, size);
   clear_vec(gblcblk, size);
+  auto &sc_cache = (axis == PoQ::P) ? scxcache : scycache;
 
-  int ic = 0;
-  auto const o = static_cast<int>((p - 1) * particles.size());
+  std::size_t ic = 0;
+  auto const o = (index - 1) * particles.size();
   for (auto &p : particles) {
     double e = exp(omega * p.r.p[2]);
 
-    partblk[size * ic + POQESM] = p.p.q * scxcache[o + ic].s / e;
-    partblk[size * ic + POQESP] = p.p.q * scxcache[o + ic].s * e;
-    partblk[size * ic + POQECM] = p.p.q * scxcache[o + ic].c / e;
-    partblk[size * ic + POQECP] = p.p.q * scxcache[o + ic].c * e;
+    partblk[size * ic + POQESM] = p.p.q * sc_cache[o + ic].s / e;
+    partblk[size * ic + POQESP] = p.p.q * sc_cache[o + ic].s * e;
+    partblk[size * ic + POQECM] = p.p.q * sc_cache[o + ic].c / e;
+    partblk[size * ic + POQECP] = p.p.q * sc_cache[o + ic].c * e;
 
     add_vec(gblcblk, gblcblk, block(partblk.data(), ic, size), size);
 
@@ -575,10 +568,10 @@ static void setup_P(int p, double omega, const ParticleRange &particles) {
 
         double const scale = p.p.q * elc_params.delta_mid_bot;
 
-        lclimgebot[POQESM] = scxcache[o + ic].s / e;
-        lclimgebot[POQESP] = scxcache[o + ic].s * e;
-        lclimgebot[POQECM] = scxcache[o + ic].c / e;
-        lclimgebot[POQECP] = scxcache[o + ic].c * e;
+        lclimgebot[POQESM] = sc_cache[o + ic].s / e;
+        lclimgebot[POQESP] = sc_cache[o + ic].s * e;
+        lclimgebot[POQECM] = sc_cache[o + ic].c / e;
+        lclimgebot[POQECP] = sc_cache[o + ic].c * e;
 
         addscale_vec(gblcblk, scale, lclimgebot, gblcblk, size);
 
@@ -595,8 +588,8 @@ static void setup_P(int p, double omega, const ParticleRange &particles) {
             fac_delta_mid_bot;
       }
 
-      lclimge[POQESP] += p.p.q * scxcache[o + ic].s * e;
-      lclimge[POQECP] += p.p.q * scxcache[o + ic].c * e;
+      lclimge[POQESP] += p.p.q * sc_cache[o + ic].s * e;
+      lclimge[POQECP] += p.p.q * sc_cache[o + ic].c * e;
 
       if (p.r.p[2] > (elc_params.h -
                       elc_params.space_layer)) { // handle the upper case now
@@ -605,14 +598,14 @@ static void setup_P(int p, double omega, const ParticleRange &particles) {
 
         double const scale = p.p.q * elc_params.delta_mid_top;
 
-        lclimgetop[POQESM] = scxcache[o + ic].s / e;
-        lclimgetop[POQESP] = scxcache[o + ic].s * e;
-        lclimgetop[POQECM] = scxcache[o + ic].c / e;
-        lclimgetop[POQECP] = scxcache[o + ic].c * e;
+        lclimgetop[POQESM] = sc_cache[o + ic].s / e;
+        lclimgetop[POQESP] = sc_cache[o + ic].s * e;
+        lclimgetop[POQECM] = sc_cache[o + ic].c / e;
+        lclimgetop[POQECP] = sc_cache[o + ic].c * e;
 
         addscale_vec(gblcblk, scale, lclimgetop, gblcblk, size);
 
-        e = (exp(omega * (p.r.p[2] - 4 * elc_params.h)) *
+        e = (exp(omega * (+p.r.p[2] - 4 * elc_params.h)) *
                  elc_params.delta_mid_top +
              exp(omega * (-p.r.p[2] - 2 * elc_params.h))) *
             fac_delta;
@@ -625,113 +618,8 @@ static void setup_P(int p, double omega, const ParticleRange &particles) {
             fac_delta_mid_top;
       }
 
-      lclimge[POQESM] += p.p.q * scxcache[o + ic].s * e;
-      lclimge[POQECM] += p.p.q * scxcache[o + ic].c * e;
-    }
-
-    ic++;
-  }
-
-  scale_vec(pref, gblcblk, size);
-
-  if (elc_params.dielectric_contrast_on) {
-    scale_vec(pref_di, lclimge, size);
-    add_vec(gblcblk, gblcblk, lclimge, size);
-  }
-}
-
-static void setup_Q(int q, double omega, const ParticleRange &particles) {
-  double const pref_di = coulomb.prefactor * 4 * Utils::pi() * ux * uy;
-  double const pref = -pref_di / expm1(omega * box_geo.length()[2]);
-  int const size = 4;
-  double lclimgebot[4], lclimgetop[4], lclimge[4];
-  double fac_delta_mid_bot = 1, fac_delta_mid_top = 1, fac_delta = 1;
-
-  if (elc_params.dielectric_contrast_on) {
-    double const fac_elc =
-        1.0 / (1 - elc_params.delta_mid_top * elc_params.delta_mid_bot *
-                       exp(-omega * 2 * elc_params.h));
-    fac_delta_mid_bot = elc_params.delta_mid_bot * fac_elc;
-    fac_delta_mid_top = elc_params.delta_mid_top * fac_elc;
-    fac_delta = fac_delta_mid_bot * elc_params.delta_mid_top;
-  }
-
-  clear_vec(lclimge, size);
-  clear_vec(gblcblk, size);
-
-  int ic = 0;
-  auto const o = static_cast<int>((q - 1) * particles.size());
-  for (auto &p : particles) {
-    double e = exp(omega * p.r.p[2]);
-
-    partblk[size * ic + POQESM] = p.p.q * scycache[o + ic].s / e;
-    partblk[size * ic + POQESP] = p.p.q * scycache[o + ic].s * e;
-    partblk[size * ic + POQECM] = p.p.q * scycache[o + ic].c / e;
-    partblk[size * ic + POQECP] = p.p.q * scycache[o + ic].c * e;
-
-    add_vec(gblcblk, gblcblk, block(partblk.data(), ic, size), size);
-
-    if (elc_params.dielectric_contrast_on) {
-      if (p.r.p[2] < elc_params.space_layer) { // handle the lower case first
-        // negative sign before omega is okay here as the image is located
-        // at -p.r.p[2]
-
-        e = exp(-omega * p.r.p[2]);
-
-        double const scale = p.p.q * elc_params.delta_mid_bot;
-
-        lclimgebot[POQESM] = scycache[o + ic].s / e;
-        lclimgebot[POQESP] = scycache[o + ic].s * e;
-        lclimgebot[POQECM] = scycache[o + ic].c / e;
-        lclimgebot[POQECP] = scycache[o + ic].c * e;
-
-        addscale_vec(gblcblk, scale, lclimgebot, gblcblk, size);
-
-        e = (exp(omega * (-p.r.p[2] - 2 * elc_params.h)) *
-                 elc_params.delta_mid_bot +
-             exp(omega * (p.r.p[2] - 2 * elc_params.h))) *
-            fac_delta;
-
-      } else {
-
-        e = (exp(omega * (-p.r.p[2])) +
-             exp(omega * (p.r.p[2] - 2 * elc_params.h)) *
-                 elc_params.delta_mid_top) *
-            fac_delta_mid_bot;
-      }
-
-      lclimge[POQESP] += p.p.q * scycache[o + ic].s * e;
-      lclimge[POQECP] += p.p.q * scycache[o + ic].c * e;
-
-      if (p.r.p[2] > (elc_params.h -
-                      elc_params.space_layer)) { // handle the upper case now
-
-        e = exp(omega * (2 * elc_params.h - p.r.p[2]));
-
-        double const scale = p.p.q * elc_params.delta_mid_top;
-
-        lclimgetop[POQESM] = scycache[o + ic].s / e;
-        lclimgetop[POQESP] = scycache[o + ic].s * e;
-        lclimgetop[POQECM] = scycache[o + ic].c / e;
-        lclimgetop[POQECP] = scycache[o + ic].c * e;
-
-        addscale_vec(gblcblk, scale, lclimgetop, gblcblk, size);
-
-        e = (exp(omega * (p.r.p[2] - 4 * elc_params.h)) *
-                 elc_params.delta_mid_top +
-             exp(omega * (-p.r.p[2] - 2 * elc_params.h))) *
-            fac_delta;
-
-      } else {
-
-        e = (exp(omega * (p.r.p[2] - 2 * elc_params.h)) +
-             exp(omega * (-p.r.p[2] - 2 * elc_params.h)) *
-                 elc_params.delta_mid_bot) *
-            fac_delta_mid_top;
-      }
-
-      lclimge[POQESM] += p.p.q * scycache[o + ic].s * e;
-      lclimge[POQECM] += p.p.q * scycache[o + ic].c * e;
+      lclimge[POQESM] += p.p.q * sc_cache[o + ic].s * e;
+      lclimge[POQECM] += p.p.q * sc_cache[o + ic].c * e;
     }
 
     ic++;
@@ -745,12 +633,13 @@ static void setup_Q(int q, double omega, const ParticleRange &particles) {
   }
 }
 
-static void add_P_force(const ParticleRange &particles) {
-  int const size = 4;
+template <PoQ axis> void add_PoQ_force(const ParticleRange &particles) {
+  constexpr auto i = static_cast<int>(axis);
+  constexpr std::size_t size = 4;
 
-  int ic = 0;
+  std::size_t ic = 0;
   for (auto &p : particles) {
-    p.f.f[0] += partblk[size * ic + POQESM] * gblcblk[POQECP] -
+    p.f.f[i] += partblk[size * ic + POQESM] * gblcblk[POQECP] -
                 partblk[size * ic + POQECM] * gblcblk[POQESP] +
                 partblk[size * ic + POQESP] * gblcblk[POQECM] -
                 partblk[size * ic + POQECP] * gblcblk[POQESM];
@@ -762,61 +651,34 @@ static void add_P_force(const ParticleRange &particles) {
   }
 }
 
-static double P_energy(double omega, int n_part) {
-  int const size = 4;
-  double eng = 0;
-  double const pref = 1 / omega;
+static double PoQ_energy(double omega, std::size_t n_part) {
+  constexpr std::size_t size = 4;
 
-  for (int ic = 0; ic < n_part; ic++) {
-    eng += pref * (partblk[size * ic + POQECM] * gblcblk[POQECP] +
-                   partblk[size * ic + POQESM] * gblcblk[POQESP] +
-                   partblk[size * ic + POQECP] * gblcblk[POQECM] +
-                   partblk[size * ic + POQESP] * gblcblk[POQESM]);
+  double energy = 0;
+  for (std::size_t ic = 0; ic < n_part; ic++) {
+    energy += partblk[size * ic + POQECM] * gblcblk[POQECP] +
+              partblk[size * ic + POQESM] * gblcblk[POQESP] +
+              partblk[size * ic + POQECP] * gblcblk[POQECM] +
+              partblk[size * ic + POQESP] * gblcblk[POQESM];
   }
 
-  return eng;
-}
-
-static void add_Q_force(const ParticleRange &particles) {
-  int const size = 4;
-
-  int ic = 0;
-  for (auto &p : particles) {
-    p.f.f[1] += partblk[size * ic + POQESM] * gblcblk[POQECP] -
-                partblk[size * ic + POQECM] * gblcblk[POQESP] +
-                partblk[size * ic + POQESP] * gblcblk[POQECM] -
-                partblk[size * ic + POQECP] * gblcblk[POQESM];
-    p.f.f[2] += partblk[size * ic + POQECM] * gblcblk[POQECP] +
-                partblk[size * ic + POQESM] * gblcblk[POQESP] -
-                partblk[size * ic + POQECP] * gblcblk[POQECM] -
-                partblk[size * ic + POQESP] * gblcblk[POQESM];
-    ic++;
-  }
-}
-
-static double Q_energy(double omega, int n_part) {
-  int const size = 4;
-  double eng = 0;
-  double const pref = 1 / omega;
-
-  for (int ic = 0; ic < n_part; ic++) {
-    eng += pref * (partblk[size * ic + POQECM] * gblcblk[POQECP] +
-                   partblk[size * ic + POQESM] * gblcblk[POQESP] +
-                   partblk[size * ic + POQECP] * gblcblk[POQECM] +
-                   partblk[size * ic + POQESP] * gblcblk[POQESM]);
-  }
-  return eng;
+  return energy / omega;
 }
+/**@}*/
 
 /*****************************************************************/
 /* PQ particle blocks */
 /*****************************************************************/
 
-static void setup_PQ(int p, int q, double omega,
+/** \name p,q <> 0 per frequency code */
+/**@{*/
+static void setup_PQ(std::size_t index_p, std::size_t index_q, double omega,
                      const ParticleRange &particles) {
+  assert(index_p >= 1);
+  assert(index_q >= 1);
   double const pref_di = coulomb.prefactor * 8 * Utils::pi() * ux * uy;
   double const pref = -pref_di / expm1(omega * box_geo.length()[2]);
-  int const size = 8;
+  constexpr std::size_t size = 8;
   double lclimgebot[8], lclimgetop[8], lclimge[8];
   double fac_delta_mid_bot = 1, fac_delta_mid_top = 1, fac_delta = 1;
   if (elc_params.dielectric_contrast_on) {
@@ -831,9 +693,9 @@ static void setup_PQ(int p, int q, double omega,
   clear_vec(lclimge, size);
   clear_vec(gblcblk, size);
 
-  int ic = 0;
-  auto const ox = static_cast<int>((p - 1) * particles.size());
-  auto const oy = static_cast<int>((q - 1) * particles.size());
+  std::size_t ic = 0;
+  auto const ox = (index_p - 1) * particles.size();
+  auto const oy = (index_q - 1) * particles.size();
   for (auto const &p : particles) {
     double e = exp(omega * p.r.p[2]);
 
@@ -941,14 +803,14 @@ static void setup_PQ(int p, int q, double omega,
   }
 }
 
-static void add_PQ_force(int p, int q, double omega,
+static void add_PQ_force(std::size_t index_p, std::size_t index_q, double omega,
                          const ParticleRange &particles) {
   constexpr double c_2pi = 2 * Utils::pi();
-  double const pref_x = c_2pi * ux * p / omega;
-  double const pref_y = c_2pi * uy * q / omega;
-  int const size = 8;
+  double const pref_x = c_2pi * ux * static_cast<double>(index_p) / omega;
+  double const pref_y = c_2pi * uy * static_cast<double>(index_q) / omega;
+  constexpr std::size_t size = 8;
 
-  int ic = 0;
+  std::size_t ic = 0;
   for (auto &p : particles) {
     p.f.f[0] += pref_x * (partblk[size * ic + PQESCM] * gblcblk[PQECCP] +
                           partblk[size * ic + PQESSM] * gblcblk[PQECSP] -
@@ -978,23 +840,23 @@ static void add_PQ_force(int p, int q, double omega,
   }
 }
 
-static double PQ_energy(double omega, int n_part) {
-  int const size = 8;
-  double eng = 0;
-  double const pref = 1 / omega;
-
-  for (int ic = 0; ic < n_part; ic++) {
-    eng += pref * (partblk[size * ic + PQECCM] * gblcblk[PQECCP] +
-                   partblk[size * ic + PQECSM] * gblcblk[PQECSP] +
-                   partblk[size * ic + PQESCM] * gblcblk[PQESCP] +
-                   partblk[size * ic + PQESSM] * gblcblk[PQESSP] +
-                   partblk[size * ic + PQECCP] * gblcblk[PQECCM] +
-                   partblk[size * ic + PQECSP] * gblcblk[PQECSM] +
-                   partblk[size * ic + PQESCP] * gblcblk[PQESCM] +
-                   partblk[size * ic + PQESSP] * gblcblk[PQESSM]);
+static double PQ_energy(double omega, std::size_t n_part) {
+  constexpr std::size_t size = 8;
+
+  double energy = 0;
+  for (std::size_t ic = 0; ic < n_part; ic++) {
+    energy += partblk[size * ic + PQECCM] * gblcblk[PQECCP] +
+              partblk[size * ic + PQECSM] * gblcblk[PQECSP] +
+              partblk[size * ic + PQESCM] * gblcblk[PQESCP] +
+              partblk[size * ic + PQESSM] * gblcblk[PQESSP] +
+              partblk[size * ic + PQECCP] * gblcblk[PQECCM] +
+              partblk[size * ic + PQECSP] * gblcblk[PQECSM] +
+              partblk[size * ic + PQESCP] * gblcblk[PQESCM] +
+              partblk[size * ic + PQESSP] * gblcblk[PQESSM];
   }
-  return eng;
+  return energy / omega;
 }
+/**@}*/
 
 /*****************************************************************/
 /* main loops */
@@ -1002,8 +864,8 @@ static double PQ_energy(double omega, int n_part) {
 
 void ELC_add_force(const ParticleRange &particles) {
   constexpr double c_2pi = 2 * Utils::pi();
-  auto const n_scxcache = int(ceil(elc_params.far_cut / ux) + 1);
-  auto const n_scycache = int(ceil(elc_params.far_cut / uy) + 1);
+  auto const n_scxcache = std::size_t(ceil(elc_params.far_cut / ux) + 1);
+  auto const n_scycache = std::size_t(ceil(elc_params.far_cut / uy) + 1);
 
   prepare_sc_cache(particles, n_scxcache, ux, n_scycache, uy);
   partblk.resize(particles.size() * 8);
@@ -1012,26 +874,35 @@ void ELC_add_force(const ParticleRange &particles) {
   add_z_force(particles);
 
   /* the second condition is just for the case of numerical accident */
-  for (int p = 1; ux * (p - 1) < elc_params.far_cut && p <= n_scxcache; p++) {
-    auto const omega = c_2pi * ux * p;
-    setup_P(p, omega, particles);
+  for (std::size_t p = 1;
+       ux * static_cast<double>(p - 1) < elc_params.far_cut && p <= n_scxcache;
+       p++) {
+    auto const omega = c_2pi * ux * static_cast<double>(p);
+    setup_PoQ<PoQ::P>(p, omega, particles);
     distribute(4);
-    add_P_force(particles);
+    add_PoQ_force<PoQ::P>(particles);
   }
 
-  for (int q = 1; uy * (q - 1) < elc_params.far_cut && q <= n_scycache; q++) {
-    auto const omega = c_2pi * uy * q;
-    setup_Q(q, omega, particles);
+  for (std::size_t q = 1;
+       uy * static_cast<double>(q - 1) < elc_params.far_cut && q <= n_scycache;
+       q++) {
+    auto const omega = c_2pi * uy * static_cast<double>(q);
+    setup_PoQ<PoQ::Q>(q, omega, particles);
     distribute(4);
-    add_Q_force(particles);
+    add_PoQ_force<PoQ::Q>(particles);
   }
 
-  for (int p = 1; ux * (p - 1) < elc_params.far_cut && p <= n_scxcache; p++) {
-    for (int q = 1; Utils::sqr(ux * (p - 1)) + Utils::sqr(uy * (q - 1)) <
-                        elc_params.far_cut2 &&
-                    q <= n_scycache;
+  for (std::size_t p = 1;
+       ux * static_cast<double>(p - 1) < elc_params.far_cut && p <= n_scxcache;
+       p++) {
+    for (std::size_t q = 1;
+         Utils::sqr(ux * static_cast<double>(p - 1)) +
+                 Utils::sqr(uy * static_cast<double>(q - 1)) <
+             elc_params.far_cut2 &&
+         q <= n_scycache;
          q++) {
-      auto const omega = c_2pi * sqrt(Utils::sqr(ux * p) + Utils::sqr(uy * q));
+      auto const omega = c_2pi * sqrt(Utils::sqr(ux * static_cast<double>(p)) +
+                                      Utils::sqr(uy * static_cast<double>(q)));
       setup_PQ(p, q, omega, particles);
       distribute(8);
       add_PQ_force(p, q, omega, particles);
@@ -1041,42 +912,53 @@ void ELC_add_force(const ParticleRange &particles) {
 
 double ELC_energy(const ParticleRange &particles) {
   constexpr double c_2pi = 2 * Utils::pi();
-  auto eng = dipole_energy(particles);
-  eng += z_energy(particles);
+  auto energy = dipole_energy(particles);
+  energy += z_energy(particles);
 
-  auto const n_scxcache = int(ceil(elc_params.far_cut / ux) + 1);
-  auto const n_scycache = int(ceil(elc_params.far_cut / uy) + 1);
+  auto const n_scxcache = std::size_t(ceil(elc_params.far_cut / ux) + 1);
+  auto const n_scycache = std::size_t(ceil(elc_params.far_cut / uy) + 1);
   prepare_sc_cache(particles, n_scxcache, ux, n_scycache, uy);
 
   auto const n_localpart = particles.size();
   partblk.resize(n_localpart * 8);
 
   /* the second condition is just for the case of numerical accident */
-  for (int p = 1; ux * (p - 1) < elc_params.far_cut && p <= n_scxcache; p++) {
-    auto const omega = c_2pi * ux * p;
-    setup_P(p, omega, particles);
+  for (std::size_t p = 1;
+       ux * static_cast<double>(p - 1) < elc_params.far_cut && p <= n_scxcache;
+       p++) {
+    auto const omega = c_2pi * ux * static_cast<double>(p);
+    setup_PoQ<PoQ::P>(p, omega, particles);
     distribute(4);
-    eng += P_energy(omega, n_localpart);
+    energy += PoQ_energy(omega, n_localpart);
   }
-  for (int q = 1; uy * (q - 1) < elc_params.far_cut && q <= n_scycache; q++) {
-    auto const omega = c_2pi * uy * q;
-    setup_Q(q, omega, particles);
+
+  for (std::size_t q = 1;
+       uy * static_cast<double>(q - 1) < elc_params.far_cut && q <= n_scycache;
+       q++) {
+    auto const omega = c_2pi * uy * static_cast<double>(q);
+    setup_PoQ<PoQ::Q>(q, omega, particles);
     distribute(4);
-    eng += Q_energy(omega, n_localpart);
+    energy += PoQ_energy(omega, n_localpart);
   }
-  for (int p = 1; ux * (p - 1) < elc_params.far_cut && p <= n_scxcache; p++) {
-    for (int q = 1; Utils::sqr(ux * (p - 1)) + Utils::sqr(uy * (q - 1)) <
-                        elc_params.far_cut2 &&
-                    q <= n_scycache;
+
+  for (std::size_t p = 1;
+       ux * static_cast<double>(p - 1) < elc_params.far_cut && p <= n_scxcache;
+       p++) {
+    for (std::size_t q = 1;
+         Utils::sqr(ux * static_cast<double>(p - 1)) +
+                 Utils::sqr(uy * static_cast<double>(q - 1)) <
+             elc_params.far_cut2 &&
+         q <= n_scycache;
          q++) {
-      auto const omega = c_2pi * sqrt(Utils::sqr(ux * p) + Utils::sqr(uy * q));
+      auto const omega = c_2pi * sqrt(Utils::sqr(ux * static_cast<double>(p)) +
+                                      Utils::sqr(uy * static_cast<double>(q)));
       setup_PQ(p, q, omega, particles);
       distribute(8);
-      eng += PQ_energy(omega, n_localpart);
+      energy += PQ_energy(omega, n_localpart);
     }
   }
   /* we count both i<->j and j<->i, so return just half of it */
-  return 0.5 * eng;
+  return 0.5 * energy;
 }
 
 int ELC_tune(double error) {
@@ -1089,8 +971,10 @@ int ELC_tune(double error) {
     lz = elc_params.h + elc_params.space_layer;
   }
 
-  if (h < 0)
+  if (h < 0) {
+    runtimeErrorMsg() << "gap size too large";
     return ES_ERROR;
+  }
 
   elc_params.far_cut = min_inv_boxl;
 
@@ -1109,8 +993,10 @@ int ELC_tune(double error) {
 
     elc_params.far_cut += min_inv_boxl;
   } while (err > error && elc_params.far_cut < MAXIMAL_FAR_CUT);
-  if (elc_params.far_cut >= MAXIMAL_FAR_CUT)
+  if (elc_params.far_cut >= MAXIMAL_FAR_CUT) {
+    runtimeErrorMsg() << "maxPWerror too small";
     return ES_ERROR;
+  }
   elc_params.far_cut -= min_inv_boxl;
   elc_params.far_cut2 = Utils::sqr(elc_params.far_cut);
 
@@ -1192,18 +1078,9 @@ void ELC_init() {
 
   if (elc_params.far_calculated && (elc_params.dielectric_contrast_on)) {
     if (ELC_tune(elc_params.maxPWerror) == ES_ERROR) {
-      runtimeErrorMsg() << "ELC auto-retuning failed, gap size too small";
+      runtimeErrorMsg() << "ELC auto-retuning failed";
     }
   }
-  if (elc_params.dielectric_contrast_on) {
-    p3m.params.additional_mesh[0] = 0;
-    p3m.params.additional_mesh[1] = 0;
-    p3m.params.additional_mesh[2] = elc_params.space_layer;
-  } else {
-    p3m.params.additional_mesh[0] = 0;
-    p3m.params.additional_mesh[1] = 0;
-    p3m.params.additional_mesh[2] = 0;
-  }
 }
 
 int ELC_set_params(double maxPWerror, double gap_size, double far_cut,
@@ -1248,7 +1125,7 @@ int ELC_set_params(double maxPWerror, double gap_size, double far_cut,
 
   ELC_setup_constants();
 
-  Coulomb::elc_sanity_check();
+  int error_code = Coulomb::elc_sanity_check();
 
   p3m.params.epsilon = P3M_EPSILON_METALLIC;
   coulomb.method = COULOMB_ELC_P3M;
@@ -1260,12 +1137,12 @@ int ELC_set_params(double maxPWerror, double gap_size, double far_cut,
   } else {
     elc_params.far_calculated = true;
     if (ELC_tune(elc_params.maxPWerror) == ES_ERROR) {
-      runtimeErrorMsg() << "ELC tuning failed, gap size too small";
+      error_code = ES_ERROR;
     }
   }
   mpi_bcast_coulomb_params();
 
-  return ES_OK;
+  return error_code;
 }
 
 ////////////////////////////////////////////////////////////////////////////////////
diff --git a/src/core/electrostatics_magnetostatics/icc.cpp b/src/core/electrostatics_magnetostatics/icc.cpp
index 149f5ffc84f..64324a2d0e4 100644
--- a/src/core/electrostatics_magnetostatics/icc.cpp
+++ b/src/core/electrostatics_magnetostatics/icc.cpp
@@ -50,10 +50,10 @@
 #include <cstdlib>
 #include <tuple>
 
-iccp3m_struct iccp3m_cfg;
+icc_struct icc_cfg;
 
-void init_forces_iccp3m(const ParticleRange &particles,
-                        const ParticleRange &ghosts_particles);
+void init_forces_icc(const ParticleRange &particles,
+                     const ParticleRange &ghosts_particles);
 
 /** Calculate the electrostatic forces between source charges (= real charges)
  *  and wall charges. For each electrostatic method, the proper functions
@@ -61,15 +61,15 @@ void init_forces_iccp3m(const ParticleRange &particles,
  *  directly, short-range parts need helper functions according to the particle
  *  data organisation. This is a modified version of \ref force_calc.
  */
-void force_calc_iccp3m(const ParticleRange &particles,
-                       const ParticleRange &ghost_particles);
+void force_calc_icc(const ParticleRange &particles,
+                    const ParticleRange &ghost_particles);
 
 /** Variant of @ref add_non_bonded_pair_force where only %Coulomb
  *  contributions are calculated
  */
-inline void add_non_bonded_pair_force_iccp3m(Particle &p1, Particle &p2,
-                                             Utils::Vector3d const &d,
-                                             double dist, double dist2) {
+inline void add_non_bonded_pair_force_icc(Particle &p1, Particle &p2,
+                                          Utils::Vector3d const &d, double dist,
+                                          double dist2) {
   auto forces = Coulomb::pair_force(p1, p2, d, dist);
 
   p1.f.f += std::get<0>(forces);
@@ -79,90 +79,74 @@ inline void add_non_bonded_pair_force_iccp3m(Particle &p1, Particle &p2,
   p2.f.f += std::get<2>(forces);
 #endif
 }
+void icc_iteration(const ParticleRange &particles,
+                   const ParticleRange &ghost_particles) {
+  if (icc_cfg.n_icc == 0)
+    return;
 
-void iccp3m_alloc_lists() {
-  auto const n_ic = iccp3m_cfg.n_ic;
-
-  iccp3m_cfg.areas.resize(n_ic);
-  iccp3m_cfg.ein.resize(n_ic);
-  iccp3m_cfg.normals.resize(n_ic);
-  iccp3m_cfg.sigma.resize(n_ic);
-}
-
-int iccp3m_iteration(const ParticleRange &particles,
-                     const ParticleRange &ghost_particles) {
-  if (iccp3m_cfg.n_ic == 0)
-    return 0;
-
-  Coulomb::iccp3m_sanity_check();
-
-  if (iccp3m_cfg.eout <= 0) {
-    runtimeErrorMsg()
-        << "ICCP3M: nonpositive dielectric constant is not allowed.";
-  }
+  Coulomb::icc_sanity_check();
 
   auto const pref = 1.0 / (coulomb.prefactor * 2 * Utils::pi());
-  iccp3m_cfg.citeration = 0;
+  icc_cfg.citeration = 0;
 
-  double globalmax = 1e100;
+  double globalmax = 0.;
 
-  for (int j = 0; j < iccp3m_cfg.num_iteration; j++) {
-    double hmax = 0.;
+  for (int j = 0; j < icc_cfg.num_iteration; j++) {
+    double charge_density_max = 0.;
 
-    force_calc_iccp3m(particles, ghost_particles); /* Calculate electrostatic
+    force_calc_icc(particles, ghost_particles); /* Calculate electrostatic
                             forces (SR+LR) excluding source source interaction*/
     cell_structure.ghosts_reduce_forces();
 
     double diff = 0;
 
     for (auto &p : particles) {
-      if (p.p.identity < iccp3m_cfg.n_ic + iccp3m_cfg.first_id &&
-          p.p.identity >= iccp3m_cfg.first_id) {
-        auto const id = p.p.identity - iccp3m_cfg.first_id;
+      if (p.p.identity < icc_cfg.n_icc + icc_cfg.first_id &&
+          p.p.identity >= icc_cfg.first_id) {
+        auto const id = p.p.identity - icc_cfg.first_id;
         /* the dielectric-related prefactor: */
-        auto const del_eps = (iccp3m_cfg.ein[id] - iccp3m_cfg.eout) /
-                             (iccp3m_cfg.ein[id] + iccp3m_cfg.eout);
+        auto const del_eps =
+            (icc_cfg.ein[id] - icc_cfg.eout) / (icc_cfg.ein[id] + icc_cfg.eout);
         /* calculate the electric field at the certain position */
-        auto const E = p.f.f / p.p.q + iccp3m_cfg.ext_field;
+        auto const local_e_field = p.f.f / p.p.q + icc_cfg.ext_field;
 
-        if (E[0] == 0 && E[1] == 0 && E[2] == 0) {
+        if (local_e_field.norm2() == 0) {
           runtimeErrorMsg()
-              << "ICCP3M found zero electric field on a charge. This must "
+              << "ICC found zero electric field on a charge. This must "
                  "never happen";
         }
 
-        /* recalculate the old charge density */
-        auto const hold = p.p.q / iccp3m_cfg.areas[id];
-        /* determine if it is higher than the previously highest charge
-         * density */
-        hmax = std::max(hmax, std::abs(hold));
-
-        auto const f1 = del_eps * pref * (E * iccp3m_cfg.normals[id]);
-        auto const f2 = (not iccp3m_cfg.sigma.empty())
-                            ? (2 * iccp3m_cfg.eout) /
-                                  (iccp3m_cfg.eout + iccp3m_cfg.ein[id]) *
-                                  (iccp3m_cfg.sigma[id])
-                            : 0.;
+        auto const charge_density_old = p.p.q / icc_cfg.areas[id];
+
+        charge_density_max =
+            std::max(charge_density_max, std::abs(charge_density_old));
+
+        auto const charge_density_update =
+            del_eps * pref * (local_e_field * icc_cfg.normals[id]) +
+            2 * icc_cfg.eout / (icc_cfg.eout + icc_cfg.ein[id]) *
+                icc_cfg.sigma[id];
         /* relative variation: never use an estimator which can be negative
          * here */
-        auto const hnew =
-            (1. - iccp3m_cfg.relax) * hold + (iccp3m_cfg.relax) * (f1 + f2);
+        auto const charge_density_new =
+            (1. - icc_cfg.relax) * charge_density_old +
+            (icc_cfg.relax) * charge_density_update;
 
         /* Take the largest error to check for convergence */
         auto const relative_difference =
-            std::abs(1 * (hnew - hold) / (hmax + std::abs(hnew + hold)));
+            std::abs((charge_density_new - charge_density_old) /
+                     (charge_density_max +
+                      std::abs(charge_density_new + charge_density_old)));
 
         diff = std::max(diff, relative_difference);
 
-        p.p.q = hnew * iccp3m_cfg.areas[id];
+        p.p.q = charge_density_new * icc_cfg.areas[id];
 
         /* check if the charge now is more than 1e6, to determine if ICC still
-         * leads to reasonable results */
-        /* this is kind of an arbitrary measure but does a good job spotting
-         * divergence! */
+         * leads to reasonable results. This is kind of an arbitrary measure
+         * but does a good job spotting divergence! */
         if (std::abs(p.p.q) > 1e6) {
           runtimeErrorMsg()
-              << "too big charge assignment in iccp3m! q >1e6 , assigned "
+              << "too big charge assignment in icc! q >1e6 , assigned "
                  "charge= "
               << p.p.q;
 
@@ -174,62 +158,118 @@ int iccp3m_iteration(const ParticleRange &particles,
     /* Update charges on ghosts. */
     cell_structure.ghosts_update(Cells::DATA_PART_PROPERTIES);
 
-    iccp3m_cfg.citeration++;
+    icc_cfg.citeration++;
 
-    MPI_Allreduce(&diff, &globalmax, 1, MPI_DOUBLE, MPI_MAX, comm_cart);
+    boost::mpi::all_reduce(comm_cart, diff, globalmax,
+                           boost::mpi::maximum<double>());
 
-    if (globalmax < iccp3m_cfg.convergence)
+    if (globalmax < icc_cfg.convergence)
       break;
   } /* iteration */
 
-  if (globalmax > iccp3m_cfg.convergence) {
+  if (globalmax > icc_cfg.convergence) {
     runtimeErrorMsg()
         << "ICC failed to converge in the given number of maximal steps.";
   }
 
   on_particle_charge_change();
-
-  return iccp3m_cfg.citeration;
 }
 
-void force_calc_iccp3m(const ParticleRange &particles,
-                       const ParticleRange &ghost_particles) {
-  init_forces_iccp3m(particles, ghost_particles);
+void force_calc_icc(const ParticleRange &particles,
+                    const ParticleRange &ghost_particles) {
+  init_forces_icc(particles, ghost_particles);
 
-  cell_structure.non_bonded_loop([](Particle &p1, Particle &p2,
-                                    Distance const &d) {
-    /* calc non-bonded interactions */
-    add_non_bonded_pair_force_iccp3m(p1, p2, d.vec21, sqrt(d.dist2), d.dist2);
-  });
+  cell_structure.non_bonded_loop(
+      [](Particle &p1, Particle &p2, Distance const &d) {
+        /* calc non-bonded interactions */
+        add_non_bonded_pair_force_icc(p1, p2, d.vec21, sqrt(d.dist2), d.dist2);
+      });
 
   Coulomb::calc_long_range_force(particles);
 }
 
-void init_forces_iccp3m(const ParticleRange &particles,
-                        const ParticleRange &ghosts_particles) {
+void init_forces_icc(const ParticleRange &particles,
+                     const ParticleRange &ghosts_particles) {
   for (auto &p : particles) {
-    p.f = ParticleForce{};
+    p.f.f = {};
   }
 
   for (auto &p : ghosts_particles) {
-    p.f = ParticleForce{};
+    p.f.f = {};
   }
 }
 
-void mpi_iccp3m_init_local(const iccp3m_struct &iccp3m_cfg_) {
-  iccp3m_cfg = iccp3m_cfg_;
+void mpi_icc_init_local(const icc_struct &icc_cfg_) {
+  icc_cfg = icc_cfg_;
 
   on_particle_charge_change();
   check_runtime_errors(comm_cart);
 }
 
-REGISTER_CALLBACK(mpi_iccp3m_init_local)
+REGISTER_CALLBACK(mpi_icc_init_local)
 
-int mpi_iccp3m_init() {
-  mpi_call(mpi_iccp3m_init_local, iccp3m_cfg);
+int mpi_icc_init() {
+  mpi_call(mpi_icc_init_local, icc_cfg);
 
   on_particle_charge_change();
   return check_runtime_errors(comm_cart);
 }
 
+void icc_set_params(int n_icc, double convergence, double relaxation,
+                    Utils::Vector3d &ext_field, int max_iterations,
+                    int first_id, double eps_out, std::vector<double> &areas,
+                    std::vector<double> &e_in, std::vector<double> &sigma,
+                    std::vector<Utils::Vector3d> &normals) {
+  if (n_icc < 0)
+    throw std::runtime_error("ICC: invalid number of particles. " +
+                             std::to_string(n_icc));
+  if (convergence <= 0)
+    throw std::runtime_error("ICC: invalid convergence value. " +
+                             std::to_string(convergence));
+  if (relaxation < 0 or relaxation > 2)
+    throw std::runtime_error("ICC: invalid relaxation value. " +
+                             std::to_string(relaxation));
+  if (max_iterations <= 0)
+    throw std::runtime_error("ICC: invalid max_iterations. " +
+                             std::to_string(max_iterations));
+  if (first_id < 0)
+    throw std::runtime_error("ICC: invalid first_id. " +
+                             std::to_string(first_id));
+  if (eps_out <= 0)
+    throw std::runtime_error("ICC: invalid eps_out. " +
+                             std::to_string(eps_out));
+  if (areas.size() != static_cast<std::size_t>(n_icc))
+    throw std::runtime_error("ICC: invalid areas vector.");
+  if (e_in.size() != static_cast<std::size_t>(n_icc))
+    throw std::runtime_error("ICC: invalid e_in vector.");
+  if (sigma.size() != static_cast<std::size_t>(n_icc))
+    throw std::runtime_error("ICC: invalid sigma vector.");
+  if (normals.size() != static_cast<std::size_t>(n_icc))
+    throw std::runtime_error("ICC: invalid normals vector.");
+  // every argument passed validation: commit them to the global icc_cfg
+  icc_cfg.n_icc = n_icc;
+  icc_cfg.convergence = convergence;
+  icc_cfg.relax = relaxation;
+  icc_cfg.ext_field = ext_field;
+  icc_cfg.num_iteration = max_iterations;
+  icc_cfg.first_id = first_id;
+  icc_cfg.eout = eps_out;
+  // the per-particle arrays are moved: the caller's vectors are moved-from
+  icc_cfg.areas = std::move(areas);
+  icc_cfg.ein = std::move(e_in);
+  icc_cfg.sigma = std::move(sigma);
+  icc_cfg.normals = std::move(normals);
+  // propagate the updated configuration via mpi_icc_init()
+  mpi_icc_init();
+}
+
+void icc_deactivate() {
+  // n_icc == 0 makes icc_iteration() return before doing any work
+  icc_cfg.n_icc = 0;
+  icc_cfg.sigma.clear();
+  icc_cfg.normals.clear();
+  icc_cfg.ein.clear();
+  icc_cfg.areas.clear();
+  mpi_icc_init();
+}
 #endif
diff --git a/src/core/electrostatics_magnetostatics/icc.hpp b/src/core/electrostatics_magnetostatics/icc.hpp
index 12f31942e6c..317fd66ee60 100644
--- a/src/core/electrostatics_magnetostatics/icc.hpp
+++ b/src/core/electrostatics_magnetostatics/icc.hpp
@@ -20,22 +20,18 @@
  */
 /** \file
  *
- *  ICCP3M is a method that allows to take into account the influence
+ *  ICC is a method that allows to take into account the influence
  *  of arbitrarily shaped dielectric interfaces. The dielectric
  *  properties of a dielectric medium in the bulk of the simulation
  *  box are taken into account by reproducing the jump in the electric
  *  field at the interface with charge surface segments. The charge
  *  density of the surface segments have to be determined
- *  self-consistently using an iterative scheme. It can at present -
- *  despite its name - be used with P3M, ELCP3M and MMM1D. For
- *  details see: @cite tyagi10a
+ *  self-consistently using an iterative scheme. It can at present
+ *  be used with P3M, ELCP3M and MMM1D. For details see: @cite tyagi10a
  *
- *  To set up ICCP3M, first the dielectric boundary has to be modeled
- *  by ESPResSo particles 0..n where n has to be passed as a parameter
- *  to ICCP3M. This is still a bit inconvenient, as it forces the user
- *  to reserve the first n particle ids to wall charges, but as the
- *  other parts of ESPResSo do not suffer from a limitation like this,
- *  it can be tolerated.
+ *  To set up ICC, first the dielectric boundary has to be modeled
+ *  by n ESPResSo particles with consecutive ids starting at n_0, where
+ *  n_0 and n have to be passed as parameters to ICC.
  *
  *  For the determination of the induced charges only the forces
  *  acting on the induced charges has to be determined. As P3M and the
@@ -45,8 +41,8 @@
  *  particle data organisation schemes this is performed differently.
  */
 
-#ifndef CORE_ICCP3M_HPP
-#define CORE_ICCP3M_HPP
+#ifndef CORE_ICC_HPP
+#define CORE_ICC_HPP
 
 #include "config.hpp"
 
@@ -58,25 +54,36 @@
 #include <algorithm>
 #include <vector>
 
-/** ICCP3M data structure */
-struct iccp3m_struct {
-  int n_ic;                  /**< Last induced id (cannot be smaller than 2) */
-  int num_iteration = 30;    /**< Number of max iterations                   */
-  double eout = 1;           /**< Dielectric constant of the bulk            */
-  std::vector<double> areas; /**< Array of area of the grid elements         */
-  std::vector<double> ein;   /**< Array of dielectric constants at each surface
-                                  element */
-  std::vector<double> sigma; /**< Surface charge density */
-  double convergence = 1e-2; /**< Convergence criterion */
-  std::vector<Utils::Vector3d> normals;  /**< Surface normal vectors */
-  Utils::Vector3d ext_field = {0, 0, 0}; /**< External field */
-  double relax = 0.7; /**< relaxation parameter for iteration */
-  int citeration = 0; /**< current number of iterations */
-  int first_id = 0; /**< id of the first particle in the dielectric boundary */
+/** ICC data structure */
+struct icc_struct {
+  /** number of ICC particles */
+  int n_icc;
+  /** maximum number of iterations */
+  int num_iteration = 30;
+  /** bulk dielectric constant */
+  double eout;
+  /** areas of the particles */
+  std::vector<double> areas;
+  /** dielectric constants of the particles */
+  std::vector<double> ein;
+  /** surface charge density of the particles */
+  std::vector<double> sigma;
+  /** convergence criterion */
+  double convergence = 1e-2;
+  /** surface normal vectors */
+  std::vector<Utils::Vector3d> normals;
+  /** external electric field */
+  Utils::Vector3d ext_field = {0, 0, 0};
+  /** relaxation parameter */
+  double relax;
+  /** last number of iterations */
+  int citeration = 0;
+  /** first ICC particle id */
+  int first_id = 0;
 
   template <typename Archive>
   void serialize(Archive &ar, long int /* version */) {
-    ar &n_ic;
+    ar &n_icc;
     ar &num_iteration;
     ar &first_id;
     ar &convergence;
@@ -90,27 +97,32 @@ struct iccp3m_struct {
     ar &citeration;
   }
 };
-extern iccp3m_struct iccp3m_cfg; /**< Global state of the ICCP3M solver */
+
+/** ICC parameters */
+extern icc_struct icc_cfg;
 
 /** The main iterative scheme, where the surface element charges are calculated
  *  self-consistently.
  */
-int iccp3m_iteration(const ParticleRange &particles,
-                     const ParticleRange &ghost_particles);
+void icc_iteration(const ParticleRange &particles,
+                   const ParticleRange &ghost_particles);
 
-/** The allocation of ICCP3M lists for python interface
+/** Perform ICC initialization.
+ *  @return non-zero value on error
  */
-void iccp3m_alloc_lists();
+int mpi_icc_init();
 
-/** check sanity of parameters for use with ICCP3M
+/** Validate and set ICC parameters (throws std::runtime_error if invalid)
  */
-int iccp3m_sanity_check();
+void icc_set_params(int n_icc, double convergence, double relaxation,
+                    Utils::Vector3d &ext_field, int max_iterations,
+                    int first_id, double eps_out, std::vector<double> &areas,
+                    std::vector<double> &e_in, std::vector<double> &sigma,
+                    std::vector<Utils::Vector3d> &normals);
 
-/** Perform iccp3m initialization.
- *  @return non-zero value on error
+/** Deactivate ICC: reset n_icc to zero and empty the ICC parameter vectors
  */
-int mpi_iccp3m_init();
+void icc_deactivate();
 
 #endif /* ELECTROSTATICS */
-
-#endif /* ICCP3M_H */
+#endif /* CORE_ICC_HPP */
diff --git a/src/core/electrostatics_magnetostatics/magnetic_non_p3m_methods.cpp b/src/core/electrostatics_magnetostatics/magnetic_non_p3m_methods.cpp
index 957022c3c69..b59c1e79e21 100644
--- a/src/core/electrostatics_magnetostatics/magnetic_non_p3m_methods.cpp
+++ b/src/core/electrostatics_magnetostatics/magnetic_non_p3m_methods.cpp
@@ -34,9 +34,17 @@
 #include "grid.hpp"
 
 #include <utils/constants.hpp>
-
-double calc_dipole_dipole_ia(Particle &p1, Utils::Vector3d const &dip1,
-                             Particle &p2, bool force_flag) {
+#include <utils/math/sqr.hpp>
+
+/**
+ * Calculate dipolar energy and optionally force between two particles.
+ * @param[in,out] p1          First particle
+ * @param[in]     dip1        Cached dipole moment of the first particle
+ * @param[in,out] p2          Second particle
+ * @param[in]     force_flag  If true, update the particle forces and torques
+ */
+static double calc_dipole_dipole_ia(Particle &p1, Utils::Vector3d const &dip1,
+                                    Particle &p2, bool force_flag) {
 
   // Cache dipole moment
   auto const dip2 = p2.calc_dip();
@@ -58,7 +66,7 @@ double calc_dipole_dipole_ia(Particle &p1, Utils::Vector3d const &dip1,
   auto const pe4 = 3.0 / r5;
 
   // Energy
-  auto const u = dipole.prefactor * (pe1 / r3 - pe4 * pe2 * pe3);
+  auto const energy = dipole.prefactor * (pe1 / r3 - pe4 * pe2 * pe3);
 
   // Forces, if requested
   if (force_flag) {
@@ -81,8 +89,7 @@ double calc_dipole_dipole_ia(Particle &p1, Utils::Vector3d const &dip1,
     p2.f.torque += dipole.prefactor * (aa / r3 + b2 * dd);
   }
 
-  // Return energy
-  return u;
+  return energy;
 }
 
 /* =============================================================================
@@ -93,23 +100,12 @@ double calc_dipole_dipole_ia(Particle &p1, Utils::Vector3d const &dip1,
 double dawaanr_calculations(bool force_flag, bool energy_flag,
                             const ParticleRange &particles) {
 
-  if (n_nodes != 1) {
-    fprintf(stderr, "error: DAWAANR is just for one cpu...\n");
-    errexit();
-  }
-  if (!(force_flag) && !(energy_flag)) {
-    fprintf(stderr, "I don't know why you call dawaanr_calculations() "
-                    "with all flags zero.\n");
-    return 0;
-  }
-
-  // Variable to sum up the energy
-  double u = 0;
-
-  auto parts = particles;
+  assert(n_nodes == 1);
+  assert(force_flag || energy_flag);
 
-  // Iterate over all cells
-  for (auto it = parts.begin(), end = parts.end(); it != end; ++it) {
+  double energy = 0.0;
+  // Iterate over all particles
+  for (auto it = particles.begin(), end = particles.end(); it != end; ++it) {
     // If the particle has no dipole moment, ignore it
     if (it->p.dipm == 0.0)
       continue;
@@ -123,12 +119,11 @@ double dawaanr_calculations(bool force_flag, bool energy_flag,
       if (jt->p.dipm == 0.0)
         continue;
       // Calculate energy and/or force between the particles
-      u += calc_dipole_dipole_ia(*it, dip1, *jt, force_flag);
+      energy += calc_dipole_dipole_ia(*it, dip1, *jt, force_flag);
     }
   }
 
-  // Return energy
-  return u;
+  return energy;
 }
 
 /* =============================================================================
@@ -148,6 +143,9 @@ double
 magnetic_dipolar_direct_sum_calculations(bool force_flag, bool energy_flag,
                                          ParticleRange const &particles) {
 
+  assert(n_nodes == 1);
+  assert(force_flag || energy_flag);
+
   if (box_geo.periodic(0) and box_geo.periodic(1) and box_geo.periodic(2) and
       Ncut_off_magnetic_dipolar_direct_sum == 0) {
     throw std::runtime_error("Dipolar direct sum with replica does not support "
@@ -158,17 +156,6 @@ magnetic_dipolar_direct_sum_calculations(bool force_flag, bool energy_flag,
   std::vector<double> mx, my, mz;
   std::vector<double> fx, fy, fz;
   std::vector<double> tx, ty, tz;
-  double u;
-
-  if (n_nodes != 1) {
-    fprintf(stderr, "error: magnetic Direct Sum is just for one cpu...\n");
-    errexit();
-  }
-  if (!(force_flag) && !(energy_flag)) {
-    fprintf(stderr, "I don't know why you call magnetic_dipolar_direct_sum_"
-                    "calculations() with all flags zero\n");
-    return 0;
-  }
 
   auto const n_part = particles.size();
 
@@ -217,111 +204,98 @@ magnetic_dipolar_direct_sum_calculations(bool force_flag, bool energy_flag,
     }
   }
 
-  /*now we do the calculations */
-
-  { /* beginning of the area of calculation */
-    int NCUT[3], NCUT2;
-
-    for (int i = 0; i < 3; i++) {
-      NCUT[i] = Ncut_off_magnetic_dipolar_direct_sum;
-      if (box_geo.periodic(i) == 0) {
-        NCUT[i] = 0;
-      }
-    }
-    NCUT2 = Ncut_off_magnetic_dipolar_direct_sum *
-            Ncut_off_magnetic_dipolar_direct_sum;
-
-    u = 0;
-
-    for (int i = 0; i < dip_particles; i++) {
-      for (int j = 0; j < dip_particles; j++) {
-        auto const pe1 = mx[i] * mx[j] + my[i] * my[j] + mz[i] * mz[j];
-        auto const rx = x[i] - x[j];
-        auto const ry = y[i] - y[j];
-        auto const rz = z[i] - z[j];
-
-        for (int nx = -NCUT[0]; nx <= NCUT[0]; nx++) {
-          auto const rnx = rx + nx * box_geo.length()[0];
-          auto const rnx2 = rnx * rnx;
-          for (int ny = -NCUT[1]; ny <= NCUT[1]; ny++) {
-            auto const rny = ry + ny * box_geo.length()[1];
-            auto const rny2 = rny * rny;
-            for (int nz = -NCUT[2]; nz <= NCUT[2]; nz++) {
-              if (!(i == j && nx == 0 && ny == 0 && nz == 0)) {
-                if (nx * nx + ny * ny + nz * nz <= NCUT2) {
-                  auto const rnz = rz + nz * box_geo.length()[2];
-                  auto const r2 = rnx2 + rny2 + rnz * rnz;
-                  auto const r = sqrt(r2);
-                  auto const r3 = r2 * r;
-                  auto const r5 = r3 * r2;
-                  auto const r7 = r5 * r2;
-
-                  auto const pe2 = mx[i] * rnx + my[i] * rny + mz[i] * rnz;
-                  auto const pe3 = mx[j] * rnx + my[j] * rny + mz[j] * rnz;
-
-                  // Energy ............................
-
-                  u += pe1 / r3 - 3.0 * pe2 * pe3 / r5;
-
-                  if (force_flag) {
-                    double a, b, c, d;
-                    // force ............................
-                    a = mx[i] * mx[j] + my[i] * my[j] + mz[i] * mz[j];
-                    a = 3.0 * a / r5;
-                    b = -15.0 * pe2 * pe3 / r7;
-                    c = 3.0 * pe3 / r5;
-                    d = 3.0 * pe2 / r5;
-
-                    fx[i] += (a + b) * rnx + c * mx[i] + d * mx[j];
-                    fy[i] += (a + b) * rny + c * my[i] + d * my[j];
-                    fz[i] += (a + b) * rnz + c * mz[i] + d * mz[j];
-
-                    // torque ............................
-                    c = 3.0 / r5 * pe3;
-                    auto const ax = my[i] * mz[j] - my[j] * mz[i];
-                    auto const ay = mx[j] * mz[i] - mx[i] * mz[j];
-                    auto const az = mx[i] * my[j] - mx[j] * my[i];
-
-                    auto const bx = my[i] * rnz - rny * mz[i];
-                    auto const by = rnx * mz[i] - mx[i] * rnz;
-                    auto const bz = mx[i] * rny - rnx * my[i];
-
-                    tx[i] += -ax / r3 + bx * c;
-                    ty[i] += -ay / r3 + by * c;
-                    tz[i] += -az / r3 + bz * c;
-                  } /* of force_flag  */
-                }
-              } /* of nx*nx+ny*ny +nz*nz< NCUT*NCUT   and   !(i==j && nx==0 &&
-                   ny==0 && nz==0) */
-            }   /* of  for nz */
-          }     /* of  for ny  */
-        }       /* of  for nx  */
-      }
-    } /* of  j and i  */
-  }   /* end of the area of calculation */
+  /* energy, force and torque calculation */
+  double energy = 0.;
 
-  /* set the forces, and torques of the particles within ESPResSo */
+  int NCUT[3];
+  for (int i = 0; i < 3; i++) {
+    NCUT[i] = box_geo.periodic(i) ? Ncut_off_magnetic_dipolar_direct_sum : 0;
+  }
+  auto const NCUT2 = Utils::sqr(Ncut_off_magnetic_dipolar_direct_sum);
+
+  for (int i = 0; i < dip_particles; i++) {
+    for (int j = 0; j < dip_particles; j++) {
+      auto const pe1 = mx[i] * mx[j] + my[i] * my[j] + mz[i] * mz[j];
+      auto const rx = x[i] - x[j];
+      auto const ry = y[i] - y[j];
+      auto const rz = z[i] - z[j];
+
+      for (int nx = -NCUT[0]; nx <= NCUT[0]; nx++) {
+        auto const rnx = rx + nx * box_geo.length()[0];
+        auto const rnx2 = rnx * rnx;
+        for (int ny = -NCUT[1]; ny <= NCUT[1]; ny++) {
+          auto const rny = ry + ny * box_geo.length()[1];
+          auto const rny2 = rny * rny;
+          for (int nz = -NCUT[2]; nz <= NCUT[2]; nz++) {
+            if (!(i == j && nx == 0 && ny == 0 && nz == 0) and
+                (nx * nx + ny * ny + nz * nz <= NCUT2)) {
+              auto const rnz = rz + nz * box_geo.length()[2];
+              auto const r2 = rnx2 + rny2 + rnz * rnz;
+              auto const r = sqrt(r2);
+              auto const r3 = r2 * r;
+              auto const r5 = r3 * r2;
+              auto const r7 = r5 * r2;
+
+              auto const pe2 = mx[i] * rnx + my[i] * rny + mz[i] * rnz;
+              auto const pe3 = mx[j] * rnx + my[j] * rny + mz[j] * rnz;
+              auto const pe4 = 3.0 / r5;
+
+              // Energy
+              energy += pe1 / r3 - pe4 * pe2 * pe3;
+
+              if (force_flag) {
+                // Forces
+                auto const a = pe4 * pe1;
+                auto const b = -15.0 * pe2 * pe3 / r7;
+                auto const c = pe4 * pe3;
+                auto const d = pe4 * pe2;
+
+                fx[i] += (a + b) * rnx + c * mx[i] + d * mx[j];
+                fy[i] += (a + b) * rny + c * my[i] + d * my[j];
+                fz[i] += (a + b) * rnz + c * mz[i] + d * mz[j];
+
+                // Torques
+                auto const ax = my[i] * mz[j] - my[j] * mz[i];
+                auto const ay = mx[j] * mz[i] - mx[i] * mz[j];
+                auto const az = mx[i] * my[j] - mx[j] * my[i];
+
+                auto const bx = my[i] * rnz - rny * mz[i];
+                auto const by = rnx * mz[i] - mx[i] * rnz;
+                auto const bz = mx[i] * rny - rnx * my[i];
+
+                tx[i] += -ax / r3 + bx * c;
+                ty[i] += -ay / r3 + by * c;
+                tz[i] += -az / r3 + bz * c;
+              } /* if force_flag  */
+            }   /* if distance criterion */
+          }     /* for nz */
+        }       /* for ny */
+      }         /* for nx */
+    }           /* for j */
+  }             /* for i */
+
+  /* update particle forces and torques */
   if (force_flag) {
 
-    int dip_particles2 = 0;
+    dip_particles = 0;
 
     for (auto &p : particles) {
       if (p.p.dipm != 0.0) {
 
-        p.f.f[0] += dipole.prefactor * fx[dip_particles2];
-        p.f.f[1] += dipole.prefactor * fy[dip_particles2];
-        p.f.f[2] += dipole.prefactor * fz[dip_particles2];
+        p.f.f[0] += dipole.prefactor * fx[dip_particles];
+        p.f.f[1] += dipole.prefactor * fy[dip_particles];
+        p.f.f[2] += dipole.prefactor * fz[dip_particles];
 
-        p.f.torque[0] += dipole.prefactor * tx[dip_particles2];
-        p.f.torque[1] += dipole.prefactor * ty[dip_particles2];
-        p.f.torque[2] += dipole.prefactor * tz[dip_particles2];
+        p.f.torque[0] += dipole.prefactor * tx[dip_particles];
+        p.f.torque[1] += dipole.prefactor * ty[dip_particles];
+        p.f.torque[2] += dipole.prefactor * tz[dip_particles];
 
-        dip_particles2++;
+        dip_particles++;
       }
     }
-  } /*of if force_flag */
+  } /* if force_flag */
 
-  return 0.5 * dipole.prefactor * u;
+  return 0.5 * dipole.prefactor * energy;
 }
 
 int dawaanr_set_params() {
diff --git a/src/core/electrostatics_magnetostatics/magnetic_non_p3m_methods.hpp b/src/core/electrostatics_magnetostatics/magnetic_non_p3m_methods.hpp
index 4eaacaf7435..f594b5c72ca 100644
--- a/src/core/electrostatics_magnetostatics/magnetic_non_p3m_methods.hpp
+++ b/src/core/electrostatics_magnetostatics/magnetic_non_p3m_methods.hpp
@@ -42,9 +42,6 @@
 #include "Particle.hpp"
 #include "ParticleRange.hpp"
 
-/** Calculate dipolar energy and/or force between two particles */
-double calc_dipole_dipole_ia(Particle &p1, Particle &p2, bool force_flag);
-
 /* =============================================================================
                   DAWAANR => DIPOLAR ALL WITH ALL AND NO REPLICA
    =============================================================================
diff --git a/src/core/electrostatics_magnetostatics/p3m-common.cpp b/src/core/electrostatics_magnetostatics/p3m-common.cpp
index 0bf10fc1a61..abbfffb0e58 100644
--- a/src/core/electrostatics_magnetostatics/p3m-common.cpp
+++ b/src/core/electrostatics_magnetostatics/p3m-common.cpp
@@ -115,14 +115,17 @@ double p3m_analytic_cotangent_sum(int n, double mesh_i, int cao) {
 
 void p3m_calc_local_ca_mesh(p3m_local_mesh &local_mesh,
                             const P3MParameters &params,
-                            const LocalBox<double> &local_geo, double skin) {
+                            const LocalBox<double> &local_geo, double skin,
+                            double space_layer) {
   int i;
   int ind[3];
   /* total skin size */
   double full_skin[3];
 
   for (i = 0; i < 3; i++)
-    full_skin[i] = params.cao_cut[i] + skin + params.additional_mesh[i];
+    full_skin[i] = params.cao_cut[i] + skin;
+
+  full_skin[2] += space_layer;
 
   /* inner left down grid point (global index) */
   for (i = 0; i < 3; i++)
diff --git a/src/core/electrostatics_magnetostatics/p3m-common.hpp b/src/core/electrostatics_magnetostatics/p3m-common.hpp
index 7e002994c95..26e9fbaf3e5 100644
--- a/src/core/electrostatics_magnetostatics/p3m-common.hpp
+++ b/src/core/electrostatics_magnetostatics/p3m-common.hpp
@@ -147,14 +147,11 @@ typedef struct {
   /** number of points unto which a single charge is interpolated, i.e.
    *  p3m.cao^3 */
   int cao3 = 0;
-  /** additional points around the charge assignment mesh, for method like
-   *  dielectric ELC creating virtual charges. */
-  double additional_mesh[3] = {};
 
   template <typename Archive> void serialize(Archive &ar, long int) {
     ar &tuning &alpha_L &r_cut_iL &mesh;
     ar &mesh_off &cao &accuracy &epsilon &cao_cut;
-    ar &a &ai &alpha &r_cut &cao3 &additional_mesh;
+    ar &a &ai &alpha &r_cut &cao3;
   }
 
 } P3MParameters;
@@ -186,7 +183,8 @@ double p3m_analytic_cotangent_sum(int n, double mesh_i, int cao);
  */
 void p3m_calc_local_ca_mesh(p3m_local_mesh &local_mesh,
                             const P3MParameters &params,
-                            const LocalBox<double> &local_geo, double skin);
+                            const LocalBox<double> &local_geo, double skin,
+                            double space_layer);
 
 /** Calculate the spatial position of the left down mesh
  *  point of the local mesh, to be stored in
diff --git a/src/core/electrostatics_magnetostatics/p3m-dipolar.cpp b/src/core/electrostatics_magnetostatics/p3m-dipolar.cpp
index 02bbce6992a..d2fe0cef479 100644
--- a/src/core/electrostatics_magnetostatics/p3m-dipolar.cpp
+++ b/src/core/electrostatics_magnetostatics/p3m-dipolar.cpp
@@ -70,6 +70,7 @@
 #include <array>
 #include <cstdio>
 #include <functional>
+#include <stdexcept>
 #include <vector>
 
 /************************************************
@@ -199,7 +200,7 @@ void dp3m_init() {
    * and the cutoff for charge assignment dp3m.params.cao_cut */
   dp3m_init_a_ai_cao_cut();
 
-  p3m_calc_local_ca_mesh(dp3m.local_mesh, dp3m.params, local_geo, skin);
+  p3m_calc_local_ca_mesh(dp3m.local_mesh, dp3m.params, local_geo, skin, 0.0);
 
   dp3m.sm.resize(comm_cart, dp3m.local_mesh);
 
@@ -225,8 +226,7 @@ void dp3m_init() {
  * functions related to the parsing & tuning of the dipolar parameters
  ******************/
 
-void dp3m_set_tune_params(double r_cut, int mesh, int cao, double alpha,
-                          double accuracy) {
+void dp3m_set_tune_params(double r_cut, int mesh, int cao, double accuracy) {
   if (r_cut >= 0) {
     dp3m.params.r_cut = r_cut;
     dp3m.params.r_cut_iL = r_cut / box_geo.length()[0];
@@ -238,75 +238,68 @@ void dp3m_set_tune_params(double r_cut, int mesh, int cao, double alpha,
   if (cao >= 0)
     dp3m.params.cao = cao;
 
-  if (alpha >= 0) {
-    dp3m.params.alpha = alpha;
-    dp3m.params.alpha_L = alpha * box_geo.length()[0];
-  }
-
   if (accuracy >= 0)
     dp3m.params.accuracy = accuracy;
 }
 
 /*****************************************************************************/
 
-int dp3m_set_params(double r_cut, int mesh, int cao, double alpha,
-                    double accuracy) {
-  if (dipole.method != DIPOLAR_P3M && dipole.method != DIPOLAR_MDLC_P3M)
-    Dipole::set_method_local(DIPOLAR_P3M);
-
+void dp3m_set_params(double r_cut, int mesh, int cao, double alpha,
+                     double accuracy) {
   if (r_cut < 0)
-    return -1;
+    throw std::runtime_error("DipolarP3M: invalid r_cut");
 
   if (mesh < 0)
-    return -2;
+    throw std::runtime_error("DipolarP3M: invalid mesh size");
+
+  if (cao < 1 || cao > 7)
+    throw std::runtime_error("DipolarP3M: invalid cao");
 
-  if (cao < 1 || cao > 7 || cao > mesh)
-    return -3;
+  if (cao > mesh)
+    throw std::runtime_error("DipolarP3M: cao larger than mesh size");
+
+  if (alpha <= 0.0 && alpha != -1.0)
+    throw std::runtime_error("DipolarP3M: invalid alpha");
+
+  if (accuracy <= 0.0 && accuracy != -1.0)
+    throw std::runtime_error("DipolarP3M: invalid accuracy");
+
+  if (dipole.method != DIPOLAR_P3M && dipole.method != DIPOLAR_MDLC_P3M)
+    Dipole::set_method_local(DIPOLAR_P3M);
 
   dp3m.params.r_cut = r_cut;
   dp3m.params.r_cut_iL = r_cut / box_geo.length()[0];
   dp3m.params.mesh[2] = dp3m.params.mesh[1] = dp3m.params.mesh[0] = mesh;
   dp3m.params.cao = cao;
-
-  if (alpha > 0) {
-    dp3m.params.alpha = alpha;
-    dp3m.params.alpha_L = alpha * box_geo.length()[0];
-  } else if (alpha != -1.0)
-    return -4;
-
-  if (accuracy >= 0)
-    dp3m.params.accuracy = accuracy;
-  else if (accuracy != -1.0)
-    return -5;
+  dp3m.params.alpha = alpha;
+  dp3m.params.alpha_L = alpha * box_geo.length()[0];
+  dp3m.params.accuracy = accuracy;
 
   mpi_bcast_coulomb_params();
-
-  return 0;
 }
 
-int dp3m_set_mesh_offset(double x, double y, double z) {
+void dp3m_set_mesh_offset(double x, double y, double z) {
+  if (x == -1.0 && y == -1.0 && z == -1.0)
+    return;
+
   if (x < 0.0 || x > 1.0 || y < 0.0 || y > 1.0 || z < 0.0 || z > 1.0)
-    return ES_ERROR;
+    throw std::runtime_error("DipolarP3M: invalid mesh offset");
 
   dp3m.params.mesh_off[0] = x;
   dp3m.params.mesh_off[1] = y;
   dp3m.params.mesh_off[2] = z;
 
   mpi_bcast_coulomb_params();
-
-  return ES_OK;
 }
 
 /** We left the handling of the epsilon, due to portability reasons in
  *  the future for the electrical dipoles, or if people want to do
  *  electrical dipoles alone using the magnetic code. Currently unused.
  */
-int dp3m_set_eps(double eps) {
+void dp3m_set_eps(double eps) {
   dp3m.params.epsilon = eps;
 
   mpi_bcast_coulomb_params();
-
-  return ES_OK;
 }
 
 namespace {
diff --git a/src/core/electrostatics_magnetostatics/p3m-dipolar.hpp b/src/core/electrostatics_magnetostatics/p3m-dipolar.hpp
index fc262d361ca..660842d10be 100644
--- a/src/core/electrostatics_magnetostatics/p3m-dipolar.hpp
+++ b/src/core/electrostatics_magnetostatics/p3m-dipolar.hpp
@@ -60,11 +60,11 @@ struct dp3m_data_struct : public p3m_data_struct_base {
 
   /** local mesh. */
   p3m_local_mesh local_mesh;
-  /** real space mesh (local) for CA/FFT.*/
+  /** real space mesh (local) for CA/FFT. */
   fft_vector<double> rs_mesh;
-  /** real space mesh (local) for CA/FFT of the dipolar field.*/
+  /** real space mesh (local) for CA/FFT of the dipolar field. */
   std::array<fft_vector<double>, 3> rs_mesh_dip;
-  /** k-space mesh (local) for k-space calculation and FFT.*/
+  /** k-space mesh (local) for k-space calculation and FFT. */
   std::vector<double> ks_mesh;
 
   /** number of dipolar particles (only on master node). */
@@ -72,7 +72,7 @@ struct dp3m_data_struct : public p3m_data_struct_base {
   /** Sum of square of magnetic dipoles (only on master node). */
   double sum_mu2;
 
-  /** position shift for calc. of first assignment mesh point. */
+  /** position shift for calculation of first assignment mesh point. */
   double pos_shift;
 
   p3m_interpolation_cache inter_weights;
@@ -80,7 +80,7 @@ struct dp3m_data_struct : public p3m_data_struct_base {
   /** send/recv mesh sizes */
   p3m_send_mesh sm;
 
-  /* Stores the value of the energy correction due to MS effects */
+  /** value of the energy correction due to MS effects */
   double energy_correction;
 
   fft_data_struct fft;
@@ -90,18 +90,17 @@ struct dp3m_data_struct : public p3m_data_struct_base {
 extern dp3m_data_struct dp3m;
 
 /** @copydoc p3m_set_tune_params */
-void dp3m_set_tune_params(double r_cut, int mesh, int cao, double alpha,
-                          double accuracy);
+void dp3m_set_tune_params(double r_cut, int mesh, int cao, double accuracy);
 
 /** @copydoc p3m_set_params */
-int dp3m_set_params(double r_cut, int mesh, int cao, double alpha,
-                    double accuracy);
+void dp3m_set_params(double r_cut, int mesh, int cao, double alpha,
+                     double accuracy);
 
 /** @copydoc p3m_set_mesh_offset */
-int dp3m_set_mesh_offset(double x, double y, double z);
+void dp3m_set_mesh_offset(double x, double y, double z);
 
 /** @copydoc p3m_set_eps */
-int dp3m_set_eps(double eps);
+void dp3m_set_eps(double eps);
 
 /** Initialize all structures, parameters and arrays needed for the
  *  P3M algorithm for dipole-dipole interactions.
@@ -115,8 +114,6 @@ void dp3m_scaleby_box_l();
 bool dp3m_sanity_checks(const Utils::Vector3i &grid);
 
 /** Assign the physical dipoles using the tabulated assignment function.
- *  If Dstore_ca_frac is true, then the charge fractions are buffered in
- *  Dcur_ca_fmp and Dcur_ca_frac.
  */
 void dp3m_dipole_assign(const ParticleRange &particles);
 
diff --git a/src/core/electrostatics_magnetostatics/p3m.cpp b/src/core/electrostatics_magnetostatics/p3m.cpp
index 2a9b2d5bd66..58eb290e686 100644
--- a/src/core/electrostatics_magnetostatics/p3m.cpp
+++ b/src/core/electrostatics_magnetostatics/p3m.cpp
@@ -66,6 +66,7 @@
 #include <cstddef>
 #include <cstdio>
 #include <functional>
+#include <stdexcept>
 
 using Utils::sinc;
 
@@ -184,7 +185,13 @@ void p3m_init() {
     return;
   }
 
-  p3m_calc_local_ca_mesh(p3m.local_mesh, p3m.params, local_geo, skin);
+  double elc_layer = 0.0;
+  if (coulomb.method == COULOMB_ELC_P3M) {
+    elc_layer = elc_params.space_layer;
+  }
+
+  p3m_calc_local_ca_mesh(p3m.local_mesh, p3m.params, local_geo, skin,
+                         elc_layer);
 
   p3m.sm.resize(comm_cart, p3m.local_mesh);
 
@@ -205,7 +212,7 @@ void p3m_init() {
   p3m_count_charged_particles();
 }
 
-void p3m_set_tune_params(double r_cut, const int mesh[3], int cao, double alpha,
+void p3m_set_tune_params(double r_cut, const int mesh[3], int cao,
                          double accuracy) {
   if (r_cut >= 0) {
     p3m.params.r_cut = r_cut;
@@ -224,29 +231,33 @@ void p3m_set_tune_params(double r_cut, const int mesh[3], int cao, double alpha,
   if (cao >= 0)
     p3m.params.cao = cao;
 
-  if (alpha >= 0) {
-    p3m.params.alpha = alpha;
-    p3m.params.alpha_L = alpha * box_geo.length()[0];
-  }
-
   if (accuracy >= 0)
     p3m.params.accuracy = accuracy;
 }
 
-int p3m_set_params(double r_cut, const int *mesh, int cao, double alpha,
-                   double accuracy) {
-  if (coulomb.method != COULOMB_P3M && coulomb.method != COULOMB_ELC_P3M &&
-      coulomb.method != COULOMB_P3M_GPU)
-    coulomb.method = COULOMB_P3M;
-
+void p3m_set_params(double r_cut, const int *mesh, int cao, double alpha,
+                    double accuracy) {
   if (r_cut < 0)
-    return -1;
+    throw std::runtime_error("P3M: invalid r_cut");
+
+  if (mesh[0] < 0 || mesh[1] < 0 || mesh[2] < 0)
+    throw std::runtime_error("P3M: invalid mesh size");
+
+  if (cao < 1 || cao > 7)
+    throw std::runtime_error("P3M: invalid cao");
+
+  if (cao > mesh[0] || cao > mesh[1] || cao > mesh[2])
+    throw std::runtime_error("P3M: cao larger than mesh size");
 
-  if ((mesh[0] < 0) || (mesh[1] < 0) || (mesh[2] < 0))
-    return -2;
+  if (alpha <= 0.0 && alpha != -1.0)
+    throw std::runtime_error("P3M: invalid alpha");
 
-  if (cao < 1 || cao > 7 || cao > mesh[0] || cao > mesh[1] || cao > mesh[2])
-    return -3;
+  if (accuracy <= 0.0 && accuracy != -1.0)
+    throw std::runtime_error("P3M: invalid accuracy");
+
+  if (coulomb.method != COULOMB_P3M && coulomb.method != COULOMB_ELC_P3M &&
+      coulomb.method != COULOMB_P3M_GPU)
+    coulomb.method = COULOMB_P3M;
 
   p3m.params.r_cut = r_cut;
   p3m.params.r_cut_iL = r_cut * (1. / box_geo.length()[0]);
@@ -254,42 +265,31 @@ int p3m_set_params(double r_cut, const int *mesh, int cao, double alpha,
   p3m.params.mesh[1] = mesh[1];
   p3m.params.mesh[0] = mesh[0];
   p3m.params.cao = cao;
-
-  if (alpha > 0) {
-    p3m.params.alpha = alpha;
-    p3m.params.alpha_L = alpha * box_geo.length()[0];
-  } else if (alpha != -1.0)
-    return -4;
-
-  if (accuracy >= 0)
-    p3m.params.accuracy = accuracy;
-  else if (accuracy != -1.0)
-    return -5;
+  p3m.params.alpha = alpha;
+  p3m.params.alpha_L = alpha * box_geo.length()[0];
+  p3m.params.accuracy = accuracy;
 
   mpi_bcast_coulomb_params();
-
-  return 0;
 }
 
-int p3m_set_mesh_offset(double x, double y, double z) {
+void p3m_set_mesh_offset(double x, double y, double z) {
+  if (x == -1.0 && y == -1.0 && z == -1.0)
+    return;
+
   if (x < 0.0 || x > 1.0 || y < 0.0 || y > 1.0 || z < 0.0 || z > 1.0)
-    return ES_ERROR;
+    throw std::runtime_error("P3M: invalid mesh offset");
 
   p3m.params.mesh_off[0] = x;
   p3m.params.mesh_off[1] = y;
   p3m.params.mesh_off[2] = z;
 
   mpi_bcast_coulomb_params();
-
-  return ES_OK;
 }
 
-int p3m_set_eps(double eps) {
+void p3m_set_eps(double eps) {
   p3m.params.epsilon = eps;
 
   mpi_bcast_coulomb_params();
-
-  return ES_OK;
 }
 
 namespace {
diff --git a/src/core/electrostatics_magnetostatics/p3m.hpp b/src/core/electrostatics_magnetostatics/p3m.hpp
index 4433af82a7b..56859930bb5 100644
--- a/src/core/electrostatics_magnetostatics/p3m.hpp
+++ b/src/core/electrostatics_magnetostatics/p3m.hpp
@@ -61,9 +61,9 @@ struct p3m_data_struct : public p3m_data_struct_base {
 
   /** local mesh. */
   p3m_local_mesh local_mesh;
-  /** real space mesh (local) for CA/FFT.*/
+  /** real space mesh (local) for CA/FFT. */
   fft_vector<double> rs_mesh;
-  /** mesh (local) for the electric field.*/
+  /** mesh (local) for the electric field. */
   std::array<fft_vector<double>, 3> E_mesh;
 
   /** number of charged particles (only on master node). */
@@ -197,10 +197,9 @@ inline void p3m_add_pair_force(double q1q2, Utils::Vector3d const &d,
  *  @param[in]  r_cut        @copybrief P3MParameters::r_cut
  *  @param[in]  mesh         @copybrief P3MParameters::mesh
  *  @param[in]  cao          @copybrief P3MParameters::cao
- *  @param[in]  alpha        @copybrief P3MParameters::alpha
  *  @param[in]  accuracy     @copybrief P3MParameters::accuracy
  */
-void p3m_set_tune_params(double r_cut, const int mesh[3], int cao, double alpha,
+void p3m_set_tune_params(double r_cut, const int mesh[3], int cao,
                          double accuracy);
 
 /** Set custom parameters
@@ -210,23 +209,22 @@ void p3m_set_tune_params(double r_cut, const int mesh[3], int cao, double alpha,
  *  @param[in]  cao          @copybrief P3MParameters::cao
  *  @param[in]  alpha        @copybrief P3MParameters::alpha
  *  @param[in]  accuracy     @copybrief P3MParameters::accuracy
- *  @return Custom error code
  */
-int p3m_set_params(double r_cut, const int *mesh, int cao, double alpha,
-                   double accuracy);
+void p3m_set_params(double r_cut, const int *mesh, int cao, double alpha,
+                    double accuracy);
 
 /** Set mesh offset
  *
  *  @param[in]  x , y , z  Components of @ref P3MParameters::mesh_off
  *                         "mesh_off"
  */
-int p3m_set_mesh_offset(double x, double y, double z);
+void p3m_set_mesh_offset(double x, double y, double z);
 
 /** Set @ref P3MParameters::epsilon "epsilon" parameter
  *
  *  @param[in]  eps          @copybrief P3MParameters::epsilon
  */
-int p3m_set_eps(double eps);
+void p3m_set_eps(double eps);
 
 /** Calculate real space contribution of Coulomb pair energy. */
 inline double p3m_pair_energy(double chgfac, double dist) {
diff --git a/src/core/electrostatics_magnetostatics/p3m_gpu_cuda.cu b/src/core/electrostatics_magnetostatics/p3m_gpu_cuda.cu
index 63516a28146..72c2ac6a687 100644
--- a/src/core/electrostatics_magnetostatics/p3m_gpu_cuda.cu
+++ b/src/core/electrostatics_magnetostatics/p3m_gpu_cuda.cu
@@ -54,7 +54,7 @@
 #include "BoxGeometry.hpp"
 #include "EspressoSystemInterface.hpp"
 #include "cuda_interface.hpp"
-#include "cuda_utils.hpp"
+#include "cuda_utils.cuh"
 #include "electrostatics_magnetostatics/coulomb.hpp"
 #include "global.hpp"
 
@@ -68,6 +68,7 @@
 
 #include <cstdio>
 #include <cstdlib>
+#include <stdexcept>
 
 #if defined(OMPI_MPI_H) || defined(_MPI_H)
 #error CU-file includes mpi.h! This should not happen!
@@ -404,7 +405,7 @@ void assign_charges(const CUDA_particle_data *const pdata, const P3MGpuData p) {
   default:
     break;
   }
-  _cuda_check_errors(block, grid, "assign_charge", __FILE__, __LINE__);
+  cuda_check_errors_exit(block, grid, "assign_charge", __FILE__, __LINE__);
 }
 
 template <int cao, bool shared>
@@ -549,7 +550,7 @@ void assign_forces(const CUDA_particle_data *const pdata, const P3MGpuData p,
   default:
     break;
   }
-  _cuda_check_errors(block, grid, "assign_forces", __FILE__, __LINE__);
+  cuda_check_errors_exit(block, grid, "assign_forces", __FILE__, __LINE__);
 }
 
 /* Init the internal data structures of the P3M GPU.
@@ -559,6 +560,9 @@ void assign_forces(const CUDA_particle_data *const pdata, const P3MGpuData p,
  * is (cuFFT convention) Nx x Ny x [ Nz /2 + 1 ].
  */
 void p3m_gpu_init(int cao, const int mesh[3], double alpha) {
+  if (mesh[0] == -1 && mesh[1] == -1 && mesh[2] == -1)
+    throw std::runtime_error("P3M: invalid mesh size");
+
   espressoSystemInterface.requestParticleStructGpu();
 
   bool reinit_if = false, mesh_changed = false;
diff --git a/src/core/electrostatics_magnetostatics/p3m_gpu_error_cuda.cu b/src/core/electrostatics_magnetostatics/p3m_gpu_error_cuda.cu
index dae80628d23..8015e624d18 100644
--- a/src/core/electrostatics_magnetostatics/p3m_gpu_error_cuda.cu
+++ b/src/core/electrostatics_magnetostatics/p3m_gpu_error_cuda.cu
@@ -25,7 +25,7 @@
 
 #include "p3m_gpu_error.hpp"
 
-#include "cuda_utils.hpp"
+#include "cuda_utils.cuh"
 
 #include <utils/math/int_pow.hpp>
 #include <utils/math/sinc.hpp>
diff --git a/src/core/event.cpp b/src/core/event.cpp
index 08f5af1372d..4e8f7f6a369 100644
--- a/src/core/event.cpp
+++ b/src/core/event.cpp
@@ -32,6 +32,7 @@
 #include "config.hpp"
 #include "cuda_init.hpp"
 #include "cuda_interface.hpp"
+#include "cuda_utils.hpp"
 #include "electrostatics_magnetostatics/coulomb.hpp"
 #include "electrostatics_magnetostatics/dipole.hpp"
 #include "errorhandling.hpp"
@@ -55,6 +56,8 @@
 
 #include <utils/mpi/all_compare.hpp>
 
+#include <cstdio>
+
 #include <mpi.h>
 
 /** whether the thermostat has to be reinitialized before integration */
@@ -78,7 +81,11 @@ static int reinit_magnetostatics = false;
 void on_program_start() {
 #ifdef CUDA
   if (this_node == 0) {
-    cuda_init();
+    try {
+      cuda_init();
+    } catch (cuda_runtime_error const &err) {
+      // best effort: continue startup after a CUDA init error (TODO confirm)
+    }
   }
 #endif
 
@@ -87,11 +94,8 @@ void on_program_start() {
   /* initially go for domain decomposition */
   cells_re_init(CELL_STRUCTURE_DOMDEC);
 
-  /*
-    call all initializations to do only on the master node here.
-  */
   if (this_node == 0) {
-    /* interaction_data.c: make sure 0<->0 ia always exists */
+    /* make sure interaction 0<->0 always exists */
     make_particle_type_exist(0);
   }
 }
@@ -161,20 +165,20 @@ void on_observable_calc() {
     Coulomb::on_observable_calc();
     reinit_electrostatics = false;
   }
-#endif /*ifdef ELECTROSTATICS */
+#endif /* ELECTROSTATICS */
 
 #ifdef DIPOLES
   if (reinit_magnetostatics) {
     Dipole::on_observable_calc();
     reinit_magnetostatics = false;
   }
-#endif /*ifdef ELECTROSTATICS */
+#endif /* DIPOLES */
 
 #ifdef ELECTROKINETICS
   if (ek_initialized) {
     ek_integrate_electrostatics();
   }
-#endif
+#endif /* ELECTROKINETICS */
 
   clear_particle_node();
 }
@@ -236,10 +240,10 @@ void on_lbboundary_change() {
 void on_boxl_change() {
   grid_changed_box_l(box_geo);
   /* Electrostatics cutoffs mostly depend on the system size,
-     therefore recalculate them. */
+   * therefore recalculate them. */
   cells_re_init(cell_structure.decomposition_type());
 
-/* Now give methods a chance to react to the change in box length */
+  /* Now give methods a chance to react to the change in box length */
 #ifdef ELECTROSTATICS
   Coulomb::on_boxl_change();
 #endif
@@ -257,9 +261,9 @@ void on_boxl_change() {
 void on_cell_structure_change() {
   clear_particle_node();
 
-/* Now give methods a chance to react to the change in cell
-   structure. Most ES methods need to reinitialize, as they depend
-   on skin, node grid and so on. */
+  /* Now give methods a chance to react to the change in cell
+   * structure. Most ES methods need to reinitialize, as they depend
+   * on skin, node grid and so on. */
 #ifdef ELECTROSTATICS
   Coulomb::init();
 #endif /* ifdef ELECTROSTATICS */
diff --git a/src/core/forces.cpp b/src/core/forces.cpp
index d1ea3de495f..a7f9d0d9fe2 100644
--- a/src/core/forces.cpp
+++ b/src/core/forces.cpp
@@ -96,7 +96,7 @@ void force_calc(CellStructure &cell_structure, double time_step) {
   auto particles = cell_structure.local_particles();
   auto ghost_particles = cell_structure.ghost_particles();
 #ifdef ELECTROSTATICS
-  iccp3m_iteration(particles, cell_structure.ghost_particles());
+  icc_iteration(particles, cell_structure.ghost_particles());
 #endif
   init_forces(particles, time_step);
 
diff --git a/src/core/grid_based_algorithms/electrokinetics.hpp b/src/core/grid_based_algorithms/electrokinetics.hpp
index a86b29a9b79..00110e1e9af 100644
--- a/src/core/grid_based_algorithms/electrokinetics.hpp
+++ b/src/core/grid_based_algorithms/electrokinetics.hpp
@@ -142,7 +142,6 @@ void ek_integrate();
 void ek_integrate_electrostatics();
 void ek_print_parameters();
 void ek_print_lbpar();
-void lb_set_ek_pointer(EK_parameters *pointeradress);
 unsigned int ek_calculate_boundary_mass();
 int ek_print_vtk_density(int species, char *filename);
 int ek_print_vtk_flux(int species, char *filename);
diff --git a/src/core/grid_based_algorithms/electrokinetics_cuda.cu b/src/core/grid_based_algorithms/electrokinetics_cuda.cu
index f618dd4351c..d1ae96921a0 100644
--- a/src/core/grid_based_algorithms/electrokinetics_cuda.cu
+++ b/src/core/grid_based_algorithms/electrokinetics_cuda.cu
@@ -25,7 +25,7 @@
 #include "grid_based_algorithms/electrokinetics.hpp"
 
 #include "cuda_interface.hpp"
-#include "cuda_utils.hpp"
+#include "cuda_utils.cuh"
 #include "errorhandling.hpp"
 #include "fd-electrostatics.cuh"
 #include "grid_based_algorithms/lb_boundaries.hpp"
@@ -36,7 +36,6 @@
 
 #include <utils/math/int_pow.hpp>
 #include <utils/math/sqr.hpp>
-#include <utils/memory.hpp>
 
 #include <thrust/device_ptr.h>
 #include <thrust/functional.h>
@@ -50,17 +49,14 @@
 #include <iostream>
 #include <sstream>
 #include <string>
+#include <vector>
 
 #if defined(OMPI_MPI_H) || defined(_MPI_H)
 #error CU-file includes mpi.h! This should not happen!
 #endif
 
-/* TODO: get rid of this code duplication with lb-boundaries.h by solving the
-         cuda-mpi incompatibility */
-
 extern ActiveLB lattice_switch;
 extern bool ek_initialized;
-EK_parameters *lb_ek_parameters_gpu;
 
 // Used to limit register use for the pressure calculation
 #define EK_LINK_U00_pressure 0
@@ -73,9 +69,8 @@ EK_parameters *lb_ek_parameters_gpu;
 #ifdef EK_BOUNDARIES
 void LBBoundaries::lb_init_boundaries();
 #endif
-/* end of code duplication */
 
-#define PI_FLOAT 3.14159265358979323846f
+static constexpr unsigned int threads_per_block = 64;
 
 EK_parameters ek_parameters = {
     // agrid
@@ -156,6 +151,10 @@ EK_parameters ek_parameters = {
     nullptr,
     // lb_force_density_previous
     nullptr,
+#ifdef EK_DEBUG
+    // j_fluc
+    nullptr,
+#endif
     // rho
     {nullptr},
     // species_index
@@ -176,7 +175,6 @@ EK_parameters ek_parameters = {
 
 __device__ __constant__ EK_parameters ek_parameters_gpu[1];
 ekfloat *charge_gpu;
-EK_parameters *ek_parameters_gpu_pointer;
 LB_parameters_gpu *ek_lbparameters_gpu;
 CUDA_particle_data *particle_data_gpu;
 float *ek_lb_boundary_force;
@@ -255,26 +253,6 @@ __device__ unsigned int rhoindex_cartesian2linear_padded(unsigned int x,
          y * ek_parameters_gpu->dim_x_padded + x;
 }
 
-__device__ void jindex_linear2cartesian(unsigned int index, unsigned int *coord,
-                                        unsigned int *c) {
-
-  coord[0] = index % ek_parameters_gpu->dim_x;
-  index /= ek_parameters_gpu->dim_x;
-  coord[1] = index % ek_parameters_gpu->dim_y;
-  index /= ek_parameters_gpu->dim_y;
-  coord[2] = index % ek_parameters_gpu->dim_z;
-  *c = index / ek_parameters_gpu->dim_z;
-}
-
-__device__ unsigned int jindex_cartesian2linear(unsigned int x, unsigned int y,
-                                                unsigned int z,
-                                                unsigned int c) {
-
-  return c * ek_parameters_gpu->number_of_nodes +
-         z * ek_parameters_gpu->dim_y * ek_parameters_gpu->dim_x +
-         y * ek_parameters_gpu->dim_x + x;
-}
-
 // TODO fluxindex fastest running might improve caching
 __device__ unsigned int jindex_getByRhoLinear(unsigned int rho_index,
                                               unsigned int c) {
@@ -291,8 +269,9 @@ __device__ void ek_displacement(float *dx, LB_nodes_gpu n,
 
   float mode[19];
 
-  for (int i = 0; i < 19; i++) {
-    mode[i] = n.vd[i * ek_lbparameters_gpu->number_of_nodes + node_index];
+  for (unsigned i = 0; i < 19; i++) {
+    mode[i] =
+        n.populations[i * ek_lbparameters_gpu->number_of_nodes + node_index];
   }
 
   rho += mode[0] + mode[1] + mode[2] + mode[3] + mode[4] + mode[5] + mode[6] +
@@ -1171,14 +1150,14 @@ ek_add_advection_to_flux(unsigned int index, unsigned int *neighborindex,
                          LB_node_force_density_gpu node_f, LB_nodes_gpu lb_node,
                          LB_parameters_gpu *ek_lbparameters_gpu) {
   float dx[3];
-  int di[3];
+  unsigned int di[3];
   unsigned int node;
 
   ek_displacement(dx, lb_node, index, ek_lbparameters_gpu);
 
-  di[0] = 1 - signbit(dx[0]);
-  di[1] = 1 - signbit(dx[1]);
-  di[2] = 1 - signbit(dx[2]);
+  di[0] = 1 - static_cast<unsigned>(signbit(dx[0]));
+  di[1] = 1 - static_cast<unsigned>(signbit(dx[1]));
+  di[2] = 1 - static_cast<unsigned>(signbit(dx[2]));
 
   dx[0] = fabs(dx[0]);
   dx[1] = fabs(dx[1]);
@@ -1194,8 +1173,7 @@ ek_add_advection_to_flux(unsigned int index, unsigned int *neighborindex,
           ek_parameters_gpu->dim_x,
       coord[1], coord[2]);
 
-  target_node[0] = (coord[0] + 2 * static_cast<unsigned>(di[0]) - 1 +
-                    ek_parameters_gpu->dim_x) %
+  target_node[0] = (coord[0] + 2 * di[0] - 1 + ek_parameters_gpu->dim_x) %
                    ek_parameters_gpu->dim_x;
   target_node[1] = coord[1];
   target_node[2] = coord[2];
@@ -1218,8 +1196,7 @@ ek_add_advection_to_flux(unsigned int index, unsigned int *neighborindex,
       coord[2]);
 
   target_node[0] = coord[0];
-  target_node[1] = (coord[1] + 2 * static_cast<unsigned>(di[1]) - 1 +
-                    ek_parameters_gpu->dim_y) %
+  target_node[1] = (coord[1] + 2 * di[1] - 1 + ek_parameters_gpu->dim_y) %
                    ek_parameters_gpu->dim_y;
   target_node[2] = coord[2];
   target_node_index =
@@ -1240,8 +1217,7 @@ ek_add_advection_to_flux(unsigned int index, unsigned int *neighborindex,
 
   target_node[0] = coord[0];
   target_node[1] = coord[1];
-  target_node[2] = (coord[2] + 2 * static_cast<unsigned>(di[2]) - 1 +
-                    ek_parameters_gpu->dim_z) %
+  target_node[2] = (coord[2] + 2 * di[2] - 1 + ek_parameters_gpu->dim_z) %
                    ek_parameters_gpu->dim_z;
   target_node_index =
       rhoindex_cartesian2linear(target_node[0], target_node[1], target_node[2]);
@@ -1262,11 +1238,9 @@ ek_add_advection_to_flux(unsigned int index, unsigned int *neighborindex,
           ek_parameters_gpu->dim_z);
 
   target_node[0] = coord[0];
-  target_node[1] = (coord[1] + 2 * static_cast<unsigned>(di[1]) - 1 +
-                    ek_parameters_gpu->dim_y) %
+  target_node[1] = (coord[1] + 2 * di[1] - 1 + ek_parameters_gpu->dim_y) %
                    ek_parameters_gpu->dim_y;
-  target_node[2] = (coord[2] + 2 * static_cast<unsigned>(di[2]) - 1 +
-                    ek_parameters_gpu->dim_z) %
+  target_node[2] = (coord[2] + 2 * di[2] - 1 + ek_parameters_gpu->dim_z) %
                    ek_parameters_gpu->dim_z;
   target_node_index =
       rhoindex_cartesian2linear(target_node[0], target_node[1], target_node[2]);
@@ -1288,12 +1262,10 @@ ek_add_advection_to_flux(unsigned int index, unsigned int *neighborindex,
       (coord[2] + (1 - di[0]) * (2 * di[2] - 1) + ek_parameters_gpu->dim_z) %
           ek_parameters_gpu->dim_z);
 
-  target_node[0] = (coord[0] + 2 * static_cast<unsigned>(di[0]) - 1 +
-                    ek_parameters_gpu->dim_x) %
+  target_node[0] = (coord[0] + 2 * di[0] - 1 + ek_parameters_gpu->dim_x) %
                    ek_parameters_gpu->dim_x;
   target_node[1] = coord[1];
-  target_node[2] = (coord[2] + 2 * static_cast<unsigned>(di[2]) - 1 +
-                    ek_parameters_gpu->dim_z) %
+  target_node[2] = (coord[2] + 2 * di[2] - 1 + ek_parameters_gpu->dim_z) %
                    ek_parameters_gpu->dim_z;
   target_node_index =
       rhoindex_cartesian2linear(target_node[0], target_node[1], target_node[2]);
@@ -1315,11 +1287,9 @@ ek_add_advection_to_flux(unsigned int index, unsigned int *neighborindex,
           ek_parameters_gpu->dim_y,
       coord[2]);
 
-  target_node[0] = (coord[0] + 2 * static_cast<unsigned>(di[0]) - 1 +
-                    ek_parameters_gpu->dim_x) %
+  target_node[0] = (coord[0] + 2 * di[0] - 1 + ek_parameters_gpu->dim_x) %
                    ek_parameters_gpu->dim_x;
-  target_node[1] = (coord[1] + 2 * static_cast<unsigned>(di[1]) - 1 +
-                    ek_parameters_gpu->dim_y) %
+  target_node[1] = (coord[1] + 2 * di[1] - 1 + ek_parameters_gpu->dim_y) %
                    ek_parameters_gpu->dim_y;
   target_node[2] = coord[2];
   target_node_index =
@@ -1343,14 +1313,11 @@ ek_add_advection_to_flux(unsigned int index, unsigned int *neighborindex,
       (coord[2] + (1 - di[0]) * (2 * di[2] - 1) + ek_parameters_gpu->dim_z) %
           ek_parameters_gpu->dim_z);
 
-  target_node[0] = (coord[0] + 2 * static_cast<unsigned>(di[0]) - 1 +
-                    ek_parameters_gpu->dim_x) %
+  target_node[0] = (coord[0] + 2 * di[0] - 1 + ek_parameters_gpu->dim_x) %
                    ek_parameters_gpu->dim_x;
-  target_node[1] = (coord[1] + 2 * static_cast<unsigned>(di[1]) - 1 +
-                    ek_parameters_gpu->dim_y) %
+  target_node[1] = (coord[1] + 2 * di[1] - 1 + ek_parameters_gpu->dim_y) %
                    ek_parameters_gpu->dim_y;
-  target_node[2] = (coord[2] + 2 * static_cast<unsigned>(di[2]) - 1 +
-                    ek_parameters_gpu->dim_z) %
+  target_node[2] = (coord[2] + 2 * di[2] - 1 + ek_parameters_gpu->dim_z) %
                    ek_parameters_gpu->dim_z;
   target_node_index =
       rhoindex_cartesian2linear(target_node[0], target_node[1], target_node[2]);
@@ -1360,9 +1327,9 @@ ek_add_advection_to_flux(unsigned int index, unsigned int *neighborindex,
   atomicAdd(&ek_parameters_gpu->j[jindex_getByRhoLinear(
                 node, (1 - di[0]) * (EK_LINK_UUU + 2 * di[1] + di[2]) +
                           di[0] * (EK_LINK_UDD - 2 * di[1] - di[2]))],
-            static_cast<float>(2 * di[0] - 1) *
+            (2 * static_cast<ekfloat>(di[0]) - 1) *
                 ek_parameters_gpu->rho[species_index][index] * dx[0] * dx[1] *
-                dx[2] * static_cast<float>(not_boundary));
+                dx[2] * static_cast<ekfloat>(not_boundary));
 }
 
 __device__ float4 ek_random_wrapper_philox(unsigned int index,
@@ -1405,7 +1372,7 @@ __device__ void ek_add_fluctuations_to_flux(unsigned int index,
 #endif
     float fluc = 0.0f;
 
-    for (int i = 0; i < 9; i++) {
+    for (unsigned i = 0; i < 9; i++) {
 
       if (i % 4 == 0) {
         random_floats = ek_random_wrapper_philox(index, i + 40, philox_counter);
@@ -1804,7 +1771,7 @@ __global__ void ek_apply_boundaries(unsigned int species_index,
           (coord[2] + 1) % ek_parameters_gpu->dim_z);
 
       /* Clear fluxes on links connecting a boundary node */
-      for (int i = 0; i < 13; i++)
+      for (unsigned i = 0; i < 13; i++)
         ek_parameters_gpu->j[jindex_getByRhoLinear(index, i)] = 0.0f;
 
       ek_parameters_gpu->j[jindex_getByRhoLinear(
@@ -1841,7 +1808,7 @@ __global__ void ek_clear_fluxes() {
   unsigned int index = ek_getThreadIndex();
 
   if (index < ek_parameters_gpu->number_of_nodes) {
-    for (int i = 0; i < 13; i++) {
+    for (unsigned i = 0; i < 13; i++) {
       ek_parameters_gpu->j[jindex_getByRhoLinear(index, i)] = 0.0f;
 #ifdef EK_DEBUG
       ek_parameters_gpu->j_fluc[jindex_getByRhoLinear(index, i)] = 0.0f;
@@ -1883,32 +1850,31 @@ ek_gather_particle_charge_density(CUDA_particle_data *particle_data,
                                   size_t number_of_particles,
                                   LB_parameters_gpu *ek_lbparameters_gpu) {
   unsigned int index = ek_getThreadIndex();
-  int lowernode[3];
+  unsigned int lowernode[3];
   float cellpos[3];
   float gridpos;
 
   if (index < number_of_particles) {
+    // floor in signed arithmetic: gridpos is negative when p < agrid / 2,
+    // and converting a negative float to unsigned is undefined behavior
     gridpos = particle_data[index].p[0] / ek_parameters_gpu->agrid - 0.5f;
-    lowernode[0] = (int)floorf(gridpos);
-    cellpos[0] = gridpos - static_cast<float>(lowernode[0]);
+    lowernode[0] = static_cast<unsigned>(static_cast<int>(floorf(gridpos)));
+    cellpos[0] = gridpos - floorf(gridpos);
 
     gridpos = particle_data[index].p[1] / ek_parameters_gpu->agrid - 0.5f;
-    lowernode[1] = (int)floorf(gridpos);
-    cellpos[1] = gridpos - static_cast<float>(lowernode[1]);
+    lowernode[1] = static_cast<unsigned>(static_cast<int>(floorf(gridpos)));
+    cellpos[1] = gridpos - floorf(gridpos);
 
     gridpos = particle_data[index].p[2] / ek_parameters_gpu->agrid - 0.5f;
-    lowernode[2] = (int)floorf(gridpos);
-    cellpos[2] = gridpos - static_cast<float>(lowernode[2]);
+    lowernode[2] = static_cast<unsigned>(static_cast<int>(floorf(gridpos)));
+    cellpos[2] = gridpos - floorf(gridpos);
 
-    lowernode[0] =
-        static_cast<int>((lowernode[0] + ek_lbparameters_gpu->dim_x) %
-                         ek_lbparameters_gpu->dim_x);
-    lowernode[1] =
-        static_cast<int>((lowernode[1] + ek_lbparameters_gpu->dim_y) %
-                         ek_lbparameters_gpu->dim_y);
-    lowernode[2] =
-        static_cast<int>((lowernode[2] + ek_lbparameters_gpu->dim_z) %
-                         ek_lbparameters_gpu->dim_z);
+    lowernode[0] = (lowernode[0] + ek_lbparameters_gpu->dim_x) %
+                   ek_lbparameters_gpu->dim_x;
+    lowernode[1] = (lowernode[1] + ek_lbparameters_gpu->dim_y) %
+                   ek_lbparameters_gpu->dim_y;
+    lowernode[2] = (lowernode[2] + ek_lbparameters_gpu->dim_z) %
+                   ek_lbparameters_gpu->dim_z;
 
     atomicAdd(&((cufftReal *)ek_parameters_gpu
                     ->charge_potential)[rhoindex_cartesian2linear_padded(
@@ -1973,32 +1937,31 @@ ek_spread_particle_force(CUDA_particle_data *particle_data,
                          LB_parameters_gpu *ek_lbparameters_gpu) {
 
   unsigned int index = ek_getThreadIndex();
-  int lowernode[3];
+  unsigned int lowernode[3];
   float cellpos[3];
   float gridpos;
 
   if (index < number_of_particles) {
+    // floor in signed arithmetic: gridpos is negative when p < agrid / 2,
+    // and converting a negative float to unsigned is undefined behavior
     gridpos = particle_data[index].p[0] / ek_parameters_gpu->agrid - 0.5f;
-    lowernode[0] = (int)floorf(gridpos);
-    cellpos[0] = gridpos - (float)(lowernode[0]);
+    lowernode[0] = static_cast<unsigned>(static_cast<int>(floorf(gridpos)));
+    cellpos[0] = gridpos - floorf(gridpos);
 
     gridpos = particle_data[index].p[1] / ek_parameters_gpu->agrid - 0.5f;
-    lowernode[1] = (int)floorf(gridpos);
-    cellpos[1] = gridpos - (float)(lowernode[1]);
+    lowernode[1] = static_cast<unsigned>(static_cast<int>(floorf(gridpos)));
+    cellpos[1] = gridpos - floorf(gridpos);
 
     gridpos = particle_data[index].p[2] / ek_parameters_gpu->agrid - 0.5f;
-    lowernode[2] = (int)floorf(gridpos);
-    cellpos[2] = gridpos - (float)(lowernode[2]);
-
-    lowernode[0] =
-        static_cast<int>((lowernode[0] + ek_lbparameters_gpu->dim_x) %
-                         ek_lbparameters_gpu->dim_x);
-    lowernode[1] =
-        static_cast<int>((lowernode[1] + ek_lbparameters_gpu->dim_y) %
-                         ek_lbparameters_gpu->dim_y);
-    lowernode[2] =
-        static_cast<int>((lowernode[2] + ek_lbparameters_gpu->dim_z) %
-                         ek_lbparameters_gpu->dim_z);
+    lowernode[2] = static_cast<unsigned>(static_cast<int>(floorf(gridpos)));
+    cellpos[2] = gridpos - floorf(gridpos);
+
+    lowernode[0] = (lowernode[0] + ek_lbparameters_gpu->dim_x) %
+                   ek_lbparameters_gpu->dim_x;
+    lowernode[1] = (lowernode[1] + ek_lbparameters_gpu->dim_y) %
+                   ek_lbparameters_gpu->dim_y;
+    lowernode[2] = (lowernode[2] + ek_lbparameters_gpu->dim_z) %
+                   ek_lbparameters_gpu->dim_z;
 
     float efield[3] = {0., 0., 0.};
     for (unsigned int dim = 0; dim < 3; ++dim) {
@@ -2149,7 +2110,7 @@ __global__ void ek_calculate_system_charge(ekfloat *charge_gpu) {
 }
 
 // TODO delete ?? (it has the previous step setting now)
-// This is not compatible with external LB force_densitys!
+// This is not compatible with external LB force_densities!
 __global__ void ek_clear_node_force(LB_node_force_density_gpu node_f) {
 
   unsigned int index = ek_getThreadIndex();
@@ -2173,17 +2134,13 @@ __global__ void ek_clear_node_force(LB_node_force_density_gpu node_f) {
 }
 
 void ek_calculate_electrostatic_coupling() {
-  const int blocks_per_grid_y = 4;
-  const int threads_per_block = 64;
 
   if ((!ek_parameters.es_coupling) || (!ek_initialized))
     return;
 
   auto device_particles = gpu_get_particle_pointer();
-  auto blocks_per_grid_x = static_cast<int>(
-      (device_particles.size() + threads_per_block * blocks_per_grid_y - 1) /
-      (threads_per_block * blocks_per_grid_y));
-  dim3 dim_grid = make_uint3(blocks_per_grid_x, blocks_per_grid_y, 1);
+  dim3 dim_grid = calculate_dim_grid(
+      static_cast<unsigned>(device_particles.size()), 4, threads_per_block);
 
   KERNELCALL(ek_spread_particle_force, dim_grid, threads_per_block,
              device_particles.data(), device_particles.size(),
@@ -2192,13 +2149,8 @@ void ek_calculate_electrostatic_coupling() {
 
 void ek_integrate_electrostatics() {
 
-  int threads_per_block = 64;
-  int blocks_per_grid_y = 4;
-  auto blocks_per_grid_x =
-      static_cast<int>((ek_parameters.number_of_nodes +
-                        threads_per_block * blocks_per_grid_y - 1) /
-                       (threads_per_block * blocks_per_grid_y));
-  dim3 dim_grid = make_uint3(blocks_per_grid_x, blocks_per_grid_y, 1);
+  dim3 dim_grid =
+      calculate_dim_grid(ek_parameters.number_of_nodes, 4, threads_per_block);
 
   KERNELCALL(ek_gather_species_charge_density, dim_grid, threads_per_block);
 
@@ -2215,13 +2167,10 @@ void ek_integrate_electrostatics() {
   }
 
   auto device_particles = gpu_get_particle_pointer();
-  if (not device_particles
-              .empty()) // TODO make it an if number_of_charged_particles != 0
-  {
-    blocks_per_grid_x = static_cast<int>(
-        (device_particles.size() + threads_per_block * blocks_per_grid_y - 1) /
-        (threads_per_block * blocks_per_grid_y));
-    dim_grid = make_uint3(blocks_per_grid_x, blocks_per_grid_y, 1);
+  // TODO make it an if number_of_charged_particles != 0
+  if (not device_particles.empty()) {
+    dim_grid = calculate_dim_grid(
+        static_cast<unsigned>(device_particles.size()), 4, threads_per_block);
 
     particle_data_gpu = device_particles.data();
 
@@ -2233,14 +2182,8 @@ void ek_integrate_electrostatics() {
 }
 
 void ek_integrate() {
-  /** values for the kernel call */
-  int threads_per_block = 64;
-  int blocks_per_grid_y = 4;
-  auto blocks_per_grid_x =
-      static_cast<int>((ek_parameters.number_of_nodes +
-                        threads_per_block * blocks_per_grid_y - 1) /
-                       (threads_per_block * blocks_per_grid_y));
-  dim3 dim_grid = make_uint3(blocks_per_grid_x, blocks_per_grid_y, 1);
+  dim3 dim_grid =
+      calculate_dim_grid(ek_parameters.number_of_nodes, 4, threads_per_block);
 
   /* Clears the force on the nodes and must be called before fluxes are
      calculated, since in the reaction set up the previous-step LB force is
@@ -2251,7 +2194,7 @@ void ek_integrate() {
   // KERNELCALL( ek_clear_node_force, dim_grid, threads_per_block, node_f );
 
   /* Integrate diffusion-advection */
-  for (int i = 0; i < ek_parameters.number_of_species; i++) {
+  for (unsigned i = 0; i < ek_parameters.number_of_species; i++) {
     KERNELCALL(ek_clear_fluxes, dim_grid, threads_per_block);
     KERNELCALL(ek_calculate_quantities, dim_grid, threads_per_block, i,
                *current_nodes, node_f, ek_lbparameters_gpu, ek_lb_device_values,
@@ -2281,13 +2224,8 @@ void ek_gather_wallcharge_species_density(ekfloat *wallcharge_species_density,
 }
 void ek_init_species_density_wallcharge(ekfloat *wallcharge_species_density,
                                         int wallcharge_species) {
-  int threads_per_block = 64;
-  int blocks_per_grid_y = 4;
-  auto blocks_per_grid_x =
-      static_cast<int>((ek_parameters.number_of_nodes +
-                        threads_per_block * blocks_per_grid_y - 1) /
-                       (threads_per_block * blocks_per_grid_y));
-  dim3 dim_grid = make_uint3(blocks_per_grid_x, blocks_per_grid_y, 1);
+  dim3 dim_grid =
+      calculate_dim_grid(ek_parameters.number_of_nodes, 4, threads_per_block);
 
   KERNELCALL(ek_clear_boundary_densities, dim_grid, threads_per_block,
              *current_nodes);
@@ -2339,19 +2277,7 @@ int ek_init() {
     return 1;
   }
 
-  int threads_per_block = 64;
-  int blocks_per_grid_y = 4;
-  int blocks_per_grid_x;
-  dim3 dim_grid;
-
   if (!ek_initialized) {
-    if (cudaGetSymbolAddress((void **)&ek_parameters_gpu_pointer,
-                             ek_parameters_gpu) != cudaSuccess) {
-      fprintf(stderr, "ERROR: Fetching constant memory pointer\n");
-
-      return 1;
-    }
-
     for (auto &val : ek_parameters.species_index) {
       val = -1;
     }
@@ -2374,10 +2300,10 @@ int ek_init() {
     lb_lbcoupling_set_gamma(ek_parameters.friction);
 
     // Convert the density (given in MD units) to LB units
-    lbpar_gpu.rho = (ek_parameters.lb_density < 0.0
-                         ? 1.0f
-                         : ek_parameters.lb_density *
-                               Utils::int_pow<3>(ek_parameters.agrid));
+    lbpar_gpu.rho =
+        (ek_parameters.lb_density < 0.0)
+            ? 1.0f
+            : ek_parameters.lb_density * Utils::int_pow<3>(ek_parameters.agrid);
 
     lbpar_gpu.is_TRT = true;
 
@@ -2436,7 +2362,6 @@ int ek_init() {
                                      sizeof(EK_parameters)));
 
     lb_get_para_pointer(&ek_lbparameters_gpu);
-    lb_set_ek_pointer(ek_parameters_gpu_pointer);
 
     cuda_safe_mem(
         cudaMalloc((void **)&ek_parameters.lb_force_density_previous,
@@ -2491,11 +2416,9 @@ int ek_init() {
                                      sizeof(EK_parameters)));
 
     // clear initial LB force and finish up
-    blocks_per_grid_x = static_cast<int>(
-        (ek_parameters.dim_z * ek_parameters.dim_y * (ek_parameters.dim_x) +
-         threads_per_block * blocks_per_grid_y - 1) /
-        (threads_per_block * blocks_per_grid_y));
-    dim_grid = make_uint3(blocks_per_grid_x, blocks_per_grid_y, 1);
+    dim3 dim_grid = calculate_dim_grid(
+        ek_parameters.dim_z * ek_parameters.dim_y * ek_parameters.dim_x, 4,
+        threads_per_block);
     KERNELCALL(ek_clear_node_force, dim_grid, threads_per_block, node_f);
 
     ek_initialized = true;
@@ -2518,11 +2441,8 @@ int ek_init() {
     cuda_safe_mem(cudaMemcpyToSymbol(ek_parameters_gpu, &ek_parameters,
                                      sizeof(EK_parameters)));
 
-    blocks_per_grid_x =
-        static_cast<int>((ek_parameters.number_of_nodes +
-                          threads_per_block * blocks_per_grid_y - 1) /
-                         (threads_per_block * blocks_per_grid_y));
-    dim_grid = make_uint3(blocks_per_grid_x, blocks_per_grid_y, 1);
+    dim3 dim_grid =
+        calculate_dim_grid(ek_parameters.number_of_nodes, 4, threads_per_block);
 
     KERNELCALL(ek_init_species_density_homogeneous, dim_grid,
                threads_per_block);
@@ -2540,24 +2460,17 @@ int ek_init() {
   return 0;
 }
 
-void lb_set_ek_pointer(EK_parameters *pointeradress) {
-  lb_ek_parameters_gpu = pointeradress;
-}
-
 unsigned int ek_calculate_boundary_mass() {
-  auto *bound_array = (unsigned int *)Utils::malloc(lbpar_gpu.number_of_nodes *
-                                                    sizeof(unsigned int));
+  std::vector<unsigned int> bound_array(lbpar_gpu.number_of_nodes);
 
-  lb_get_boundary_flags_GPU(bound_array);
+  lb_get_boundary_flags_GPU(bound_array.data());
 
   unsigned int boundary_node_number = 0;
 
-  for (int j = 0; j < ek_parameters.number_of_nodes; j++)
+  for (unsigned j = 0; j < ek_parameters.number_of_nodes; j++)
     if (bound_array[j] != 0)
       boundary_node_number++;
 
-  free(bound_array);
-
   return boundary_node_number;
 }
 
@@ -2571,10 +2484,7 @@ void rhoindex_linear2cartesian_host(unsigned int index, unsigned int *coord) {
 
 unsigned int jindex_cartesian2linear_host(unsigned int x, unsigned int y,
                                           unsigned int z, unsigned int c) {
-
-  x = (x + ek_parameters.dim_x) %
-      ek_parameters
-          .dim_x; // this does not happen in the GPU version of this function
+  x = (x + ek_parameters.dim_x) % ek_parameters.dim_x;
   y = (y + ek_parameters.dim_y) % ek_parameters.dim_y;
   z = (z + ek_parameters.dim_z) % ek_parameters.dim_z;
 
@@ -2604,9 +2514,8 @@ int ek_lb_print_vtk_velocity(char *filename) {
     return 1;
   }
 
-  auto *host_values = (LB_rho_v_pi_gpu *)Utils::malloc(
-      lbpar_gpu.number_of_nodes * sizeof(LB_rho_v_pi_gpu));
-  lb_get_values_GPU(host_values);
+  std::vector<LB_rho_v_pi_gpu> host_values(lbpar_gpu.number_of_nodes);
+  lb_get_values_GPU(host_values.data());
   auto const lattice_speed = lbpar_gpu.agrid / lbpar_gpu.tau;
   fprintf(fp, "\
 # vtk DataFile Version 2.0\n\
@@ -2625,35 +2534,30 @@ LOOKUP_TABLE default\n",
           lbpar_gpu.agrid * 0.5f, lbpar_gpu.agrid, lbpar_gpu.agrid,
           lbpar_gpu.agrid, lbpar_gpu.number_of_nodes);
 
-  for (int i = 0; i < lbpar_gpu.number_of_nodes; i++) {
-    fprintf(fp, "%e %e %e ", host_values[i].v[0] * lattice_speed,
+  for (unsigned i = 0; i < lbpar_gpu.number_of_nodes; i++) {
+    fprintf(fp, "%e %e %e\n", host_values[i].v[0] * lattice_speed,
             host_values[i].v[1] * lattice_speed,
             host_values[i].v[2] * lattice_speed);
   }
 
-  free(host_values);
   fclose(fp);
 
   return 0;
 }
 
-int ek_node_print_velocity(
-    int x, int y, int z,
-    double *velocity) { // TODO only calculate single node velocity
+int ek_node_print_velocity(int x, int y, int z, double *velocity) {
+  // TODO: only calculate single node velocity
+  std::vector<LB_rho_v_pi_gpu> host_values(lbpar_gpu.number_of_nodes);
+  lb_get_values_GPU(host_values.data());
 
-  auto *host_values = (LB_rho_v_pi_gpu *)Utils::malloc(
-      lbpar_gpu.number_of_nodes * sizeof(LB_rho_v_pi_gpu));
-  lb_get_values_GPU(host_values);
-
-  auto const i = z * ek_parameters.dim_y * ek_parameters.dim_x +
-                 y * ek_parameters.dim_x + x;
+  auto const index =
+      static_cast<unsigned>(z) * ek_parameters.dim_y * ek_parameters.dim_x +
+      static_cast<unsigned>(y) * ek_parameters.dim_x + static_cast<unsigned>(x);
   auto const lattice_speed = lbpar_gpu.agrid / lbpar_gpu.tau;
 
-  velocity[0] = host_values[i].v[0] * lattice_speed;
-  velocity[1] = host_values[i].v[1] * lattice_speed;
-  velocity[2] = host_values[i].v[2] * lattice_speed;
-
-  free(host_values);
+  velocity[0] = host_values[index].v[0] * lattice_speed;
+  velocity[1] = host_values[index].v[1] * lattice_speed;
+  velocity[2] = host_values[index].v[2] * lattice_speed;
 
   return 0;
 }
@@ -2666,9 +2570,8 @@ int ek_lb_print_vtk_density(char *filename) {
     return 1;
   }
 
-  auto *host_values = (LB_rho_v_pi_gpu *)Utils::malloc(
-      lbpar_gpu.number_of_nodes * sizeof(LB_rho_v_pi_gpu));
-  lb_get_values_GPU(host_values);
+  std::vector<LB_rho_v_pi_gpu> host_values(lbpar_gpu.number_of_nodes);
+  lb_get_values_GPU(host_values.data());
 
   fprintf(fp, "\
 # vtk DataFile Version 2.0\n\
@@ -2688,11 +2591,10 @@ LOOKUP_TABLE default\n",
           lbpar_gpu.agrid * 0.5f, lbpar_gpu.agrid, lbpar_gpu.agrid,
           lbpar_gpu.agrid, lbpar_gpu.number_of_nodes);
   auto const agrid = lb_lbfluid_get_agrid();
-  for (int i = 0; i < lbpar_gpu.number_of_nodes; i++) {
-    fprintf(fp, "%e ", host_values[i].rho / agrid / agrid / agrid);
+  for (unsigned i = 0; i < lbpar_gpu.number_of_nodes; i++) {
+    fprintf(fp, "%e\n", host_values[i].rho / agrid / agrid / agrid);
   }
 
-  free(host_values);
   fclose(fp);
 
   return 0;
@@ -2710,12 +2612,11 @@ int ek_print_vtk_density(int species, char *filename) {
     return 1;
   }
 
-  auto *densities =
-      (ekfloat *)Utils::malloc(ek_parameters.number_of_nodes * sizeof(ekfloat));
+  std::vector<ekfloat> densities(ek_parameters.number_of_nodes);
 
   cuda_safe_mem(cudaMemcpy(
-      densities, ek_parameters.rho[ek_parameters.species_index[species]],
-      ek_parameters.number_of_nodes * sizeof(ekfloat), cudaMemcpyDeviceToHost));
+      densities.data(), ek_parameters.rho[ek_parameters.species_index[species]],
+      densities.size() * sizeof(ekfloat), cudaMemcpyDeviceToHost));
 
   fprintf(fp, "\
 # vtk DataFile Version 2.0\n\
@@ -2736,11 +2637,10 @@ LOOKUP_TABLE default\n",
           ek_parameters.agrid, ek_parameters.agrid, ek_parameters.agrid,
           ek_parameters.number_of_nodes, species);
 
-  for (int i = 0; i < ek_parameters.number_of_nodes; i++) {
+  for (unsigned i = 0; i < ek_parameters.number_of_nodes; i++) {
     fprintf(fp, "%e\n", densities[i] / Utils::int_pow<3>(ek_parameters.agrid));
   }
 
-  free(densities);
   fclose(fp);
 
   return 0;
@@ -2752,18 +2652,16 @@ int ek_node_print_density(int species, int x, int y, int z, double *density) {
     return 1;
   }
 
-  auto *densities =
-      (ekfloat *)Utils::malloc(ek_parameters.number_of_nodes * sizeof(ekfloat));
+  std::vector<ekfloat> densities(ek_parameters.number_of_nodes);
 
   cuda_safe_mem(cudaMemcpy(
-      densities, ek_parameters.rho[ek_parameters.species_index[species]],
-      ek_parameters.number_of_nodes * sizeof(ekfloat), cudaMemcpyDeviceToHost));
+      densities.data(), ek_parameters.rho[ek_parameters.species_index[species]],
+      densities.size() * sizeof(ekfloat), cudaMemcpyDeviceToHost));
 
-  *density = densities[z * ek_parameters.dim_y * ek_parameters.dim_x +
-                       y * ek_parameters.dim_x + x] /
-             Utils::int_pow<3>(ek_parameters.agrid);
-
-  free(densities);
+  auto const index =
+      static_cast<unsigned>(z) * ek_parameters.dim_y * ek_parameters.dim_x +
+      static_cast<unsigned>(y) * ek_parameters.dim_x + static_cast<unsigned>(x);
+  *density = densities[index] / Utils::int_pow<3>(ek_parameters.agrid);
 
   return 0;
 }
@@ -2778,34 +2676,30 @@ int ek_node_print_flux(int species, int x, int y, int z, double *flux) {
                                    // into Cartesian coordinates for output
   unsigned int coord[3];
 
-  coord[0] = x;
-  coord[1] = y;
-  coord[2] = z;
+  coord[0] = static_cast<unsigned>(x);
+  coord[1] = static_cast<unsigned>(y);
+  coord[2] = static_cast<unsigned>(z);
 
-  auto *fluxes = (ekfloat *)Utils::malloc(ek_parameters.number_of_nodes * 13 *
-                                          sizeof(ekfloat));
+  std::vector<ekfloat> fluxes(ek_parameters.number_of_nodes * 13);
 
-  int threads_per_block = 64;
-  int blocks_per_grid_y = 4;
-  auto blocks_per_grid_x =
-      static_cast<int>((ek_parameters.number_of_nodes +
-                        threads_per_block * blocks_per_grid_y - 1) /
-                       (threads_per_block * blocks_per_grid_y));
-  dim3 dim_grid = make_uint3(blocks_per_grid_x, blocks_per_grid_y, 1);
+  dim3 dim_grid =
+      calculate_dim_grid(ek_parameters.number_of_nodes, 4, threads_per_block);
 
   KERNELCALL(ek_clear_fluxes, dim_grid, threads_per_block);
   KERNELCALL(ek_calculate_quantities, dim_grid, threads_per_block,
-             ek_parameters.species_index[species], *current_nodes, node_f,
-             ek_lbparameters_gpu, ek_lb_device_values, philox_counter.value());
+             static_cast<unsigned>(ek_parameters.species_index[species]),
+             *current_nodes, node_f, ek_lbparameters_gpu, ek_lb_device_values,
+             philox_counter.value());
   reset_LB_force_densities_GPU(false);
 
 #ifdef EK_BOUNDARIES
   KERNELCALL(ek_apply_boundaries, dim_grid, threads_per_block,
-             ek_parameters.species_index[species], *current_nodes, node_f);
+             static_cast<unsigned>(ek_parameters.species_index[species]),
+             *current_nodes, node_f);
 #endif
 
-  cuda_safe_mem(cudaMemcpy(fluxes, ek_parameters.j,
-                           ek_parameters.number_of_nodes * 13 * sizeof(ekfloat),
+  cuda_safe_mem(cudaMemcpy(fluxes.data(), ek_parameters.j,
+                           fluxes.size() * sizeof(ekfloat),
                            cudaMemcpyDeviceToHost));
 
   auto const i = rhoindex_cartesian2linear_host(coord[0], coord[1], coord[2]);
@@ -2970,24 +2864,24 @@ int ek_node_print_flux(int species, int x, int y, int z, double *flux) {
   flux[2] = flux_local_cartesian[2] /
             (ek_parameters.time_step * Utils::sqr(ek_parameters.agrid));
 
-  free(fluxes);
-
   return 0;
 }
 
 int ek_node_set_density(int species, int x, int y, int z, double density) {
-  if (ek_parameters.species_index[species] != -1) {
-    auto index =
-        static_cast<int>(z * ek_parameters.dim_y * ek_parameters.dim_x +
-                         y * ek_parameters.dim_x + x);
-    ekfloat num_particles =
-        static_cast<ekfloat>(density) * Utils::int_pow<3>(ek_parameters.agrid);
 
-    cuda_safe_mem(cudaMemcpy(
-        &ek_parameters.rho[ek_parameters.species_index[species]][index],
-        &num_particles, sizeof(ekfloat), cudaMemcpyHostToDevice));
-  } else
+  if (ek_parameters.species_index[species] == -1) {
     return 1;
+  }
+
+  auto const index =
+      static_cast<unsigned>(z) * ek_parameters.dim_y * ek_parameters.dim_x +
+      static_cast<unsigned>(y) * ek_parameters.dim_x + static_cast<unsigned>(x);
+  ekfloat num_particles =
+      static_cast<ekfloat>(density) * Utils::int_pow<3>(ek_parameters.agrid);
+
+  cuda_safe_mem(cudaMemcpy(
+      &ek_parameters.rho[ek_parameters.species_index[species]][index],
+      &num_particles, sizeof(ekfloat), cudaMemcpyHostToDevice));
 
   return 0;
 }
@@ -3009,30 +2903,26 @@ int ek_print_vtk_flux(int species, char *filename) {
 
   unsigned int coord[3];
 
-  auto *fluxes = (ekfloat *)Utils::malloc(ek_parameters.number_of_nodes * 13 *
-                                          sizeof(ekfloat));
+  std::vector<ekfloat> fluxes(ek_parameters.number_of_nodes * 13);
 
-  int threads_per_block = 64;
-  int blocks_per_grid_y = 4;
-  auto blocks_per_grid_x =
-      static_cast<int>((ek_parameters.number_of_nodes +
-                        threads_per_block * blocks_per_grid_y - 1) /
-                       (threads_per_block * blocks_per_grid_y));
-  dim3 dim_grid = make_uint3(blocks_per_grid_x, blocks_per_grid_y, 1);
+  dim3 dim_grid =
+      calculate_dim_grid(ek_parameters.number_of_nodes, 4, threads_per_block);
 
   KERNELCALL(ek_clear_fluxes, dim_grid, threads_per_block);
   KERNELCALL(ek_calculate_quantities, dim_grid, threads_per_block,
-             ek_parameters.species_index[species], *current_nodes, node_f,
-             ek_lbparameters_gpu, ek_lb_device_values, philox_counter.value());
+             static_cast<unsigned>(ek_parameters.species_index[species]),
+             *current_nodes, node_f, ek_lbparameters_gpu, ek_lb_device_values,
+             philox_counter.value());
   reset_LB_force_densities_GPU(false);
 
 #ifdef EK_BOUNDARIES
   KERNELCALL(ek_apply_boundaries, dim_grid, threads_per_block,
-             ek_parameters.species_index[species], *current_nodes, node_f);
+             static_cast<unsigned>(ek_parameters.species_index[species]),
+             *current_nodes, node_f);
 #endif
 
-  cuda_safe_mem(cudaMemcpy(fluxes, ek_parameters.j,
-                           ek_parameters.number_of_nodes * 13 * sizeof(ekfloat),
+  cuda_safe_mem(cudaMemcpy(fluxes.data(), ek_parameters.j,
+                           fluxes.size() * sizeof(ekfloat),
                            cudaMemcpyDeviceToHost));
 
   fprintf(fp, "\
@@ -3054,7 +2944,7 @@ LOOKUP_TABLE default\n",
           ek_parameters.agrid, ek_parameters.agrid, ek_parameters.agrid,
           ek_parameters.number_of_nodes, species);
 
-  for (int i = 0; i < ek_parameters.number_of_nodes; i++) {
+  for (unsigned i = 0; i < ek_parameters.number_of_nodes; i++) {
     rhoindex_linear2cartesian_host(i, coord);
 
     flux_local_cartesian[0] =
@@ -3219,7 +3109,6 @@ LOOKUP_TABLE default\n",
                 (ek_parameters.time_step * Utils::sqr(ek_parameters.agrid)));
   }
 
-  free(fluxes);
   fclose(fp);
 
   return 0;
@@ -3229,6 +3118,10 @@ int ek_print_vtk_flux_fluc(int species, char *filename) {
 #ifndef EK_DEBUG
   return 1;
 #else
+  if (ek_parameters.species_index[species] == -1) {
+    return 1;
+  }
+
   FILE *fp = fopen(filename, "w");
   ekfloat flux_local_cartesian[3]; // temporary variable for converting fluxes
                                    // into cartesian coordinates for output
@@ -3239,39 +3132,31 @@ int ek_print_vtk_flux_fluc(int species, char *filename) {
     return 1;
   }
 
-  ekfloat *fluxes = (ekfloat *)Utils::malloc(ek_parameters.number_of_nodes *
-                                             13 * sizeof(ekfloat));
+  std::vector<ekfloat> fluxes(ek_parameters.number_of_nodes * 13);
 
-  if (ek_parameters.species_index[species] != -1) {
-    int threads_per_block = 64;
-    int blocks_per_grid_y = 4;
-    int blocks_per_grid_x = (ek_parameters.number_of_nodes +
-                             threads_per_block * blocks_per_grid_y - 1) /
-                            (threads_per_block * blocks_per_grid_y);
-    dim3 dim_grid = make_uint3(blocks_per_grid_x, blocks_per_grid_y, 1);
+  dim3 dim_grid =
+      calculate_dim_grid(ek_parameters.number_of_nodes, 4, threads_per_block);
 
-    KERNELCALL(ek_clear_fluxes, dim_grid, threads_per_block);
-    KERNELCALL(ek_calculate_quantities, dim_grid, threads_per_block,
-               ek_parameters.species_index[species], *current_nodes, node_f,
-               ek_lbparameters_gpu, ek_lb_device_values,
-               philox_counter.value());
-    reset_LB_force_densities_GPU(false);
+  KERNELCALL(ek_clear_fluxes, dim_grid, threads_per_block);
+  KERNELCALL(ek_calculate_quantities, dim_grid, threads_per_block,
+             static_cast<unsigned>(ek_parameters.species_index[species]),
+             *current_nodes, node_f, ek_lbparameters_gpu, ek_lb_device_values,
+             philox_counter.value());
+  reset_LB_force_densities_GPU(false);
 
 #ifdef EK_BOUNDARIES
-    KERNELCALL(ek_apply_boundaries, dim_grid, threads_per_block,
-               ek_parameters.species_index[species], *current_nodes, node_f);
+  KERNELCALL(ek_apply_boundaries, dim_grid, threads_per_block,
+             static_cast<unsigned>(ek_parameters.species_index[species]),
+             *current_nodes, node_f);
 #endif
 
-    cuda_safe_mem(
-        cudaMemcpy(fluxes, ek_parameters.j_fluc,
-                   ek_parameters.number_of_nodes * 13 * sizeof(ekfloat),
-                   cudaMemcpyDeviceToHost));
-  } else
-    return 1;
+  cuda_safe_mem(cudaMemcpy(fluxes.data(), ek_parameters.j_fluc,
+                           fluxes.size() * sizeof(ekfloat),
+                           cudaMemcpyDeviceToHost));
 
   fprintf(fp, "\
 # vtk DataFile Version 2.0\n\
-flux_%d\n\
+flux_fluc_%d\n\
 ASCII\n\
 \n\
 DATASET STRUCTURED_POINTS\n\
@@ -3280,7 +3164,7 @@ ORIGIN %f %f %f\n\
 SPACING %f %f %f\n\
 \n\
 POINT_DATA %u\n\
-SCALARS flux_%d float 3\n\
+SCALARS flux_fluc_%d float 4\n\
 LOOKUP_TABLE default\n",
           species, ek_parameters.dim_x, ek_parameters.dim_y,
           ek_parameters.dim_z, ek_parameters.agrid * 0.5f,
@@ -3288,7 +3172,7 @@ LOOKUP_TABLE default\n",
           ek_parameters.agrid, ek_parameters.agrid, ek_parameters.agrid,
           ek_parameters.number_of_nodes, species);
 
-  for (int i = 0; i < ek_parameters.number_of_nodes; i++) {
+  for (unsigned i = 0; i < ek_parameters.number_of_nodes; i++) {
 
     float flux_local_linksum = 0;
     rhoindex_linear2cartesian_host(i, coord);
@@ -3458,7 +3342,6 @@ LOOKUP_TABLE default\n",
         flux_local_linksum / (ek_parameters.agrid * ek_parameters.agrid));
   }
 
-  free(fluxes);
   fclose(fp);
 
   return 0;
@@ -3479,35 +3362,31 @@ int ek_print_vtk_flux_link(int species, char *filename) {
 
   unsigned int coord[3];
 
-  auto *fluxes = (ekfloat *)Utils::malloc(ek_parameters.number_of_nodes * 13 *
-                                          sizeof(ekfloat));
+  std::vector<ekfloat> fluxes(ek_parameters.number_of_nodes * 13);
 
-  int threads_per_block = 64;
-  int blocks_per_grid_y = 4;
-  auto blocks_per_grid_x =
-      static_cast<int>((ek_parameters.number_of_nodes +
-                        threads_per_block * blocks_per_grid_y - 1) /
-                       (threads_per_block * blocks_per_grid_y));
-  dim3 dim_grid = make_uint3(blocks_per_grid_x, blocks_per_grid_y, 1);
+  dim3 dim_grid =
+      calculate_dim_grid(ek_parameters.number_of_nodes, 4, threads_per_block);
 
   KERNELCALL(ek_clear_fluxes, dim_grid, threads_per_block);
   KERNELCALL(ek_calculate_quantities, dim_grid, threads_per_block,
-             ek_parameters.species_index[species], *current_nodes, node_f,
-             ek_lbparameters_gpu, ek_lb_device_values, philox_counter.value());
+             static_cast<unsigned>(ek_parameters.species_index[species]),
+             *current_nodes, node_f, ek_lbparameters_gpu, ek_lb_device_values,
+             philox_counter.value());
   reset_LB_force_densities_GPU(false);
 
 #ifdef EK_BOUNDARIES
   KERNELCALL(ek_apply_boundaries, dim_grid, threads_per_block,
-             ek_parameters.species_index[species], *current_nodes, node_f);
+             static_cast<unsigned>(ek_parameters.species_index[species]),
+             *current_nodes, node_f);
 #endif
 
-  cuda_safe_mem(cudaMemcpy(fluxes, ek_parameters.j,
-                           ek_parameters.number_of_nodes * 13 * sizeof(ekfloat),
+  cuda_safe_mem(cudaMemcpy(fluxes.data(), ek_parameters.j,
+                           fluxes.size() * sizeof(ekfloat),
                            cudaMemcpyDeviceToHost));
 
   fprintf(fp, "\
 # vtk DataFile Version 2.0\n\
-flux_%d\n\
+flux_link_%d\n\
 ASCII\n\
 \n\
 DATASET STRUCTURED_POINTS\n\
@@ -3516,7 +3395,7 @@ ORIGIN %f %f %f\n\
 SPACING %f %f %f\n\
 \n\
 POINT_DATA %u\n\
-SCALARS flux_%d float 3\n\
+SCALARS flux_link_%d float 13\n\
 LOOKUP_TABLE default\n",
           species, ek_parameters.dim_x, ek_parameters.dim_y,
           ek_parameters.dim_z, ek_parameters.agrid * 0.5f,
@@ -3524,10 +3403,10 @@ LOOKUP_TABLE default\n",
           ek_parameters.agrid, ek_parameters.agrid, ek_parameters.agrid,
           ek_parameters.number_of_nodes, species);
 
-  for (int i = 0; i < ek_parameters.number_of_nodes; i++) {
+  for (unsigned i = 0; i < ek_parameters.number_of_nodes; i++) {
     rhoindex_linear2cartesian_host(i, coord);
 
-    fprintf(fp, "%e %e %e %e %e %e %e %e %e %e %e %e %e \n",
+    fprintf(fp, "%e %e %e %e %e %e %e %e %e %e %e %e %e\n",
             fluxes[jindex_getByRhoLinear_host(i, 0)],
             fluxes[jindex_getByRhoLinear_host(i, 1)],
             fluxes[jindex_getByRhoLinear_host(i, 2)],
@@ -3543,19 +3422,19 @@ LOOKUP_TABLE default\n",
             fluxes[jindex_getByRhoLinear_host(i, 12)]);
   }
 
-  free(fluxes);
   fclose(fp);
 
   return 0;
 }
 
 int ek_node_print_potential(int x, int y, int z, double *potential) {
-  auto i =
-      static_cast<int>(z * ek_parameters.dim_y * ek_parameters.dim_x_padded +
-                       y * ek_parameters.dim_x_padded + x);
+  auto const index = static_cast<unsigned>(z) * ek_parameters.dim_y *
+                         ek_parameters.dim_x_padded +
+                     static_cast<unsigned>(y) * ek_parameters.dim_x_padded +
+                     static_cast<unsigned>(x);
   float pot;
 
-  cuda_safe_mem(cudaMemcpy(&pot, &ek_parameters.charge_potential[i],
+  cuda_safe_mem(cudaMemcpy(&pot, &ek_parameters.charge_potential[index],
                            1 * sizeof(cufftReal), cudaMemcpyDeviceToHost));
 
   *potential = pot;
@@ -3570,15 +3449,14 @@ int ek_print_vtk_potential(char *filename) {
     return 1;
   }
 
-  auto *potential =
-      (float *)Utils::malloc(ek_parameters.number_of_nodes * sizeof(cufftReal));
+  std::vector<cufftReal> potential(ek_parameters.number_of_nodes);
 
-  cuda_safe_mem(cudaMemcpy2D(potential, ek_parameters.dim_x * sizeof(cufftReal),
-                             ek_parameters.charge_potential,
-                             ek_parameters.dim_x_padded * sizeof(cufftReal),
-                             ek_parameters.dim_x * sizeof(cufftReal),
-                             ek_parameters.dim_z * ek_parameters.dim_y,
-                             cudaMemcpyDeviceToHost));
+  cuda_safe_mem(cudaMemcpy2D(
+      potential.data(), ek_parameters.dim_x * sizeof(cufftReal),
+      ek_parameters.charge_potential,
+      ek_parameters.dim_x_padded * sizeof(cufftReal),
+      ek_parameters.dim_x * sizeof(cufftReal),
+      ek_parameters.dim_z * ek_parameters.dim_y, cudaMemcpyDeviceToHost));
 
   fprintf(fp, "\
 # vtk DataFile Version 2.0\n\
@@ -3598,11 +3476,10 @@ LOOKUP_TABLE default\n",
           ek_parameters.agrid * 0.5f, ek_parameters.agrid, ek_parameters.agrid,
           ek_parameters.agrid, ek_parameters.number_of_nodes);
 
-  for (int i = 0; i < ek_parameters.number_of_nodes; i++) {
+  for (unsigned i = 0; i < ek_parameters.number_of_nodes; i++) {
     fprintf(fp, "%e\n", potential[i]);
   }
 
-  free(potential);
   fclose(fp);
 
   return 0;
@@ -3616,15 +3493,14 @@ int ek_print_vtk_particle_potential(char *filename) {
     return 1;
   }
 
-  auto *potential =
-      (float *)Utils::malloc(ek_parameters.number_of_nodes * sizeof(cufftReal));
+  std::vector<cufftReal> potential(ek_parameters.number_of_nodes);
 
-  cuda_safe_mem(cudaMemcpy2D(potential, ek_parameters.dim_x * sizeof(cufftReal),
-                             ek_parameters.charge_potential_buffer,
-                             ek_parameters.dim_x_padded * sizeof(cufftReal),
-                             ek_parameters.dim_x * sizeof(cufftReal),
-                             ek_parameters.dim_z * ek_parameters.dim_y,
-                             cudaMemcpyDeviceToHost));
+  cuda_safe_mem(cudaMemcpy2D(
+      potential.data(), ek_parameters.dim_x * sizeof(cufftReal),
+      ek_parameters.charge_potential_buffer,
+      ek_parameters.dim_x_padded * sizeof(cufftReal),
+      ek_parameters.dim_x * sizeof(cufftReal),
+      ek_parameters.dim_z * ek_parameters.dim_y, cudaMemcpyDeviceToHost));
 
   fprintf(fp, "\
 # vtk DataFile Version 2.0\n\
@@ -3644,11 +3520,10 @@ LOOKUP_TABLE default\n",
           ek_parameters.agrid * 0.5f, ek_parameters.agrid, ek_parameters.agrid,
           ek_parameters.agrid, ek_parameters.number_of_nodes);
 
-  for (int i = 0; i < ek_parameters.number_of_nodes; i++) {
+  for (unsigned i = 0; i < ek_parameters.number_of_nodes; i++) {
     fprintf(fp, "%e\n", potential[i]);
   }
 
-  free(potential);
   fclose(fp);
 
   return 0;
@@ -3665,11 +3540,10 @@ int ek_print_vtk_lbforce_density(char *filename) {
     return 1;
   }
 
-  auto *lbforce_density = (lbForceFloat *)Utils::malloc(
-      ek_parameters.number_of_nodes * 3 * sizeof(lbForceFloat));
+  std::vector<lbForceFloat> lbforce_density(ek_parameters.number_of_nodes * 3);
 
   cuda_safe_mem(
-      cudaMemcpy(lbforce_density, node_f.force_density_buf,
+      cudaMemcpy(lbforce_density.data(), node_f.force_density_buf,
                  ek_parameters.number_of_nodes * 3 * sizeof(lbForceFloat),
                  cudaMemcpyDeviceToHost));
 
@@ -3691,19 +3565,14 @@ LOOKUP_TABLE default\n",
           ek_parameters.agrid * 0.5f, ek_parameters.agrid, ek_parameters.agrid,
           ek_parameters.agrid, ek_parameters.number_of_nodes);
 
-  for (int i = 0; i < ek_parameters.number_of_nodes; i++) {
-    fprintf(fp, "%e %e %e\n",
-            lbforce_density[i] / (powf(ek_parameters.time_step, 2.0) *
-                                  powf(ek_parameters.agrid, 4.0)),
-            lbforce_density[i + ek_parameters.number_of_nodes] /
-                (powf(ek_parameters.time_step, 2.0) *
-                 powf(ek_parameters.agrid, 4.0)),
-            lbforce_density[i + 2 * ek_parameters.number_of_nodes] /
-                (powf(ek_parameters.time_step, 2.0) *
-                 powf(ek_parameters.agrid, 4.0)));
+  auto const norm = (Utils::int_pow<2>(ek_parameters.time_step) *
+                     Utils::int_pow<4>(ek_parameters.agrid));
+  for (unsigned i = 0; i < ek_parameters.number_of_nodes; i++) {
+    fprintf(fp, "%e %e %e\n", lbforce_density[i] / norm,
+            lbforce_density[i + ek_parameters.number_of_nodes] / norm,
+            lbforce_density[i + 2 * ek_parameters.number_of_nodes] / norm);
   }
 
-  free(lbforce_density);
   fclose(fp);
 
   return 0;
@@ -4016,13 +3885,8 @@ ekfloat ek_get_particle_charge() {
 ekfloat ek_calculate_net_charge() {
   cuda_safe_mem(cudaMemset(charge_gpu, 0, sizeof(ekfloat)));
 
-  int threads_per_block = 64;
-  int blocks_per_grid_y = 4;
-  auto blocks_per_grid_x =
-      static_cast<int>((ek_parameters.number_of_nodes +
-                        threads_per_block * blocks_per_grid_y - 1) /
-                       (threads_per_block * blocks_per_grid_y));
-  dim3 dim_grid = make_uint3(blocks_per_grid_x, blocks_per_grid_y, 1);
+  dim3 dim_grid =
+      calculate_dim_grid(ek_parameters.number_of_nodes, 4, threads_per_block);
 
   KERNELCALL(ek_calculate_system_charge, dim_grid, threads_per_block,
              charge_gpu);
@@ -4049,7 +3913,7 @@ int ek_neutralize_system(int species) {
   ekfloat compensating_species_density = 0.0f;
 
 #ifndef EK_BOUNDARIES
-  for (int i = 0; i < ek_parameters.number_of_species; i++)
+  for (unsigned i = 0; i < ek_parameters.number_of_species; i++)
     compensating_species_density +=
         ek_parameters.density[i] * ek_parameters.valency[i];
 
@@ -4072,7 +3936,7 @@ int ek_neutralize_system(int species) {
       ek_parameters.density[species_index] -
       (charge / ek_parameters.valency[species_index]) /
           (Utils::int_pow<3>(ek_parameters.agrid) *
-           ekfloat(ek_parameters.number_of_nodes -
+           ekfloat(static_cast<int>(ek_parameters.number_of_nodes) -
                    ek_parameters.number_of_boundary_nodes));
 #endif // EK_BOUNDARIES
 
@@ -4086,23 +3950,21 @@ int ek_neutralize_system(int species) {
 
 int ek_save_checkpoint(char *filename, char *lb_filename) {
   std::ofstream fout(filename, std::ofstream::binary);
-  auto *densities =
-      (ekfloat *)Utils::malloc(ek_parameters.number_of_nodes * sizeof(ekfloat));
+  std::vector<ekfloat> densities(ek_parameters.number_of_nodes);
+  auto const nchars =
+      static_cast<std::streamsize>(densities.size() * sizeof(ekfloat));
 
-  for (int i = 0; i < ek_parameters.number_of_species; i++) {
-    cuda_safe_mem(cudaMemcpy(densities, ek_parameters.rho[i],
-                             ek_parameters.number_of_nodes * sizeof(ekfloat),
+  for (unsigned i = 0; i < ek_parameters.number_of_species; i++) {
+    cuda_safe_mem(cudaMemcpy(densities.data(), ek_parameters.rho[i],
+                             densities.size() * sizeof(ekfloat),
                              cudaMemcpyDeviceToHost));
 
-    if (!fout.write((char *)densities,
-                    sizeof(ekfloat) * ek_parameters.number_of_nodes)) {
-      free(densities);
+    if (!fout.write(reinterpret_cast<char *>(densities.data()), nchars)) {
       fout.close();
       return 1;
     }
   }
 
-  free(densities);
   fout.close();
 
   lb_lbfluid_save_checkpoint(lb_filename, true);
@@ -4113,23 +3975,21 @@ int ek_load_checkpoint(char *filename) {
   std::string fname(filename);
   std::ifstream fin((const char *)(fname + ".ek").c_str(),
                     std::ifstream::binary);
-  auto *densities =
-      (ekfloat *)Utils::malloc(ek_parameters.number_of_nodes * sizeof(ekfloat));
+  std::vector<ekfloat> densities(ek_parameters.number_of_nodes);
+  auto const nchars =
+      static_cast<std::streamsize>(densities.size() * sizeof(ekfloat));
 
-  for (int i = 0; i < ek_parameters.number_of_species; i++) {
-    if (!fin.read((char *)densities,
-                  sizeof(ekfloat) * ek_parameters.number_of_nodes)) {
-      free(densities);
+  for (unsigned i = 0; i < ek_parameters.number_of_species; i++) {
+    if (!fin.read(reinterpret_cast<char *>(densities.data()), nchars)) {
       fin.close();
       return 1;
     }
 
-    cuda_safe_mem(cudaMemcpy(ek_parameters.rho[i], densities,
-                             ek_parameters.number_of_nodes * sizeof(ekfloat),
+    cuda_safe_mem(cudaMemcpy(ek_parameters.rho[i], densities.data(),
+                             densities.size() * sizeof(ekfloat),
                              cudaMemcpyHostToDevice));
   }
 
-  free(densities);
   fin.close();
 
   lb_lbfluid_load_checkpoint((char *)(fname + ".lb").c_str(), true);
diff --git a/src/core/grid_based_algorithms/fd-electrostatics.cuh b/src/core/grid_based_algorithms/fd-electrostatics.cuh
index 24232116441..567e37b5ab9 100644
--- a/src/core/grid_based_algorithms/fd-electrostatics.cuh
+++ b/src/core/grid_based_algorithms/fd-electrostatics.cuh
@@ -21,8 +21,6 @@
 
 #include <cufft.h>
 
-#define PI_FLOAT 3.14159265358979323846f
-
 class FdElectrostatics {
 public:
   struct InputParameters {
@@ -67,12 +65,4 @@ private:
   bool initialized;
 };
 
-// extern __device__ __constant__ FdElectrostatics::Parameters
-// fde_parameters_gpu;
-
-__device__ cufftReal fde_getNode(int x, int y, int z);
-__device__ cufftReal fde_getNode(int i);
-__device__ void fde_setNode(int x, int y, int z, cufftReal value);
-__device__ void fde_setNode(int i, cufftReal value);
-
 #endif
diff --git a/src/core/grid_based_algorithms/fd-electrostatics_cuda.cu b/src/core/grid_based_algorithms/fd-electrostatics_cuda.cu
index 795be9eb98c..9d869142a21 100644
--- a/src/core/grid_based_algorithms/fd-electrostatics_cuda.cu
+++ b/src/core/grid_based_algorithms/fd-electrostatics_cuda.cu
@@ -21,7 +21,9 @@
 
 #include "grid_based_algorithms/fd-electrostatics.cuh"
 
-#include "cuda_utils.hpp"
+#include "cuda_utils.cuh"
+
+#include <utils/constants.hpp>
 
 #include <cuda.h>
 #include <cufft.h>
@@ -34,6 +36,13 @@
 #error CU-file includes mpi.h! This should not happen!
 #endif
 
+static constexpr unsigned int threads_per_block = 64;
+
+__device__ cufftReal fde_getNode(int x, int y, int z);
+__device__ cufftReal fde_getNode(int i);
+__device__ void fde_setNode(int x, int y, int z, cufftReal value);
+__device__ void fde_setNode(int i, cufftReal value);
+
 __global__ void createGreensfcn();
 __global__ void multiplyGreensfcn(cufftComplex *charge_potential);
 
@@ -102,13 +111,10 @@ FdElectrostatics::FdElectrostatics(InputParameters inputParameters,
   cuda_safe_mem(
       cudaMemcpyToSymbol(fde_parameters_gpu, &parameters, sizeof(Parameters)));
 
-  int threads_per_block = 64;
-  int blocks_per_grid_y = 4;
-  int blocks_per_grid_x =
-      (parameters.dim_z * parameters.dim_y * (parameters.dim_x / 2 + 1) +
-       threads_per_block * blocks_per_grid_y - 1) /
-      (threads_per_block * blocks_per_grid_y);
-  dim3 dim_grid = make_uint3(blocks_per_grid_x, blocks_per_grid_y, 1);
+  dim3 dim_grid = calculate_dim_grid(
+      static_cast<unsigned>(parameters.dim_z * parameters.dim_y *
+                            (parameters.dim_x / 2 + 1)),
+      4, threads_per_block);
   KERNELCALL_stream(createGreensfcn, dim_grid, threads_per_block, stream);
 
   /* create 3D FFT plans */
@@ -151,14 +157,15 @@ __global__ void createGreensfcn() {
       // setting 0th Fourier mode to 0 enforces charge neutrality
       fde_parameters_gpu->greensfcn[index] = 0.0f;
     } else {
+      constexpr cufftReal two_pi = 2.0f * Utils::pi<cufftReal>();
       fde_parameters_gpu->greensfcn[index] =
-          -4.0f * PI_FLOAT * fde_parameters_gpu->prefactor *
+          -2.0f * two_pi * fde_parameters_gpu->prefactor *
           fde_parameters_gpu->agrid * fde_parameters_gpu->agrid * 0.5f /
-          (cos(2.0f * PI_FLOAT * static_cast<cufftReal>(coord[0]) /
+          (cos(two_pi * static_cast<cufftReal>(coord[0]) /
                static_cast<cufftReal>(fde_parameters_gpu->dim_x)) +
-           cos(2.0f * PI_FLOAT * static_cast<cufftReal>(coord[1]) /
+           cos(two_pi * static_cast<cufftReal>(coord[1]) /
                static_cast<cufftReal>(fde_parameters_gpu->dim_y)) +
-           cos(2.0f * PI_FLOAT * static_cast<cufftReal>(coord[2]) /
+           cos(two_pi * static_cast<cufftReal>(coord[2]) /
                static_cast<cufftReal>(fde_parameters_gpu->dim_z)) -
            3.0f) /
           static_cast<cufftReal>(fde_parameters_gpu->dim_x *
@@ -193,13 +200,10 @@ void FdElectrostatics::calculatePotential(cufftComplex *charge_potential) {
     fprintf(stderr, "ERROR: Unable to execute FFT plan\n");
   }
 
-  int threads_per_block = 64;
-  int blocks_per_grid_y = 4;
-  int blocks_per_grid_x =
-      (parameters.dim_z * parameters.dim_y * (parameters.dim_x / 2 + 1) +
-       threads_per_block * blocks_per_grid_y - 1) /
-      (threads_per_block * blocks_per_grid_y);
-  dim3 dim_grid = make_uint3(blocks_per_grid_x, blocks_per_grid_y, 1);
+  dim3 dim_grid = calculate_dim_grid(
+      static_cast<unsigned>(parameters.dim_z * parameters.dim_y *
+                            (parameters.dim_x / 2 + 1)),
+      4, threads_per_block);
 
   KERNELCALL(multiplyGreensfcn, dim_grid, threads_per_block, charge_potential);
 
diff --git a/src/core/grid_based_algorithms/halo.cpp b/src/core/grid_based_algorithms/halo.cpp
index a7055e684da..b90be347376 100644
--- a/src/core/grid_based_algorithms/halo.cpp
+++ b/src/core/grid_based_algorithms/halo.cpp
@@ -33,91 +33,43 @@
 #include "halo.hpp"
 
 #include <utils/Vector.hpp>
-#include <utils/memory.hpp>
 
 #include <cstdlib>
 #include <cstring>
+#include <memory>
 
-/** Primitive fieldtypes and their initializers */
-struct _Fieldtype fieldtype_double = {0, nullptr, nullptr, sizeof(double), 0,
-                                      0, 0,       false,   nullptr};
-
-void halo_create_field_vector(int vblocks, int vstride, int vskip,
-                              Fieldtype oldtype, Fieldtype *const newtype) {
-
-  Fieldtype ntype = *newtype = (Fieldtype)Utils::malloc(sizeof(*ntype));
-
-  ntype->subtype = oldtype;
-  ntype->vflag = true;
-
-  ntype->vblocks = vblocks;
-  ntype->vstride = vstride;
-  ntype->vskip = vskip;
-
-  ntype->extent = oldtype->extent * ((vblocks - 1) * vskip + vstride);
-
-  int count = ntype->count = oldtype->count;
-  ntype->lengths = (int *)Utils::malloc(count * 2 * sizeof(int));
-  ntype->disps = (int *)((char *)ntype->lengths + count * sizeof(int));
-
-  for (int i = 0; i < count; i++) {
-    ntype->disps[i] = oldtype->disps[i];
-    ntype->lengths[i] = oldtype->lengths[i];
-  }
-}
-
-void halo_create_field_hvector(int vblocks, int vstride, int vskip,
-                               Fieldtype oldtype, Fieldtype *const newtype) {
-
-  Fieldtype ntype = *newtype = (Fieldtype)Utils::malloc(sizeof(*ntype));
-
-  ntype->subtype = oldtype;
-  ntype->vflag = false;
-
-  ntype->vblocks = vblocks;
-  ntype->vstride = vstride;
-  ntype->vskip = vskip;
-
-  ntype->extent = oldtype->extent * vstride + (vblocks - 1) * vskip;
-
-  int const count = ntype->count = oldtype->count;
-  ntype->lengths = (int *)Utils::malloc(count * 2 * sizeof(int));
-  ntype->disps = (int *)((char *)ntype->lengths + count * sizeof(int));
-
-  for (int i = 0; i < count; i++) {
-    ntype->disps[i] = oldtype->disps[i];
-    ntype->lengths[i] = oldtype->lengths[i];
-  }
-}
+/** Predefined, immutable fieldtype for double-precision LB */
+static const std::shared_ptr<FieldType> fieldtype_double =
+    std::make_shared<FieldType>(static_cast<int>(sizeof(double)));
 
 /** Set halo region to a given value
  * @param[out] dest pointer to the halo buffer
  * @param value integer value to write into the halo buffer
  * @param type halo field layout description
  */
-void halo_dtset(char *dest, int value, Fieldtype type) {
+void halo_dtset(char *dest, int value, std::shared_ptr<FieldType> type) {
   auto const vblocks = type->vblocks;
   auto const vstride = type->vstride;
   auto const vskip = type->vskip;
-  auto const count = type->count;
-  int const *const lens = type->lengths;
-  int const *const disps = type->disps;
+  auto const &lens = type->lengths;
+  auto const &disps = type->disps;
   auto const extent = type->extent;
   auto const block_size = static_cast<long>(vskip) * static_cast<long>(extent);
 
   for (int i = 0; i < vblocks; i++) {
     for (int j = 0; j < vstride; j++) {
-      for (int k = 0; k < count; k++)
+      for (std::size_t k = 0; k < disps.size(); k++)
         memset(dest + disps[k], value, lens[k]);
     }
     dest += block_size;
   }
 }
 
-void halo_dtcopy(char *r_buffer, char *s_buffer, int count, Fieldtype type);
+void halo_dtcopy(char *r_buffer, char *s_buffer, int count,
+                 std::shared_ptr<FieldType> type);
 
-void halo_copy_vector(char *r_buffer, char *s_buffer, int count, Fieldtype type,
-                      bool vflag) {
+void halo_copy_vector(char *r_buffer, char *s_buffer, int count,
+                      std::shared_ptr<FieldType> type, bool vflag) {
 
   auto const vblocks = type->vblocks;
   auto const vstride = type->vstride;
@@ -136,13 +88,14 @@ void halo_copy_vector(char *r_buffer, char *s_buffer, int count, Fieldtype type,
   }
 }
 
-/** Copy lattice data with layout described by fieldtype.
+/** Copy lattice data with layout described by @p type.
  * @param r_buffer data destination
  * @param s_buffer data source
  * @param count    amount of data to copy
  * @param type     field layout type
  */
-void halo_dtcopy(char *r_buffer, char *s_buffer, int count, Fieldtype type) {
+void halo_dtcopy(char *r_buffer, char *s_buffer, int count,
+                 std::shared_ptr<FieldType> type) {
 
   if (type->subtype) {
     halo_copy_vector(r_buffer, s_buffer, count, type, type->vflag);
@@ -162,23 +115,22 @@ void halo_dtcopy(char *r_buffer, char *s_buffer, int count, Fieldtype type) {
   }
 }
 
-void prepare_halo_communication(HaloCommunicator *const hc,
-                                Lattice const *const lattice,
-                                Fieldtype fieldtype, MPI_Datatype datatype,
+void prepare_halo_communication(HaloCommunicator &hc, const Lattice &lattice,
+                                MPI_Datatype datatype,
                                 const Utils::Vector3i &local_node_grid) {
 
-  const auto grid = lattice->grid;
-  const auto period = lattice->halo_grid;
+  const auto &grid = lattice.grid;
+  const auto &period = lattice.halo_grid;
 
-  for (int n = 0; n < hc->num; n++) {
-    MPI_Type_free(&(hc->halo_info[n].datatype));
+  for (int n = 0; n < hc.num; n++) {
+    MPI_Type_free(&(hc.halo_info[n].datatype));
   }
 
   int const num = 2 * 3; /* two communications in each space direction */
-  hc->num = num;
-  hc->halo_info.resize(num);
+  hc.num = num;
+  hc.halo_info.resize(num);
 
-  auto const extent = static_cast<long>(fieldtype->extent);
+  auto const extent = static_cast<long>(fieldtype_double->extent);
 
   auto const node_neighbors = calc_node_neighbors(comm_cart);
 
@@ -186,7 +138,7 @@ void prepare_halo_communication(HaloCommunicator *const hc,
   for (int dir = 0; dir < 3; dir++) {
     for (int lr = 0; lr < 2; lr++) {
 
-      HaloInfo *hinfo = &(hc->halo_info[cnt]);
+      HaloInfo &hinfo = hc.halo_info[cnt];
 
       int nblocks = 1;
       for (int k = dir + 1; k < 3; k++) {
@@ -203,46 +155,46 @@ void prepare_halo_communication(HaloCommunicator *const hc,
 
       if (lr == 0) {
         /* send to left, recv from right */
-        hinfo->s_offset = extent * static_cast<long>(stride * 1);
-        hinfo->r_offset = extent * static_cast<long>(stride * (grid[dir] + 1));
+        hinfo.s_offset = extent * static_cast<long>(stride * 1);
+        hinfo.r_offset = extent * static_cast<long>(stride * (grid[dir] + 1));
       } else {
         /* send to right, recv from left */
-        hinfo->s_offset = extent * static_cast<long>(stride * grid[dir]);
-        hinfo->r_offset = extent * static_cast<long>(stride * 0);
+        hinfo.s_offset = extent * static_cast<long>(stride * grid[dir]);
+        hinfo.r_offset = extent * static_cast<long>(stride * 0);
       }
 
-      hinfo->source_node = node_neighbors[2 * dir + 1 - lr];
-      hinfo->dest_node = node_neighbors[2 * dir + lr];
+      hinfo.source_node = node_neighbors[2 * dir + 1 - lr];
+      hinfo.dest_node = node_neighbors[2 * dir + lr];
 
-      halo_create_field_vector(nblocks, stride, skip, fieldtype,
-                               &hinfo->fieldtype);
+      hinfo.fieldtype = std::make_shared<FieldType>(nblocks, stride, skip, true,
+                                                    fieldtype_double);
 
-      MPI_Type_vector(nblocks, stride, skip, datatype, &hinfo->datatype);
-      MPI_Type_commit(&hinfo->datatype);
+      MPI_Type_vector(nblocks, stride, skip, datatype, &hinfo.datatype);
+      MPI_Type_commit(&hinfo.datatype);
 
       if (!box_geo.periodic(dir) &&
           (local_geo.boundary()[2 * dir + lr] != 0 ||
            local_geo.boundary()[2 * dir + 1 - lr] != 0)) {
         if (local_node_grid[dir] == 1) {
-          hinfo->type = HALO_OPEN;
+          hinfo.type = HALO_OPEN;
         } else if (lr == 0) {
           if (local_geo.boundary()[2 * dir + lr] == 1) {
-            hinfo->type = HALO_RECV;
+            hinfo.type = HALO_RECV;
           } else {
-            hinfo->type = HALO_SEND;
+            hinfo.type = HALO_SEND;
           }
         } else {
           if (local_geo.boundary()[2 * dir + lr] == -1) {
-            hinfo->type = HALO_RECV;
+            hinfo.type = HALO_RECV;
           } else {
-            hinfo->type = HALO_SEND;
+            hinfo.type = HALO_SEND;
           }
         }
       } else {
         if (local_node_grid[dir] == 1) {
-          hc->halo_info[cnt].type = HALO_LOCL;
+          hc.halo_info[cnt].type = HALO_LOCL;
         } else {
-          hc->halo_info[cnt].type = HALO_SENDRECV;
+          hc.halo_info[cnt].type = HALO_SENDRECV;
         }
       }
       cnt++;
@@ -250,44 +202,44 @@ void prepare_halo_communication(HaloCommunicator *const hc,
   }
 }
 
-void release_halo_communication(HaloCommunicator *const hc) {
-  for (int n = 0; n < hc->num; n++) {
-    MPI_Type_free(&(hc->halo_info[n].datatype));
+void release_halo_communication(HaloCommunicator &hc) {
+  for (int n = 0; n < hc.num; n++) {
+    MPI_Type_free(&(hc.halo_info[n].datatype));
   }
 }
 
-void halo_communication(HaloCommunicator const *const hc, char *const base) {
+void halo_communication(const HaloCommunicator &hc, char *const base) {
 
-  Fieldtype fieldtype;
+  std::shared_ptr<FieldType> fieldtype;
   MPI_Datatype datatype;
   MPI_Request request;
   MPI_Status status;
 
-  for (int n = 0; n < hc->num; n++) {
+  for (int n = 0; n < hc.num; n++) {
     int s_node, r_node;
-    int comm_type = hc->halo_info[n].type;
-    char *s_buffer = (char *)base + hc->halo_info[n].s_offset;
-    char *r_buffer = (char *)base + hc->halo_info[n].r_offset;
+    int comm_type = hc.halo_info[n].type;
+    char *s_buffer = (char *)base + hc.halo_info[n].s_offset;
+    char *r_buffer = (char *)base + hc.halo_info[n].r_offset;
 
     switch (comm_type) {
 
     case HALO_LOCL:
-      fieldtype = hc->halo_info[n].fieldtype;
+      fieldtype = hc.halo_info[n].fieldtype;
       halo_dtcopy(r_buffer, s_buffer, 1, fieldtype);
       break;
 
     case HALO_SENDRECV:
-      datatype = hc->halo_info[n].datatype;
-      s_node = hc->halo_info[n].source_node;
-      r_node = hc->halo_info[n].dest_node;
+      datatype = hc.halo_info[n].datatype;
+      s_node = hc.halo_info[n].source_node;
+      r_node = hc.halo_info[n].dest_node;
       MPI_Sendrecv(s_buffer, 1, datatype, r_node, REQ_HALO_SPREAD, r_buffer, 1,
                    datatype, s_node, REQ_HALO_SPREAD, comm_cart, &status);
       break;
 
     case HALO_SEND:
-      datatype = hc->halo_info[n].datatype;
-      fieldtype = hc->halo_info[n].fieldtype;
-      r_node = hc->halo_info[n].dest_node;
+      datatype = hc.halo_info[n].datatype;
+      fieldtype = hc.halo_info[n].fieldtype;
+      r_node = hc.halo_info[n].dest_node;
       MPI_Isend(s_buffer, 1, datatype, r_node, REQ_HALO_SPREAD, comm_cart,
                 &request);
       halo_dtset(r_buffer, 0, fieldtype);
@@ -295,15 +247,15 @@ void halo_communication(HaloCommunicator const *const hc, char *const base) {
       break;
 
     case HALO_RECV:
-      datatype = hc->halo_info[n].datatype;
-      s_node = hc->halo_info[n].source_node;
+      datatype = hc.halo_info[n].datatype;
+      s_node = hc.halo_info[n].source_node;
       MPI_Irecv(r_buffer, 1, datatype, s_node, REQ_HALO_SPREAD, comm_cart,
                 &request);
       MPI_Wait(&request, &status);
       break;
 
     case HALO_OPEN:
-      fieldtype = hc->halo_info[n].fieldtype;
+      fieldtype = hc.halo_info[n].fieldtype;
       /** \todo this does not work for the n_i - \<n_i\> */
       halo_dtset(r_buffer, 0, fieldtype);
       break;
diff --git a/src/core/grid_based_algorithms/halo.hpp b/src/core/grid_based_algorithms/halo.hpp
index 193ee2c6ea4..4e0b8d39a96 100644
--- a/src/core/grid_based_algorithms/halo.hpp
+++ b/src/core/grid_based_algorithms/halo.hpp
@@ -34,6 +34,7 @@
 
 #include <mpi.h>
 
+#include <memory>
 #include <vector>
 
 /** \name Types of halo communications */
@@ -55,26 +56,33 @@
 
 /** Layout of the lattice data.
  *  The description is similar to MPI datatypes but a bit more compact.
- *  See \ref halo_create_field_vector and \ref
- *  halo_dtcopy to understand how it works.
  */
-typedef struct _Fieldtype *Fieldtype;
-struct _Fieldtype {
-  int count;    /**< number of subtypes in fieldtype */
-  int *disps;   /**< displacements of the subtypes */
-  int *lengths; /**< lengths of the subtypes */
-  int extent;   /**< extent of the complete fieldtype including gaps */
-  int vblocks;  /**< number of blocks in field vectors */
-  int vstride;  /**< size of strides in field vectors */
-  int vskip;    /**< displacement between strides in field vectors */
+struct FieldType {
+  explicit FieldType(int new_extent)
+      : count(0), disps(), lengths(), extent(new_extent), vblocks(0),
+        vstride(0), vskip(0), vflag(false), subtype(nullptr) {}
+  FieldType(int new_vblocks, int new_vstride, int new_vskip, bool new_vflag,
+            std::shared_ptr<FieldType> oldtype)
+      : count(oldtype->count), disps(oldtype->disps), lengths(oldtype->lengths),
+        extent(0), vblocks(new_vblocks), vstride(new_vstride), vskip(new_vskip),
+        vflag(new_vflag), subtype(oldtype) {
+    if (vflag) {
+      extent = oldtype->extent * ((vblocks - 1) * vskip + vstride);
+    } else {
+      extent = oldtype->extent * vstride + (vblocks - 1) * vskip;
+    }
+  }
+  int count;                /**< number of subtypes in fieldtype */
+  std::vector<int> disps;   /**< displacements of the subtypes */
+  std::vector<int> lengths; /**< lengths of the subtypes */
+  int extent;  /**< extent of the complete fieldtype including gaps */
+  int vblocks; /**< number of blocks in field vectors */
+  int vstride; /**< size of strides in field vectors */
+  int vskip;   /**< displacement between strides in field vectors */
   bool vflag;
-  Fieldtype subtype;
+  std::shared_ptr<FieldType> subtype; /**< type this layout is built from */
 };
 
-/** Predefined fieldtypes */
-extern struct _Fieldtype fieldtype_double;
-#define FIELDTYPE_DOUBLE (&fieldtype_double)
-
 /** Structure describing a Halo region */
 typedef struct {
 
@@ -86,7 +94,8 @@ typedef struct {
   unsigned long s_offset; /**< offset for send buffer */
   unsigned long r_offset; /**< offset for receive buffer */
 
-  Fieldtype fieldtype;   /**< type layout of the data being exchanged */
+  std::shared_ptr<FieldType>
+      fieldtype;         /**< type layout of the data being exchanged */
   MPI_Datatype datatype; /**< MPI datatype of data being communicated */
 
 } HaloInfo;
@@ -102,40 +111,27 @@ class HaloCommunicator {
   std::vector<HaloInfo> halo_info; /**< set of halo communications */
 };
 
-/** Creates a field vector layout
- *  @param vblocks       number of vector blocks
- *  @param vstride       size of strides in field vector
- *  @param vskip         displacements of strides in field vector
- *  @param oldtype       fieldtype the vector is composed of
- *  @param[out] newtype  newly created fieldtype
- */
-void halo_create_field_vector(int vblocks, int vstride, int vskip,
-                              Fieldtype oldtype, Fieldtype *newtype);
-void halo_create_field_hvector(int vblocks, int vstride, int vskip,
-                               Fieldtype oldtype, Fieldtype *newtype);
-
 /** Preparation of the halo parallelization scheme. Sets up the
  *  necessary data structures for \ref halo_communication
  *  @param[in,out] hc       halo communicator being created
  *  @param[in]     lattice  lattice the communication is created for
- *  @param fieldtype        field layout of the lattice data
  *  @param datatype         MPI datatype for the lattice data
  *  @param local_node_grid  Number of nodes in each spatial dimension
  */
-void prepare_halo_communication(HaloCommunicator *hc, Lattice const *lattice,
-                                Fieldtype fieldtype, MPI_Datatype datatype,
+void prepare_halo_communication(HaloCommunicator &hc, const Lattice &lattice,
+                                MPI_Datatype datatype,
                                 const Utils::Vector3i &local_node_grid);
 
 /** Frees data structures associated with a halo communicator
  *  @param[in,out] hc  halo communicator to be released
  */
-void release_halo_communication(HaloCommunicator *hc);
+void release_halo_communication(HaloCommunicator &hc);
 
 /** Perform communication according to the parallelization scheme
  *  described by the halo communicator
  *  @param[in]  hc    halo communicator describing the parallelization scheme
  *  @param[in]  base  base plane of local node
  */
-void halo_communication(HaloCommunicator const *hc, char *base);
+void halo_communication(const HaloCommunicator &hc, char *base);
 
 #endif /* HALO_H */
diff --git a/src/core/grid_based_algorithms/lb.cpp b/src/core/grid_based_algorithms/lb.cpp
index 1a1b3acd717..6dc4edf4040 100644
--- a/src/core/grid_based_algorithms/lb.cpp
+++ b/src/core/grid_based_algorithms/lb.cpp
@@ -45,7 +45,6 @@
 #include <utils/index.hpp>
 #include <utils/math/matrix_vector_product.hpp>
 #include <utils/math/sqr.hpp>
-#include <utils/memory.hpp>
 #include <utils/uniform.hpp>
 
 #include <Random123/philox.h>
@@ -64,6 +63,7 @@
 #include <cstring>
 #include <functional>
 #include <iostream>
+#include <memory>
 #include <stdexcept>
 
 using Utils::get_linear_index;
@@ -176,11 +176,11 @@ using LB_FluidData = boost::multi_array<double, 2>;
 static LB_FluidData lbfluid_a;
 static LB_FluidData lbfluid_b;
 
-/** Pointer to the velocity populations of the fluid.
- *  lbfluid contains pre-collision populations, lbfluid_post
- *  contains post-collision.
+/** Span of the pre-collision velocity populations of the fluid.
  */
 LB_Fluid lbfluid;
+/** Span of the post-collision velocity populations of the fluid.
+ */
 LB_Fluid lbfluid_post;
 
 std::vector<LB_FluidNode> lbfields;
@@ -653,21 +653,20 @@ void lb_prepare_communication(HaloCommunicator &halo_comm,
    * datatypes */
 
   /* prepare the communication for a single velocity */
-  prepare_halo_communication(&comm, &lb_lattice, FIELDTYPE_DOUBLE, MPI_DOUBLE,
-                             node_grid);
+  prepare_halo_communication(comm, lb_lattice, MPI_DOUBLE, node_grid);
 
   halo_comm.num = comm.num;
   halo_comm.halo_info.resize(comm.num);
 
   /* replicate the halo structure */
   for (int i = 0; i < comm.num; i++) {
-    HaloInfo *hinfo = &(halo_comm.halo_info[i]);
+    HaloInfo &hinfo = halo_comm.halo_info[i];
 
-    hinfo->source_node = comm.halo_info[i].source_node;
-    hinfo->dest_node = comm.halo_info[i].dest_node;
-    hinfo->s_offset = comm.halo_info[i].s_offset;
-    hinfo->r_offset = comm.halo_info[i].r_offset;
-    hinfo->type = comm.halo_info[i].type;
+    hinfo.source_node = comm.halo_info[i].source_node;
+    hinfo.dest_node = comm.halo_info[i].dest_node;
+    hinfo.s_offset = comm.halo_info[i].s_offset;
+    hinfo.r_offset = comm.halo_info[i].r_offset;
+    hinfo.type = comm.halo_info[i].type;
 
     /* generate the vector datatype for the structure of lattices we
      * have to use hvector here because the extent of the subtypes
@@ -679,16 +678,16 @@ void lb_prepare_communication(HaloCommunicator &halo_comm,
     MPI_Type_get_extent(MPI_DOUBLE, &lower, &extent);
     MPI_Type_create_hvector(D3Q19::n_vel, 1,
                             lb_lattice.halo_grid_volume * extent,
-                            comm.halo_info[i].datatype, &hinfo->datatype);
-    MPI_Type_commit(&hinfo->datatype);
+                            comm.halo_info[i].datatype, &hinfo.datatype);
+    MPI_Type_commit(&hinfo.datatype);
 
-    halo_create_field_hvector(
+    hinfo.fieldtype = std::make_shared<FieldType>(
         D3Q19::n_vel, 1,
-        static_cast<int>(lb_lattice.halo_grid_volume * sizeof(double)),
-        comm.halo_info[i].fieldtype, &hinfo->fieldtype);
+        static_cast<int>(lb_lattice.halo_grid_volume * sizeof(double)), false,
+        comm.halo_info[i].fieldtype);
   }
 
-  release_halo_communication(&comm);
+  release_halo_communication(comm);
 }
 
 /***********************************************************************/
@@ -743,7 +742,6 @@ void lb_set_population_from_density_momentum_density_stress(
 }
 /**@}*/
 
-/** Calculation of hydrodynamic modes */
 std::array<double, 19> lb_calc_modes(Lattice::index_t index,
                                      const LB_Fluid &lb_fluid) {
   return Utils::matrix_vector_product<double, 19, e_ki>(
@@ -895,10 +893,10 @@ auto lb_next_offsets(const Lattice &lb_lattice,
 }
 
 template <typename T>
-void lb_stream(LB_Fluid &lbfluid, const std::array<T, 19> &populations,
+void lb_stream(LB_Fluid &lb_fluid, const std::array<T, 19> &populations,
                size_t index, std::array<ptrdiff_t, 19> const &offsets) {
   for (int i = 0; i < populations.size(); i++) {
-    lbfluid[i][index + offsets[i]] = populations[i];
+    lb_fluid[i][index + offsets[i]] = populations[i];
   }
 }
 
@@ -971,7 +969,7 @@ void lb_collide_stream() {
   /* swap the pointers for old and new population fields */
   std::swap(lbfluid, lbfluid_post);
 
-  halo_communication(&update_halo_comm,
+  halo_communication(update_halo_comm,
                      reinterpret_cast<char *>(lbfluid[0].data()));
 
 #ifdef ADDITIONAL_CHECKS
@@ -1000,31 +998,25 @@ void lattice_boltzmann_update() {
 /** \name Coupling part */
 /***********************************************************************/
 /**@{*/
-
-static int compare_buffers(double *buf1, double *buf2, int size) {
-  int ret;
-  if (memcmp(buf1, buf2, size) != 0) {
+#ifdef ADDITIONAL_CHECKS
+template <class T> int compare_buffers(T const &buff_a, T const &buff_b) {
+  if (buff_a != buff_b) {
     runtimeErrorMsg() << "Halo buffers are not identical";
-    ret = 1;
-  } else {
-    ret = 0;
+    return ES_ERROR;
   }
-  return ret;
+  return ES_OK;
 }
 
-#ifdef ADDITIONAL_CHECKS
 /** Check consistency of the halo regions.
  *  Test whether the halo regions have been exchanged correctly.
  */
 void lb_check_halo_regions(const LB_Fluid &lb_fluid,
                            const Lattice &lb_lattice) {
   Lattice::index_t index;
-  int i, x, y, z, s_node, r_node, count = D3Q19::n_vel;
-  double *s_buffer, *r_buffer;
-  MPI_Status status[2];
-
-  r_buffer = (double *)Utils::malloc(count * sizeof(double));
-  s_buffer = (double *)Utils::malloc(count * sizeof(double));
+  std::size_t i;
+  int x, y, z, s_node, r_node;
+  std::array<double, D3Q19::n_vel> s_buffer;
+  std::array<double, D3Q19::n_vel> r_buffer;
 
   auto const node_neighbors = calc_node_neighbors(comm_cart);
 
@@ -1038,22 +1030,19 @@ void lb_check_halo_regions(const LB_Fluid &lb_fluid,
         s_node = node_neighbors[1];
         r_node = node_neighbors[0];
         if (n_nodes > 1) {
-          MPI_Sendrecv(s_buffer, count, MPI_DOUBLE, r_node, REQ_HALO_CHECK,
-                       r_buffer, count, MPI_DOUBLE, s_node, REQ_HALO_CHECK,
-                       comm_cart, status);
+          comm_cart.sendrecv(r_node, REQ_HALO_CHECK, s_buffer, s_node,
+                             REQ_HALO_CHECK, r_buffer);
           index =
               get_linear_index(lb_lattice.grid[0], y, z, lb_lattice.halo_grid);
           for (i = 0; i < D3Q19::n_vel; i++)
             s_buffer[i] = lb_fluid[i][index];
-          compare_buffers(s_buffer, r_buffer,
-                          count * static_cast<int>(sizeof(double)));
+          compare_buffers(s_buffer, r_buffer);
         } else {
           index =
               get_linear_index(lb_lattice.grid[0], y, z, lb_lattice.halo_grid);
           for (i = 0; i < D3Q19::n_vel; i++)
             r_buffer[i] = lb_fluid[i][index];
-          if (compare_buffers(s_buffer, r_buffer,
-                              count * static_cast<int>(sizeof(double)))) {
+          if (compare_buffers(s_buffer, r_buffer)) {
             std::cerr << "buffers differ in dir=" << 0 << " at index=" << index
                       << " y=" << y << " z=" << z << "\n";
           }
@@ -1067,20 +1056,17 @@ void lb_check_halo_regions(const LB_Fluid &lb_fluid,
         s_node = node_neighbors[0];
         r_node = node_neighbors[1];
         if (n_nodes > 1) {
-          MPI_Sendrecv(s_buffer, count, MPI_DOUBLE, r_node, REQ_HALO_CHECK,
-                       r_buffer, count, MPI_DOUBLE, s_node, REQ_HALO_CHECK,
-                       comm_cart, status);
+          comm_cart.sendrecv(r_node, REQ_HALO_CHECK, s_buffer, s_node,
+                             REQ_HALO_CHECK, r_buffer);
           index = get_linear_index(1, y, z, lb_lattice.halo_grid);
           for (i = 0; i < D3Q19::n_vel; i++)
             s_buffer[i] = lb_fluid[i][index];
-          compare_buffers(s_buffer, r_buffer,
-                          count * static_cast<int>(sizeof(double)));
+          compare_buffers(s_buffer, r_buffer);
         } else {
           index = get_linear_index(1, y, z, lb_lattice.halo_grid);
           for (i = 0; i < D3Q19::n_vel; i++)
             r_buffer[i] = lb_fluid[i][index];
-          if (compare_buffers(s_buffer, r_buffer,
-                              count * static_cast<int>(sizeof(double)))) {
+          if (compare_buffers(s_buffer, r_buffer)) {
             std::cerr << "buffers differ in dir=0 at index=" << index
                       << " y=" << y << " z=" << z << "\n";
           }
@@ -1099,22 +1085,19 @@ void lb_check_halo_regions(const LB_Fluid &lb_fluid,
         s_node = node_neighbors[3];
         r_node = node_neighbors[2];
         if (n_nodes > 1) {
-          MPI_Sendrecv(s_buffer, count, MPI_DOUBLE, r_node, REQ_HALO_CHECK,
-                       r_buffer, count, MPI_DOUBLE, s_node, REQ_HALO_CHECK,
-                       comm_cart, status);
+          comm_cart.sendrecv(r_node, REQ_HALO_CHECK, s_buffer, s_node,
+                             REQ_HALO_CHECK, r_buffer);
           index =
               get_linear_index(x, lb_lattice.grid[1], z, lb_lattice.halo_grid);
           for (i = 0; i < D3Q19::n_vel; i++)
             s_buffer[i] = lb_fluid[i][index];
-          compare_buffers(s_buffer, r_buffer,
-                          count * static_cast<int>(sizeof(double)));
+          compare_buffers(s_buffer, r_buffer);
         } else {
           index =
               get_linear_index(x, lb_lattice.grid[1], z, lb_lattice.halo_grid);
           for (i = 0; i < D3Q19::n_vel; i++)
             r_buffer[i] = lb_fluid[i][index];
-          if (compare_buffers(s_buffer, r_buffer,
-                              count * static_cast<int>(sizeof(double)))) {
+          if (compare_buffers(s_buffer, r_buffer)) {
             std::cerr << "buffers differ in dir=1 at index=" << index
                       << " x=" << x << " z=" << z << "\n";
           }
@@ -1129,20 +1112,17 @@ void lb_check_halo_regions(const LB_Fluid &lb_fluid,
         s_node = node_neighbors[2];
         r_node = node_neighbors[3];
         if (n_nodes > 1) {
-          MPI_Sendrecv(s_buffer, count, MPI_DOUBLE, r_node, REQ_HALO_CHECK,
-                       r_buffer, count, MPI_DOUBLE, s_node, REQ_HALO_CHECK,
-                       comm_cart, status);
+          comm_cart.sendrecv(r_node, REQ_HALO_CHECK, s_buffer, s_node,
+                             REQ_HALO_CHECK, r_buffer);
           index = get_linear_index(x, 1, z, lb_lattice.halo_grid);
           for (i = 0; i < D3Q19::n_vel; i++)
             s_buffer[i] = lb_fluid[i][index];
-          compare_buffers(s_buffer, r_buffer,
-                          count * static_cast<int>(sizeof(double)));
+          compare_buffers(s_buffer, r_buffer);
         } else {
           index = get_linear_index(x, 1, z, lb_lattice.halo_grid);
           for (i = 0; i < D3Q19::n_vel; i++)
             r_buffer[i] = lb_fluid[i][index];
-          if (compare_buffers(s_buffer, r_buffer,
-                              count * static_cast<int>(sizeof(double)))) {
+          if (compare_buffers(s_buffer, r_buffer)) {
             std::cerr << "buffers differ in dir=1 at index=" << index
                       << " x=" << x << " z=" << z << "\n";
           }
@@ -1161,22 +1141,19 @@ void lb_check_halo_regions(const LB_Fluid &lb_fluid,
         s_node = node_neighbors[5];
         r_node = node_neighbors[4];
         if (n_nodes > 1) {
-          MPI_Sendrecv(s_buffer, count, MPI_DOUBLE, r_node, REQ_HALO_CHECK,
-                       r_buffer, count, MPI_DOUBLE, s_node, REQ_HALO_CHECK,
-                       comm_cart, status);
+          comm_cart.sendrecv(r_node, REQ_HALO_CHECK, s_buffer, s_node,
+                             REQ_HALO_CHECK, r_buffer);
           index =
               get_linear_index(x, y, lb_lattice.grid[2], lb_lattice.halo_grid);
           for (i = 0; i < D3Q19::n_vel; i++)
             s_buffer[i] = lb_fluid[i][index];
-          compare_buffers(s_buffer, r_buffer,
-                          count * static_cast<int>(sizeof(double)));
+          compare_buffers(s_buffer, r_buffer);
         } else {
           index =
               get_linear_index(x, y, lb_lattice.grid[2], lb_lattice.halo_grid);
           for (i = 0; i < D3Q19::n_vel; i++)
             r_buffer[i] = lb_fluid[i][index];
-          if (compare_buffers(s_buffer, r_buffer,
-                              count * static_cast<int>(sizeof(double)))) {
+          if (compare_buffers(s_buffer, r_buffer)) {
             std::cerr << "buffers differ in dir=2 at index=" << index
                       << " x=" << x << " y=" << y << " z=" << lb_lattice.grid[2]
                       << "\n";
@@ -1194,20 +1171,17 @@ void lb_check_halo_regions(const LB_Fluid &lb_fluid,
         s_node = node_neighbors[4];
         r_node = node_neighbors[5];
         if (n_nodes > 1) {
-          MPI_Sendrecv(s_buffer, count, MPI_DOUBLE, r_node, REQ_HALO_CHECK,
-                       r_buffer, count, MPI_DOUBLE, s_node, REQ_HALO_CHECK,
-                       comm_cart, status);
+          comm_cart.sendrecv(r_node, REQ_HALO_CHECK, s_buffer, s_node,
+                             REQ_HALO_CHECK, r_buffer);
           index = get_linear_index(x, y, 1, lb_lattice.halo_grid);
           for (i = 0; i < D3Q19::n_vel; i++)
             s_buffer[i] = lb_fluid[i][index];
-          compare_buffers(s_buffer, r_buffer,
-                          count * static_cast<int>(sizeof(double)));
+          compare_buffers(s_buffer, r_buffer);
         } else {
           index = get_linear_index(x, y, 1, lb_lattice.halo_grid);
           for (i = 0; i < D3Q19::n_vel; i++)
             r_buffer[i] = lb_fluid[i][index];
-          if (compare_buffers(s_buffer, r_buffer,
-                              count * static_cast<int>(sizeof(double)))) {
+          if (compare_buffers(s_buffer, r_buffer)) {
             std::cerr << "buffers differ in dir=2 at index=" << index
                       << " x=" << x << " y=" << y << "\n";
           }
@@ -1215,9 +1189,6 @@ void lb_check_halo_regions(const LB_Fluid &lb_fluid,
       }
     }
   }
-
-  free(r_buffer);
-  free(s_buffer);
 }
 #endif // ADDITIONAL_CHECKS
 
@@ -1342,7 +1313,8 @@ void lb_bounce_back(LB_Fluid &lb_fluid, const LB_Parameters &lb_parameters,
 
 /** Calculate the local fluid momentum.
  *  The calculation is implemented explicitly for the special case of D3Q19.
- *  @param[in]  index  Local lattice site
+ *  @param[in]  index     Local lattice site
+ *  @param[in]  lb_fluid  Populations of the fluid
  *  @retval The local fluid momentum.
  */
 Utils::Vector3d lb_calc_local_momentum_density(Lattice::index_t index,
@@ -1361,9 +1333,11 @@ Utils::Vector3d lb_calc_local_momentum_density(Lattice::index_t index,
                lb_fluid[18][index]}};
 }
 
-// Statistics in MD units.
 /** Calculate momentum of the LB fluid.
- * \param result Fluid momentum
+ *  @param[out] result         Fluid momentum in MD units
+ *  @param[in]  lb_parameters  LB parameters
+ *  @param[in]  lb_fields      Hydrodynamic fields of the fluid
+ *  @param[in]  lb_lattice     The underlying lattice
  */
 void lb_calc_fluid_momentum(double *result, const LB_Parameters &lb_parameters,
                             const std::vector<LB_FluidNode> &lb_fields,
diff --git a/src/core/grid_based_algorithms/lb.hpp b/src/core/grid_based_algorithms/lb.hpp
index f1dcb52894b..5089ec99499 100644
--- a/src/core/grid_based_algorithms/lb.hpp
+++ b/src/core/grid_based_algorithms/lb.hpp
@@ -144,10 +144,7 @@ void lb_reinit_fluid(std::vector<LB_FluidNode> &lb_fields,
                      const LB_Parameters &lb_parameters);
 
 void lb_reinit_parameters(LB_Parameters &lb_parameters);
-/** Pointer to the velocity populations of the fluid.
- *  lbfluid contains pre-collision populations, lbfluid_post
- *  contains post-collision populations
- */
+
 using LB_Fluid = std::array<Utils::Span<double>, 19>;
 extern LB_Fluid lbfluid;
 
@@ -172,7 +169,7 @@ template <std::size_t I> auto get(const LB_Fluid_Ref &lb_fluid) {
 
 } // namespace Utils
 
-/** Pointer to the hydrodynamic fields of the fluid */
+/** Hydrodynamic fields of the fluid */
 extern std::vector<LB_FluidNode> lbfields;
 
 /************************************************************/
@@ -213,7 +210,8 @@ Utils::Vector6d lb_calc_pressure_tensor(std::array<double, 19> const &modes,
 
 /** Calculation of hydrodynamic modes.
  *
- *  @param index number of the node to calculate the modes for
+ *  @param[in]  index     Number of the node to calculate the modes for
+ *  @param[in]  lb_fluid  Populations of the fluid
  *  @retval Array containing the modes.
  */
 std::array<double, 19> lb_calc_modes(Lattice::index_t index,
diff --git a/src/core/grid_based_algorithms/lb_boundaries.cpp b/src/core/grid_based_algorithms/lb_boundaries.cpp
index 1df5ca9e854..3f7338ef2b3 100644
--- a/src/core/grid_based_algorithms/lb_boundaries.cpp
+++ b/src/core/grid_based_algorithms/lb_boundaries.cpp
@@ -161,7 +161,7 @@ void lb_init_boundaries() {
       return;
     }
     ek_init_boundaries();
-    int number_of_boundnodes = 0;
+    unsigned number_of_boundnodes = 0;
     std::vector<int> host_boundary_node_list;
     std::vector<int> host_boundary_index_list;
     size_t size_of_index;
@@ -277,7 +277,7 @@ Utils::Vector3d lbboundary_get_force(LBBoundary const *lbb) {
   std::vector<double> forces(3 * lbboundaries.size());
   if (lattice_switch == ActiveLB::GPU) {
 #if defined(LB_BOUNDARIES_GPU) && defined(CUDA)
-    lb_gpu_get_boundary_forces(forces.data());
+    lb_gpu_get_boundary_forces(forces);
 #endif
   } else if (lattice_switch == ActiveLB::CPU) {
 #if defined(LB_BOUNDARIES)
diff --git a/src/core/grid_based_algorithms/lb_collective_interface.cpp b/src/core/grid_based_algorithms/lb_collective_interface.cpp
index f800bec579a..a2e1cb8e13c 100644
--- a/src/core/grid_based_algorithms/lb_collective_interface.cpp
+++ b/src/core/grid_based_algorithms/lb_collective_interface.cpp
@@ -81,11 +81,25 @@ mpi_lb_get_interpolated_velocity(Utils::Vector3d const &pos) {
 
 REGISTER_CALLBACK_ONE_RANK(mpi_lb_get_interpolated_velocity)
 
+/** Interpolated fluid density at @p pos.
+ *  Forwards to @ref lb_lbinterpolation_get_interpolated_density through
+ *  @c detail::lb_calc_for_pos and is registered as a one-rank callback.
+ *  @param[in] pos  Position at which the density is evaluated
+ */
+boost::optional<double>
+mpi_lb_get_interpolated_density(Utils::Vector3d const &pos) {
+  return detail::lb_calc_for_pos(pos, [&](auto const &p) {
+    return lb_lbinterpolation_get_interpolated_density(p);
+  });
+}
+
+REGISTER_CALLBACK_ONE_RANK(mpi_lb_get_interpolated_density)
+
 auto mpi_lb_get_density(Utils::Vector3i const &index) {
-  return detail::lb_calc_fluid_kernel(index,
-                                      [&](auto modes, auto force_density) {
-                                        return lb_calc_density(modes, lbpar);
-                                      });
+  return detail::lb_calc_fluid_kernel(
+      index, [&](auto const &modes, auto const &force_density) {
+        return lb_calc_density(modes, lbpar);
+      });
 }
 
 REGISTER_CALLBACK_ONE_RANK(mpi_lb_get_density)
@@ -138,7 +147,7 @@ REGISTER_CALLBACK(mpi_lb_set_force_density)
 
 auto mpi_lb_get_momentum_density(Utils::Vector3i const &index) {
   return detail::lb_calc_fluid_kernel(
-      index, [&](auto modes, auto force_density) {
+      index, [&](auto const &modes, auto const &force_density) {
         return lb_calc_momentum_density(modes, force_density);
       });
 }
@@ -147,7 +156,7 @@ REGISTER_CALLBACK_ONE_RANK(mpi_lb_get_momentum_density)
 
 auto mpi_lb_get_pressure_tensor(Utils::Vector3i const &index) {
   return detail::lb_calc_fluid_kernel(
-      index, [&](auto modes, auto force_density) {
+      index, [&](auto const &modes, auto const &force_density) {
         return lb_calc_pressure_tensor(modes, force_density, lbpar);
       });
 }
diff --git a/src/core/grid_based_algorithms/lb_collective_interface.hpp b/src/core/grid_based_algorithms/lb_collective_interface.hpp
index 1c9afa8fb31..4b6b0272ab4 100644
--- a/src/core/grid_based_algorithms/lb_collective_interface.hpp
+++ b/src/core/grid_based_algorithms/lb_collective_interface.hpp
@@ -27,6 +27,8 @@
 /* collective getter functions */
 boost::optional<Utils::Vector3d>
 mpi_lb_get_interpolated_velocity(Utils::Vector3d const &pos);
+boost::optional<double>
+mpi_lb_get_interpolated_density(Utils::Vector3d const &pos);
 boost::optional<double> mpi_lb_get_density(Utils::Vector3i const &index);
 boost::optional<Utils::Vector19d>
 mpi_lb_get_populations(Utils::Vector3i const &index);
diff --git a/src/core/grid_based_algorithms/lb_interface.cpp b/src/core/grid_based_algorithms/lb_interface.cpp
index a48da0bda33..3fabd382a99 100644
--- a/src/core/grid_based_algorithms/lb_interface.cpp
+++ b/src/core/grid_based_algorithms/lb_interface.cpp
@@ -125,7 +125,7 @@ void lb_lbfluid_sanity_checks() {
 void lb_lbfluid_on_integration_start() {
   lb_lbfluid_sanity_checks();
   if (lattice_switch == ActiveLB::CPU) {
-    halo_communication(&update_halo_comm,
+    halo_communication(update_halo_comm,
                        reinterpret_cast<char *>(lbfluid[0].data()));
   }
 }
@@ -1000,8 +1000,7 @@ bool lb_lbnode_is_index_valid(Utils::Vector3i const &ind) {
 double lb_lbnode_get_density(const Utils::Vector3i &ind) {
   if (lattice_switch == ActiveLB::GPU) {
 #ifdef CUDA
-    auto const single_nodeindex = ind[0] + ind[1] * lbpar_gpu.dim_x +
-                                  ind[2] * lbpar_gpu.dim_x * lbpar_gpu.dim_y;
+    auto const single_nodeindex = calculate_node_index(lbpar_gpu, ind);
     static LB_rho_v_pi_gpu host_print_values;
     lb_print_node_GPU(single_nodeindex, &host_print_values);
     return host_print_values.rho;
@@ -1020,8 +1019,7 @@ const Utils::Vector3d lb_lbnode_get_velocity(const Utils::Vector3i &ind) {
   if (lattice_switch == ActiveLB::GPU) {
 #ifdef CUDA
     static LB_rho_v_pi_gpu host_print_values;
-    auto const single_nodeindex = ind[0] + ind[1] * lbpar_gpu.dim_x +
-                                  ind[2] * lbpar_gpu.dim_x * lbpar_gpu.dim_y;
+    auto const single_nodeindex = calculate_node_index(lbpar_gpu, ind);
     lb_print_node_GPU(single_nodeindex, &host_print_values);
     return {static_cast<double>(host_print_values.v[0]),
             static_cast<double>(host_print_values.v[1]),
@@ -1057,8 +1055,7 @@ lb_lbnode_get_pressure_tensor_neq(const Utils::Vector3i &ind) {
 #ifdef CUDA
     Utils::Vector6d tensor{};
     static LB_rho_v_pi_gpu host_print_values;
-    auto const single_nodeindex = ind[0] + ind[1] * lbpar_gpu.dim_x +
-                                  ind[2] * lbpar_gpu.dim_x * lbpar_gpu.dim_y;
+    auto const single_nodeindex = calculate_node_index(lbpar_gpu, ind);
     lb_print_node_GPU(single_nodeindex, &host_print_values);
     for (int i = 0; i < 6; i++) {
       tensor[i] = static_cast<double>(host_print_values.pi[i]);
@@ -1123,8 +1120,7 @@ int lb_lbnode_get_boundary(const Utils::Vector3i &ind) {
   if (lattice_switch == ActiveLB::GPU) {
 #ifdef CUDA
     unsigned int host_flag;
-    auto const single_nodeindex = ind[0] + ind[1] * lbpar_gpu.dim_x +
-                                  ind[2] * lbpar_gpu.dim_x * lbpar_gpu.dim_y;
+    auto const single_nodeindex = calculate_node_index(lbpar_gpu, ind);
     lb_get_boundary_flag_GPU(single_nodeindex, &host_flag);
     return static_cast<int>(host_flag);
 #else
@@ -1162,8 +1158,7 @@ const Utils::Vector19d lb_lbnode_get_pop(const Utils::Vector3i &ind) {
 void lb_lbnode_set_density(const Utils::Vector3i &ind, double p_density) {
   if (lattice_switch == ActiveLB::GPU) {
 #ifdef CUDA
-    auto const single_nodeindex = ind[0] + ind[1] * lbpar_gpu.dim_x +
-                                  ind[2] * lbpar_gpu.dim_x * lbpar_gpu.dim_y;
+    auto const single_nodeindex = calculate_node_index(lbpar_gpu, ind);
     auto const host_density = static_cast<float>(p_density);
     lb_set_node_rho_GPU(single_nodeindex, host_density);
 #endif //  CUDA
@@ -1188,8 +1183,7 @@ void lb_lbnode_set_velocity(const Utils::Vector3i &ind,
     host_velocity[0] = static_cast<float>(u[0]);
     host_velocity[1] = static_cast<float>(u[1]);
     host_velocity[2] = static_cast<float>(u[2]);
-    auto const single_nodeindex = ind[0] + ind[1] * lbpar_gpu.dim_x +
-                                  ind[2] * lbpar_gpu.dim_x * lbpar_gpu.dim_y;
+    auto const single_nodeindex = calculate_node_index(lbpar_gpu, ind);
     lb_set_node_velocity_GPU(single_nodeindex, host_velocity);
 #endif //  CUDA
   } else if (lattice_switch == ActiveLB::CPU) {
@@ -1279,3 +1273,26 @@ lb_lbfluid_get_interpolated_velocity(const Utils::Vector3d &pos) {
   }
   throw NoLBActive();
 }
+
+double lb_lbfluid_get_interpolated_density(const Utils::Vector3d &pos) {
+  // Both values are fetched up front, before the active LB is inspected.
+  auto const folded_pos = folded_position(pos, box_geo);
+  auto const order = lb_lbinterpolation_get_interpolation_order();
+  if (lattice_switch == ActiveLB::GPU) {
+    throw std::runtime_error(
+        "Density interpolation is not implemented for the GPU LB.");
+  }
+  if (lattice_switch != ActiveLB::CPU) {
+    throw NoLBActive();
+  }
+  // CPU LB: only the linear scheme is available.
+  if (order == InterpolationOrder::quadratic) {
+    throw std::runtime_error("The non-linear interpolation scheme is not "
+                             "implemented for the CPU LB.");
+  }
+  if (order == InterpolationOrder::linear) {
+    return mpi_call(::Communication::Result::one_rank,
+                    mpi_lb_get_interpolated_density, folded_pos);
+  }
+  throw NoLBActive();
+}
diff --git a/src/core/grid_based_algorithms/lb_interface.hpp b/src/core/grid_based_algorithms/lb_interface.hpp
index c72ef9b7f8b..9b831197a68 100644
--- a/src/core/grid_based_algorithms/lb_interface.hpp
+++ b/src/core/grid_based_algorithms/lb_interface.hpp
@@ -265,4 +265,11 @@ Utils::Vector3d lb_lbfluid_calc_fluid_momentum();
 const Utils::Vector3d
 lb_lbfluid_get_interpolated_velocity(const Utils::Vector3d &pos);
 
+/**
+ * @brief Calculates the interpolated fluid density on the master process.
+ * @param pos Position at which the density is to be calculated.
+ * @retval interpolated fluid density.
+ */
+double lb_lbfluid_get_interpolated_density(const Utils::Vector3d &pos);
+
 #endif
diff --git a/src/core/grid_based_algorithms/lb_interpolation.cpp b/src/core/grid_based_algorithms/lb_interpolation.cpp
index ea0cd46e1dd..d0db33f33ae 100644
--- a/src/core/grid_based_algorithms/lb_interpolation.cpp
+++ b/src/core/grid_based_algorithms/lb_interpolation.cpp
@@ -82,6 +82,19 @@ Utils::Vector3d node_u(Lattice::index_t index) {
   return Utils::Vector3d{modes[1], modes[2], modes[3]} / local_density;
 }
 
+/** Density of a single lattice node.
+ *  Boundary nodes (with LB_BOUNDARIES) report the plain @c lbpar.density;
+ *  all other nodes add the zeroth mode to it.
+ *  @param[in] index  Local lattice site
+ */
+double node_dens(Lattice::index_t index) {
+#ifdef LB_BOUNDARIES
+  if (lbfields[index].boundary)
+    return lbpar.density;
+#endif // LB_BOUNDARIES
+  return lbpar.density + lb_calc_modes(index, lbfluid)[0];
+}
+
 } // namespace
 
 const Utils::Vector3d
@@ -98,6 +108,17 @@ lb_lbinterpolation_get_interpolated_velocity(const Utils::Vector3d &pos) {
   return interpolated_u;
 }
 
+double lb_lbinterpolation_get_interpolated_density(const Utils::Vector3d &pos) {
+  /* Linear interpolation (eq. (11) @cite ahlrichs99a): accumulate the
+     density of every node passed to the callback, scaled by its weight. */
+  double density = 0.;
+  lattice_interpolation(lblattice, pos,
+                        [&density](Lattice::index_t node, double weight) {
+                          density += weight * node_dens(node);
+                        });
+  return density;
+}
+
 void lb_lbinterpolation_add_force_density(
     const Utils::Vector3d &pos, const Utils::Vector3d &force_density) {
   switch (interpolation_order) {
diff --git a/src/core/grid_based_algorithms/lb_interpolation.hpp b/src/core/grid_based_algorithms/lb_interpolation.hpp
index 68d9c6e1be6..28544afa9ef 100644
--- a/src/core/grid_based_algorithms/lb_interpolation.hpp
+++ b/src/core/grid_based_algorithms/lb_interpolation.hpp
@@ -41,6 +41,13 @@ InterpolationOrder lb_lbinterpolation_get_interpolation_order();
 const Utils::Vector3d
 lb_lbinterpolation_get_interpolated_velocity(const Utils::Vector3d &p);
 
+/**
+ * @brief Calculates the fluid density at a given position of the
+ * lattice.
+ * @note It can lead to undefined behaviour if the
+ * position is not within the local lattice. */
+double lb_lbinterpolation_get_interpolated_density(const Utils::Vector3d &p);
+
 /**
  * @brief Add a force density to the fluid at the given position.
  */
diff --git a/src/core/grid_based_algorithms/lbgpu.cpp b/src/core/grid_based_algorithms/lbgpu.cpp
index 7fed91b23df..aa30a96c9ed 100644
--- a/src/core/grid_based_algorithms/lbgpu.cpp
+++ b/src/core/grid_based_algorithms/lbgpu.cpp
@@ -207,7 +207,7 @@ void lb_init_gpu() {
   /* set parameters for transfer to gpu */
   lb_reinit_parameters_gpu();
 
-  lb_init_GPU(&lbpar_gpu);
+  lb_init_GPU(lbpar_gpu);
 
   gpu_init_particle_comm();
   cuda_bcast_global_part_params();
diff --git a/src/core/grid_based_algorithms/lbgpu.cuh b/src/core/grid_based_algorithms/lbgpu.cuh
index daa989e27ba..7bb8f94bcca 100644
--- a/src/core/grid_based_algorithms/lbgpu.cuh
+++ b/src/core/grid_based_algorithms/lbgpu.cuh
@@ -32,11 +32,10 @@
 
 #include <utils/Array.hpp>
 
-#ifdef CUDA
 /** Velocity densities for the lattice Boltzmann system. */
 struct LB_nodes_gpu {
   /** velocity density of the node */
-  float *vd = nullptr;
+  float *populations = nullptr;
   unsigned int *boundary = nullptr;
   Utils::Array<float, 3> *boundary_velocity = nullptr;
 };
@@ -73,7 +72,6 @@ inline __device__ float4 random_wrapper_philox(unsigned int index,
                  (CURAND_2POW32_INV / 2.0f);
   return rnd_floats;
 }
-#endif //  CUDA
 
 #endif // CUDA
 #endif
diff --git a/src/core/grid_based_algorithms/lbgpu.hpp b/src/core/grid_based_algorithms/lbgpu.hpp
index 857fd23dc4b..90d55f72af4 100644
--- a/src/core/grid_based_algorithms/lbgpu.hpp
+++ b/src/core/grid_based_algorithms/lbgpu.hpp
@@ -31,6 +31,7 @@
 #include "OptionalCounter.hpp"
 
 #include <utils/Vector.hpp>
+#include <utils/index.hpp>
 
 #include <cstddef>
 #include <cstdint>
@@ -48,7 +49,6 @@ typedef double lbForceFloat;
 typedef float lbForceFloat;
 #endif
 
-/**-------------------------------------------------------------------------*/
 /** Parameters for the lattice Boltzmann system for GPU. */
 struct LB_parameters_gpu {
   /** number density (LB units) */
@@ -110,40 +110,30 @@ struct LB_rho_v_gpu {
   float rho;
   /** velocity of the node */
 
-  float v[3];
+  Utils::Array<float, 3> v;
 };
 /* this structure is almost duplicated for memory efficiency. When the stress
    tensor element are needed at every timestep, this features should be
    explicitly switched on */
-typedef struct {
+struct LB_rho_v_pi_gpu {
   /** density of the node */
   float rho;
   /** velocity of the node */
-  float v[3];
+  Utils::Array<float, 3> v;
   /** pressure tensor */
-  float pi[6];
-} LB_rho_v_pi_gpu;
-
-typedef struct {
+  Utils::Array<float, 6> pi;
+};
 
+struct LB_node_force_density_gpu {
   lbForceFloat *force_density;
 #if defined(VIRTUAL_SITES_INERTIALESS_TRACERS) || defined(EK_DEBUG)
 
   // We need the node forces for the velocity interpolation at the virtual
-  // particles' position However, LBM wants to reset them immediately after the
-  // LBM update This variable keeps a backup
+  // particles' position. However, LBM wants to reset them immediately
+  // after the LBM update. This variable keeps a backup
   lbForceFloat *force_density_buf;
 #endif
-
-} LB_node_force_density_gpu;
-
-typedef struct {
-
-  float force_density[3];
-
-  unsigned int index;
-
-} LB_extern_nodeforcedensity_gpu;
+};
 
 /************************************************************/
 /** \name Exported Variables */
@@ -157,6 +147,8 @@ extern std::vector<LB_rho_v_pi_gpu> host_values;
 extern LB_node_force_density_gpu node_f;
 extern bool ek_initialized;
 #endif
+extern OptionalCounter rng_counter_fluid_gpu;
+extern OptionalCounter rng_counter_coupling_gpu;
 
 /**@}*/
 
@@ -167,9 +159,9 @@ extern bool ek_initialized;
 
 void lb_GPU_sanity_checks();
 
-void lb_get_device_values_pointer(LB_rho_v_gpu **pointeradress);
-void lb_get_boundary_force_pointer(float **pointeradress);
-void lb_get_para_pointer(LB_parameters_gpu **pointeradress);
+void lb_get_device_values_pointer(LB_rho_v_gpu **pointer_address);
+void lb_get_boundary_force_pointer(float **pointer_address);
+void lb_get_para_pointer(LB_parameters_gpu **pointer_address);
 void lattice_boltzmann_update_gpu();
 
 /** Perform a full initialization of the lattice Boltzmann system.
@@ -188,14 +180,15 @@ void lb_reinit_fluid_gpu();
 /** Reset the forces on the fluid nodes */
 void reset_LB_force_densities_GPU(bool buffer = true);
 
-void lb_init_GPU(LB_parameters_gpu *lbpar_gpu);
+void lb_init_GPU(const LB_parameters_gpu &lbpar_gpu);
 void lb_integrate_GPU();
 
 void lb_get_values_GPU(LB_rho_v_pi_gpu *host_values);
-void lb_print_node_GPU(int single_nodeindex,
+void lb_print_node_GPU(unsigned single_nodeindex,
                        LB_rho_v_pi_gpu *host_print_values);
 #ifdef LB_BOUNDARIES_GPU
-void lb_init_boundaries_GPU(int n_lb_boundaries, int number_of_boundnodes,
+void lb_init_boundaries_GPU(std::size_t n_lb_boundaries,
+                            unsigned number_of_boundnodes,
                             int *host_boundary_node_list,
                             int *host_boundary_index_list,
                             float *lb_bounday_velocity);
@@ -208,16 +201,17 @@ void lb_calc_particle_lattice_ia_gpu(bool couple_virtual, double friction);
 
 void lb_calc_fluid_mass_GPU(double *mass);
 void lb_calc_fluid_momentum_GPU(double *host_mom);
-void lb_get_boundary_flag_GPU(int single_nodeindex, unsigned int *host_flag);
+void lb_get_boundary_flag_GPU(unsigned int single_nodeindex,
+                              unsigned int *host_flag);
 void lb_get_boundary_flags_GPU(unsigned int *host_bound_array);
 
-void lb_set_node_velocity_GPU(int single_nodeindex, float *host_velocity);
-void lb_set_node_rho_GPU(int single_nodeindex, float host_rho);
+void lb_set_node_velocity_GPU(unsigned single_nodeindex, float *host_velocity);
+void lb_set_node_rho_GPU(unsigned single_nodeindex, float host_rho);
 
 void reinit_parameters_GPU(LB_parameters_gpu *lbpar_gpu);
 void lb_reinit_extern_nodeforce_GPU(LB_parameters_gpu *lbpar_gpu);
 void lb_reinit_GPU(LB_parameters_gpu *lbpar_gpu);
-void lb_gpu_get_boundary_forces(double *forces);
+void lb_gpu_get_boundary_forces(std::vector<double> &forces);
 void lb_save_checkpoint_GPU(float *host_checkpoint_vd);
 void lb_load_checkpoint_GPU(float const *host_checkpoint_vd);
 
@@ -236,9 +230,21 @@ uint64_t lb_fluid_get_rng_state_gpu();
 void lb_fluid_set_rng_state_gpu(uint64_t counter);
 uint64_t lb_coupling_get_rng_state_gpu();
 void lb_coupling_set_rng_state_gpu(uint64_t counter);
+
+/** Flatten 3D node coordinates into a linear node index.
+ *  @param[in] lbpar  GPU LB parameters providing the grid dimensions
+ *  @param[in] coord  Node coordinates
+ *  @retval Result of @c Utils::get_linear_index for the grid, as unsigned.
+ */
+inline unsigned int calculate_node_index(LB_parameters_gpu const &lbpar,
+                                         Utils::Vector3i const &coord) {
+  auto const grid = Utils::Vector3i{static_cast<int>(lbpar.dim_x),
+                                    static_cast<int>(lbpar.dim_y),
+                                    static_cast<int>(lbpar.dim_z)};
+  return static_cast<unsigned>(Utils::get_linear_index(coord, grid));
+}
 /**@}*/
-extern OptionalCounter rng_counter_fluid_gpu;
-extern OptionalCounter rng_counter_coupling_gpu;
+
 #endif /*  CUDA */
 
-#endif /*  CUDA_H */
+#endif /*  LBGPU_HPP */
diff --git a/src/core/grid_based_algorithms/lbgpu_cuda.cu b/src/core/grid_based_algorithms/lbgpu_cuda.cu
index edc73900c90..f9b18c80851 100644
--- a/src/core/grid_based_algorithms/lbgpu_cuda.cu
+++ b/src/core/grid_based_algorithms/lbgpu_cuda.cu
@@ -33,12 +33,12 @@
 #include "grid_based_algorithms/lbgpu.hpp"
 
 #include "cuda_interface.hpp"
-#include "cuda_utils.hpp"
+#include "cuda_utils.cuh"
 #include "errorhandling.hpp"
+#include "lbgpu.hpp"
 
 #include <utils/Array.hpp>
 #include <utils/Counter.hpp>
-#include <utils/memory.hpp>
 
 #include <thrust/device_ptr.h>
 #include <thrust/device_vector.h>
@@ -49,6 +49,7 @@
 #include <cuda.h>
 #include <curand_kernel.h>
 
+#include <algorithm>
 #include <cassert>
 #include <cstddef>
 #include <cstdint>
@@ -58,13 +59,13 @@
 
 extern int this_node;
 
-/** device_rho_v: struct for hydrodynamic fields: this is for internal use
- *  (i.e. stores values in LB units) and should not used for
+/** struct for hydrodynamic fields: this is for internal use
+ *  (i.e. stores values in LB units) and should not be used for
  *  printing values
  */
 static LB_rho_v_gpu *device_rho_v = nullptr;
 
-/** print_rho_v_pi: struct for hydrodynamic fields: this is the interface
+/** struct for hydrodynamic fields: this is the interface
  *  and stores values in MD units. It should not be used
  *  as an input for any LB calculations. TODO: in the future,
  *  one might want to have several structures for printing
@@ -97,11 +98,8 @@ LB_node_force_density_gpu node_f = {
 static float *lb_boundary_force = nullptr;
 #endif
 
-/** @name pointers for additional cuda check flag */
-/**@{*/
-static int *gpu_check = nullptr;
-static int *h_gpu_check = nullptr;
-/**@}*/
+/** @brief Whether LB GPU was initialized */
+static bool *device_gpu_lb_initialized = nullptr;
 
 /** @brief Direction of data transfer between @ref nodes_a and @ref nodes_b
  *  during integration in @ref lb_integrate_GPU
@@ -117,11 +115,8 @@ static size_t size_of_rho_v_pi;
 /** Parameters residing in constant memory */
 __device__ __constant__ LB_parameters_gpu para[1];
 
-/*********************************************************/
-/** \name device functions called by kernel functions */
-/*********************************************************/
-
 static constexpr float sqrt12 = 3.4641016151377544f;
+static constexpr unsigned int threads_per_block = 64;
 OptionalCounter rng_counter_coupling_gpu;
 OptionalCounter rng_counter_fluid_gpu;
 
@@ -141,139 +136,152 @@ template <typename T> __device__ uint3 index_to_xyz(T index) {
  *  @param[in] x,y,z     The xyz array
  */
 template <typename T> __device__ T xyz_to_index(T x, T y, T z) {
-  return x + para->dim_x * (y + para->dim_y * z);
+  return x +
+         static_cast<T>(para->dim_x) * (y + static_cast<T>(para->dim_y) * z);
 }
 
 __device__ __inline__ float calc_mode_x_from_n(LB_nodes_gpu n_a,
                                                unsigned int index, int x) {
-  auto const flat_index = [&index](int population) {
+  auto const flat_index = [&index](unsigned population) {
     return population * para->number_of_nodes + index;
   };
   switch (x) {
   case 0:
-    return n_a.vd[flat_index(0)] + n_a.vd[flat_index(1)] +
-           n_a.vd[flat_index(2)] + n_a.vd[flat_index(3)] +
-           n_a.vd[flat_index(4)] + n_a.vd[flat_index(5)] +
-           n_a.vd[flat_index(6)] + n_a.vd[flat_index(7)] +
-           n_a.vd[flat_index(8)] + n_a.vd[flat_index(9)] +
-           n_a.vd[flat_index(10)] + n_a.vd[flat_index(11)] +
-           n_a.vd[flat_index(12)] + n_a.vd[flat_index(13)] +
-           n_a.vd[flat_index(14)] + n_a.vd[flat_index(15)] +
-           n_a.vd[flat_index(16)] + n_a.vd[flat_index(17)] +
-           n_a.vd[flat_index(18)];
+    return n_a.populations[flat_index(0)] + n_a.populations[flat_index(1)] +
+           n_a.populations[flat_index(2)] + n_a.populations[flat_index(3)] +
+           n_a.populations[flat_index(4)] + n_a.populations[flat_index(5)] +
+           n_a.populations[flat_index(6)] + n_a.populations[flat_index(7)] +
+           n_a.populations[flat_index(8)] + n_a.populations[flat_index(9)] +
+           n_a.populations[flat_index(10)] + n_a.populations[flat_index(11)] +
+           n_a.populations[flat_index(12)] + n_a.populations[flat_index(13)] +
+           n_a.populations[flat_index(14)] + n_a.populations[flat_index(15)] +
+           n_a.populations[flat_index(16)] + n_a.populations[flat_index(17)] +
+           n_a.populations[flat_index(18)];
   case 1:
-    return (n_a.vd[flat_index(1)] - n_a.vd[flat_index(2)]) +
-           (n_a.vd[flat_index(7)] - n_a.vd[flat_index(8)]) +
-           (n_a.vd[flat_index(9)] - n_a.vd[flat_index(10)]) +
-           (n_a.vd[flat_index(11)] - n_a.vd[flat_index(12)]) +
-           (n_a.vd[flat_index(13)] - n_a.vd[flat_index(14)]);
+    return (n_a.populations[flat_index(1)] - n_a.populations[flat_index(2)]) +
+           (n_a.populations[flat_index(7)] - n_a.populations[flat_index(8)]) +
+           (n_a.populations[flat_index(9)] - n_a.populations[flat_index(10)]) +
+           (n_a.populations[flat_index(11)] - n_a.populations[flat_index(12)]) +
+           (n_a.populations[flat_index(13)] - n_a.populations[flat_index(14)]);
   case 2:
-    return (n_a.vd[flat_index(3)] - n_a.vd[flat_index(4)]) +
-           (n_a.vd[flat_index(7)] - n_a.vd[flat_index(8)]) -
-           (n_a.vd[flat_index(9)] - n_a.vd[flat_index(10)]) +
-           (n_a.vd[flat_index(15)] - n_a.vd[flat_index(16)]) +
-           (n_a.vd[flat_index(17)] - n_a.vd[flat_index(18)]);
+    return (n_a.populations[flat_index(3)] - n_a.populations[flat_index(4)]) +
+           (n_a.populations[flat_index(7)] - n_a.populations[flat_index(8)]) -
+           (n_a.populations[flat_index(9)] - n_a.populations[flat_index(10)]) +
+           (n_a.populations[flat_index(15)] - n_a.populations[flat_index(16)]) +
+           (n_a.populations[flat_index(17)] - n_a.populations[flat_index(18)]);
   case 3:
-    return (n_a.vd[flat_index(5)] - n_a.vd[flat_index(6)]) +
-           (n_a.vd[flat_index(11)] - n_a.vd[flat_index(12)]) -
-           (n_a.vd[flat_index(13)] - n_a.vd[flat_index(14)]) +
-           (n_a.vd[flat_index(15)] - n_a.vd[flat_index(16)]) -
-           (n_a.vd[flat_index(17)] - n_a.vd[flat_index(18)]);
+    return (n_a.populations[flat_index(5)] - n_a.populations[flat_index(6)]) +
+           (n_a.populations[flat_index(11)] - n_a.populations[flat_index(12)]) -
+           (n_a.populations[flat_index(13)] - n_a.populations[flat_index(14)]) +
+           (n_a.populations[flat_index(15)] - n_a.populations[flat_index(16)]) -
+           (n_a.populations[flat_index(17)] - n_a.populations[flat_index(18)]);
   case 4:
-    return -n_a.vd[flat_index(0)] + n_a.vd[flat_index(7)] +
-           n_a.vd[flat_index(8)] + n_a.vd[flat_index(9)] +
-           n_a.vd[flat_index(10)] + n_a.vd[flat_index(11)] +
-           n_a.vd[flat_index(12)] + n_a.vd[flat_index(13)] +
-           n_a.vd[flat_index(14)] + n_a.vd[flat_index(15)] +
-           n_a.vd[flat_index(16)] + n_a.vd[flat_index(17)] +
-           n_a.vd[flat_index(18)];
+    return -n_a.populations[flat_index(0)] + n_a.populations[flat_index(7)] +
+           n_a.populations[flat_index(8)] + n_a.populations[flat_index(9)] +
+           n_a.populations[flat_index(10)] + n_a.populations[flat_index(11)] +
+           n_a.populations[flat_index(12)] + n_a.populations[flat_index(13)] +
+           n_a.populations[flat_index(14)] + n_a.populations[flat_index(15)] +
+           n_a.populations[flat_index(16)] + n_a.populations[flat_index(17)] +
+           n_a.populations[flat_index(18)];
   case 5:
-    return (n_a.vd[flat_index(1)] + n_a.vd[flat_index(2)]) -
-           (n_a.vd[flat_index(3)] + n_a.vd[flat_index(4)]) +
-           (n_a.vd[flat_index(11)] + n_a.vd[flat_index(12)]) +
-           (n_a.vd[flat_index(13)] + n_a.vd[flat_index(14)]) -
-           (n_a.vd[flat_index(15)] + n_a.vd[flat_index(16)]) -
-           (n_a.vd[flat_index(17)] + n_a.vd[flat_index(18)]);
+    return (n_a.populations[flat_index(1)] + n_a.populations[flat_index(2)]) -
+           (n_a.populations[flat_index(3)] + n_a.populations[flat_index(4)]) +
+           (n_a.populations[flat_index(11)] + n_a.populations[flat_index(12)]) +
+           (n_a.populations[flat_index(13)] + n_a.populations[flat_index(14)]) -
+           (n_a.populations[flat_index(15)] + n_a.populations[flat_index(16)]) -
+           (n_a.populations[flat_index(17)] + n_a.populations[flat_index(18)]);
   case 6:
-    return (n_a.vd[flat_index(1)] + n_a.vd[flat_index(2)]) +
-           (n_a.vd[flat_index(3)] + n_a.vd[flat_index(4)]) -
-           (n_a.vd[flat_index(11)] + n_a.vd[flat_index(12)]) -
-           (n_a.vd[flat_index(13)] + n_a.vd[flat_index(14)]) -
-           (n_a.vd[flat_index(15)] + n_a.vd[flat_index(16)]) -
-           (n_a.vd[flat_index(17)] + n_a.vd[flat_index(18)]) -
-           2.0f * ((n_a.vd[flat_index(5)] + n_a.vd[flat_index(6)]) -
-                   (n_a.vd[flat_index(7)] + n_a.vd[flat_index(8)]) -
-                   (n_a.vd[flat_index(9)] + n_a.vd[flat_index(10)]));
+    return (n_a.populations[flat_index(1)] + n_a.populations[flat_index(2)]) +
+           (n_a.populations[flat_index(3)] + n_a.populations[flat_index(4)]) -
+           (n_a.populations[flat_index(11)] + n_a.populations[flat_index(12)]) -
+           (n_a.populations[flat_index(13)] + n_a.populations[flat_index(14)]) -
+           (n_a.populations[flat_index(15)] + n_a.populations[flat_index(16)]) -
+           (n_a.populations[flat_index(17)] + n_a.populations[flat_index(18)]) -
+           2.0f * ((n_a.populations[flat_index(5)] +
+                    n_a.populations[flat_index(6)]) -
+                   (n_a.populations[flat_index(7)] +
+                    n_a.populations[flat_index(8)]) -
+                   (n_a.populations[flat_index(9)] +
+                    n_a.populations[flat_index(10)]));
   case 7:
-    return (n_a.vd[flat_index(7)] + n_a.vd[flat_index(8)]) -
-           (n_a.vd[flat_index(9)] + n_a.vd[flat_index(10)]);
+    return (n_a.populations[flat_index(7)] + n_a.populations[flat_index(8)]) -
+           (n_a.populations[flat_index(9)] + n_a.populations[flat_index(10)]);
   case 8:
-    return (n_a.vd[flat_index(11)] + n_a.vd[flat_index(12)]) -
-           (n_a.vd[flat_index(13)] + n_a.vd[flat_index(14)]);
+    return (n_a.populations[flat_index(11)] + n_a.populations[flat_index(12)]) -
+           (n_a.populations[flat_index(13)] + n_a.populations[flat_index(14)]);
   case 9:
-    return (n_a.vd[flat_index(15)] + n_a.vd[flat_index(16)]) -
-           (n_a.vd[flat_index(17)] + n_a.vd[flat_index(18)]);
+    return (n_a.populations[flat_index(15)] + n_a.populations[flat_index(16)]) -
+           (n_a.populations[flat_index(17)] + n_a.populations[flat_index(18)]);
   case 10:
-    return -2.0f * (n_a.vd[flat_index(1)] - n_a.vd[flat_index(2)]) +
-           (n_a.vd[flat_index(7)] - n_a.vd[flat_index(8)]) +
-           (n_a.vd[flat_index(9)] - n_a.vd[flat_index(10)]) +
-           (n_a.vd[flat_index(11)] - n_a.vd[flat_index(12)]) +
-           (n_a.vd[flat_index(13)] - n_a.vd[flat_index(14)]);
+    return -2.0f * (n_a.populations[flat_index(1)] -
+                    n_a.populations[flat_index(2)]) +
+           (n_a.populations[flat_index(7)] - n_a.populations[flat_index(8)]) +
+           (n_a.populations[flat_index(9)] - n_a.populations[flat_index(10)]) +
+           (n_a.populations[flat_index(11)] - n_a.populations[flat_index(12)]) +
+           (n_a.populations[flat_index(13)] - n_a.populations[flat_index(14)]);
   case 11:
-    return -2.0f * (n_a.vd[flat_index(3)] - n_a.vd[flat_index(4)]) +
-           (n_a.vd[flat_index(7)] - n_a.vd[flat_index(8)]) -
-           (n_a.vd[flat_index(9)] - n_a.vd[flat_index(10)]) +
-           (n_a.vd[flat_index(15)] - n_a.vd[flat_index(16)]) +
-           (n_a.vd[flat_index(17)] - n_a.vd[flat_index(18)]);
+    return -2.0f * (n_a.populations[flat_index(3)] -
+                    n_a.populations[flat_index(4)]) +
+           (n_a.populations[flat_index(7)] - n_a.populations[flat_index(8)]) -
+           (n_a.populations[flat_index(9)] - n_a.populations[flat_index(10)]) +
+           (n_a.populations[flat_index(15)] - n_a.populations[flat_index(16)]) +
+           (n_a.populations[flat_index(17)] - n_a.populations[flat_index(18)]);
   case 12:
-    return -2.0f * (n_a.vd[flat_index(5)] - n_a.vd[flat_index(6)]) +
-           (n_a.vd[flat_index(11)] - n_a.vd[flat_index(12)]) -
-           (n_a.vd[flat_index(13)] - n_a.vd[flat_index(14)]) +
-           (n_a.vd[flat_index(15)] - n_a.vd[flat_index(16)]) -
-           (n_a.vd[flat_index(17)] - n_a.vd[flat_index(18)]);
+    return -2.0f * (n_a.populations[flat_index(5)] -
+                    n_a.populations[flat_index(6)]) +
+           (n_a.populations[flat_index(11)] - n_a.populations[flat_index(12)]) -
+           (n_a.populations[flat_index(13)] - n_a.populations[flat_index(14)]) +
+           (n_a.populations[flat_index(15)] - n_a.populations[flat_index(16)]) -
+           (n_a.populations[flat_index(17)] - n_a.populations[flat_index(18)]);
   case 13:
-    return (n_a.vd[flat_index(7)] - n_a.vd[flat_index(8)]) +
-           (n_a.vd[flat_index(9)] - n_a.vd[flat_index(10)]) -
-           (n_a.vd[flat_index(11)] - n_a.vd[flat_index(12)]) -
-           (n_a.vd[flat_index(13)] - n_a.vd[flat_index(14)]);
+    return (n_a.populations[flat_index(7)] - n_a.populations[flat_index(8)]) +
+           (n_a.populations[flat_index(9)] - n_a.populations[flat_index(10)]) -
+           (n_a.populations[flat_index(11)] - n_a.populations[flat_index(12)]) -
+           (n_a.populations[flat_index(13)] - n_a.populations[flat_index(14)]);
   case 14:
-    return (n_a.vd[flat_index(7)] - n_a.vd[flat_index(8)]) -
-           (n_a.vd[flat_index(9)] - n_a.vd[flat_index(10)]) -
-           (n_a.vd[flat_index(15)] - n_a.vd[flat_index(16)]) -
-           (n_a.vd[flat_index(17)] - n_a.vd[flat_index(18)]);
+    return (n_a.populations[flat_index(7)] - n_a.populations[flat_index(8)]) -
+           (n_a.populations[flat_index(9)] - n_a.populations[flat_index(10)]) -
+           (n_a.populations[flat_index(15)] - n_a.populations[flat_index(16)]) -
+           (n_a.populations[flat_index(17)] - n_a.populations[flat_index(18)]);
   case 15:
-    return (n_a.vd[flat_index(11)] - n_a.vd[flat_index(12)]) -
-           (n_a.vd[flat_index(13)] - n_a.vd[flat_index(14)]) -
-           (n_a.vd[flat_index(15)] - n_a.vd[flat_index(16)]) +
-           (n_a.vd[flat_index(17)] - n_a.vd[flat_index(18)]);
+    return (n_a.populations[flat_index(11)] - n_a.populations[flat_index(12)]) -
+           (n_a.populations[flat_index(13)] - n_a.populations[flat_index(14)]) -
+           (n_a.populations[flat_index(15)] - n_a.populations[flat_index(16)]) +
+           (n_a.populations[flat_index(17)] - n_a.populations[flat_index(18)]);
   case 16:
-    return n_a.vd[flat_index(0)] + n_a.vd[flat_index(7)] +
-           n_a.vd[flat_index(8)] + n_a.vd[flat_index(9)] +
-           n_a.vd[flat_index(10)] + n_a.vd[flat_index(11)] +
-           n_a.vd[flat_index(12)] + n_a.vd[flat_index(13)] +
-           n_a.vd[flat_index(14)] + n_a.vd[flat_index(15)] +
-           n_a.vd[flat_index(16)] + n_a.vd[flat_index(17)] +
-           n_a.vd[flat_index(18)] -
-           2.0f * ((n_a.vd[flat_index(1)] + n_a.vd[flat_index(2)]) +
-                   (n_a.vd[flat_index(3)] + n_a.vd[flat_index(4)]) +
-                   (n_a.vd[flat_index(5)] + n_a.vd[flat_index(6)]));
+    return n_a.populations[flat_index(0)] + n_a.populations[flat_index(7)] +
+           n_a.populations[flat_index(8)] + n_a.populations[flat_index(9)] +
+           n_a.populations[flat_index(10)] + n_a.populations[flat_index(11)] +
+           n_a.populations[flat_index(12)] + n_a.populations[flat_index(13)] +
+           n_a.populations[flat_index(14)] + n_a.populations[flat_index(15)] +
+           n_a.populations[flat_index(16)] + n_a.populations[flat_index(17)] +
+           n_a.populations[flat_index(18)] -
+           2.0f * ((n_a.populations[flat_index(1)] +
+                    n_a.populations[flat_index(2)]) +
+                   (n_a.populations[flat_index(3)] +
+                    n_a.populations[flat_index(4)]) +
+                   (n_a.populations[flat_index(5)] +
+                    n_a.populations[flat_index(6)]));
   case 17:
-    return -(n_a.vd[flat_index(1)] + n_a.vd[flat_index(2)]) +
-           (n_a.vd[flat_index(3)] + n_a.vd[flat_index(4)]) +
-           (n_a.vd[flat_index(11)] + n_a.vd[flat_index(12)]) +
-           (n_a.vd[flat_index(13)] + n_a.vd[flat_index(14)]) -
-           (n_a.vd[flat_index(15)] + n_a.vd[flat_index(16)]) -
-           (n_a.vd[flat_index(17)] + n_a.vd[flat_index(18)]);
+    return -(n_a.populations[flat_index(1)] + n_a.populations[flat_index(2)]) +
+           (n_a.populations[flat_index(3)] + n_a.populations[flat_index(4)]) +
+           (n_a.populations[flat_index(11)] + n_a.populations[flat_index(12)]) +
+           (n_a.populations[flat_index(13)] + n_a.populations[flat_index(14)]) -
+           (n_a.populations[flat_index(15)] + n_a.populations[flat_index(16)]) -
+           (n_a.populations[flat_index(17)] + n_a.populations[flat_index(18)]);
   case 18:
-    return -(n_a.vd[flat_index(1)] + n_a.vd[flat_index(2)]) -
-           (n_a.vd[flat_index(3)] + n_a.vd[flat_index(4)]) -
-           (n_a.vd[flat_index(11)] + n_a.vd[flat_index(12)]) -
-           (n_a.vd[flat_index(13)] + n_a.vd[flat_index(14)]) -
-           (n_a.vd[flat_index(15)] + n_a.vd[flat_index(16)]) -
-           (n_a.vd[flat_index(17)] + n_a.vd[flat_index(18)]) +
-           2.0f * ((n_a.vd[flat_index(5)] + n_a.vd[flat_index(6)]) +
-                   (n_a.vd[flat_index(7)] + n_a.vd[flat_index(8)]) +
-                   (n_a.vd[flat_index(9)] + n_a.vd[flat_index(10)]));
+    return -(n_a.populations[flat_index(1)] + n_a.populations[flat_index(2)]) -
+           (n_a.populations[flat_index(3)] + n_a.populations[flat_index(4)]) -
+           (n_a.populations[flat_index(11)] + n_a.populations[flat_index(12)]) -
+           (n_a.populations[flat_index(13)] + n_a.populations[flat_index(14)]) -
+           (n_a.populations[flat_index(15)] + n_a.populations[flat_index(16)]) -
+           (n_a.populations[flat_index(17)] + n_a.populations[flat_index(18)]) +
+           2.0f * ((n_a.populations[flat_index(5)] +
+                    n_a.populations[flat_index(6)]) +
+                   (n_a.populations[flat_index(7)] +
+                    n_a.populations[flat_index(8)]) +
+                   (n_a.populations[flat_index(9)] +
+                    n_a.populations[flat_index(10)]));
   }
   return 0.0;
 }
@@ -425,26 +433,22 @@ reset_LB_force_densities_kernel(LB_node_force_density_gpu node_f,
 }
 
 void reset_LB_force_densities_GPU(bool buffer) {
-  int threads_per_block = 64;
-  int blocks_per_grid_y = 4;
-  auto blocks_per_grid_x = static_cast<int>(
-      (lbpar_gpu.number_of_nodes + threads_per_block * blocks_per_grid_y - 1) /
-      (threads_per_block * blocks_per_grid_y));
-  dim3 dim_grid = make_uint3(blocks_per_grid_x, blocks_per_grid_y, 1);
+  dim3 dim_grid =
+      calculate_dim_grid(lbpar_gpu.number_of_nodes, 4, threads_per_block);
 
   KERNELCALL(reset_LB_force_densities_kernel, dim_grid, threads_per_block,
              node_f, buffer);
 }
 
 /**
- *  @param[in]  mode    Local register values mode
+ *  @param[in]  modes    Local register values modes
  *  @param[in]  index   Node index / thread index
  *  @param[in]  node_f  Local node force
  *  @param[out] d_v     Local device values
  */
-__device__ void update_rho_v(Utils::Array<float, 19> const &mode,
+__device__ void update_rho_v(Utils::Array<float, 19> const &modes,
                              unsigned int index,
-                             LB_node_force_density_gpu node_f,
+                             LB_node_force_density_gpu const &node_f,
                              LB_rho_v_gpu *d_v) {
   float Rho_tot = 0.0f;
   float u_tot[3] = {0.0f, 0.0f, 0.0f};
@@ -453,11 +457,11 @@ __device__ void update_rho_v(Utils::Array<float, 19> const &mode,
    * remember that the populations are stored as differences to their
    * equilibrium value */
 
-  d_v[index].rho = mode[0] + para->rho;
-  Rho_tot += mode[0] + para->rho;
-  u_tot[0] += mode[1];
-  u_tot[1] += mode[2];
-  u_tot[2] += mode[3];
+  d_v[index].rho = modes[0] + para->rho;
+  Rho_tot += modes[0] + para->rho;
+  u_tot[0] += modes[1];
+  u_tot[1] += modes[2];
+  u_tot[2] += modes[3];
 
   /** If forces are present, the momentum density is redefined to
    *  include one half-step of the force action. See the
@@ -503,7 +507,7 @@ __device__ void relax_modes(Utils::Array<float, 19> &mode, unsigned int index,
   j[1] = Rho * u_tot[1];
   j[2] = Rho * u_tot[2];
 
-  /** equilibrium part of the stress modes (eq13 schiller) */
+  /* equilibrium part of the stress modes (eq13 schiller) */
 
   modes_from_pi_eq[0] = ((j[0] * j[0]) + (j[1] * j[1]) + (j[2] * j[2])) / Rho;
   modes_from_pi_eq[1] = ((j[0] * j[0]) - (j[1] * j[1])) / Rho;
@@ -514,7 +518,7 @@ __device__ void relax_modes(Utils::Array<float, 19> &mode, unsigned int index,
   modes_from_pi_eq[4] = j[0] * j[2] / Rho;
   modes_from_pi_eq[5] = j[1] * j[2] / Rho;
 
-  /** relax the stress modes (eq14 schiller) */
+  /* relax the stress modes (eq14 schiller) */
 
   mode[4] =
       modes_from_pi_eq[0] + para->gamma_bulk * (mode[4] - modes_from_pi_eq[0]);
@@ -546,6 +550,7 @@ __device__ void relax_modes(Utils::Array<float, 19> &mode, unsigned int index,
 /** Thermalization of the modes with Gaussian random numbers
  *  @param[in] index     Node index / thread index
  *  @param[in,out] mode  Local register values mode
+ *  @param[in]  philox_counter   Philox counter
  */
 __device__ void thermalize_modes(Utils::Array<float, 19> &mode,
                                  unsigned int index, uint64_t philox_counter) {
@@ -655,130 +660,141 @@ __device__ void calc_n_from_modes_push(LB_nodes_gpu n_b,
   unsigned int y = xyz.y;
   unsigned int z = xyz.z;
 
-  n_b.vd[0 * para->number_of_nodes + x + para->dim_x * y +
-         para->dim_x * para->dim_y * z] =
+  n_b.populations[0 * para->number_of_nodes + x + para->dim_x * y +
+                  para->dim_x * para->dim_y * z] =
       1.0f / 3.0f * (mode[0] - mode[4] + mode[16]);
 
-  n_b.vd[1 * para->number_of_nodes + (x + 1) % para->dim_x + para->dim_x * y +
-         para->dim_x * para->dim_y * z] =
+  n_b.populations[1 * para->number_of_nodes + (x + 1) % para->dim_x +
+                  para->dim_x * y + para->dim_x * para->dim_y * z] =
       1.0f / 18.0f *
       (mode[0] + mode[1] + mode[5] + mode[6] - mode[17] - mode[18] -
        2.0f * (mode[10] + mode[16]));
 
-  n_b.vd[2 * para->number_of_nodes + (para->dim_x + x - 1) % para->dim_x +
-         para->dim_x * y + para->dim_x * para->dim_y * z] =
+  n_b.populations[2 * para->number_of_nodes +
+                  (para->dim_x + x - 1) % para->dim_x + para->dim_x * y +
+                  para->dim_x * para->dim_y * z] =
       1.0f / 18.0f *
       (mode[0] - mode[1] + mode[5] + mode[6] - mode[17] - mode[18] +
        2.0f * (mode[10] - mode[16]));
 
-  n_b.vd[3 * para->number_of_nodes + x + para->dim_x * ((y + 1) % para->dim_y) +
-         para->dim_x * para->dim_y * z] =
+  n_b.populations[3 * para->number_of_nodes + x +
+                  para->dim_x * ((y + 1) % para->dim_y) +
+                  para->dim_x * para->dim_y * z] =
       1.0f / 18.0f *
       (mode[0] + mode[2] - mode[5] + mode[6] + mode[17] - mode[18] -
        2.0f * (mode[11] + mode[16]));
 
-  n_b.vd[4 * para->number_of_nodes + x +
-         para->dim_x * ((para->dim_y + y - 1) % para->dim_y) +
-         para->dim_x * para->dim_y * z] =
+  n_b.populations[4 * para->number_of_nodes + x +
+                  para->dim_x * ((para->dim_y + y - 1) % para->dim_y) +
+                  para->dim_x * para->dim_y * z] =
       1.0f / 18.0f *
       (mode[0] - mode[2] - mode[5] + mode[6] + mode[17] - mode[18] +
        2.0f * (mode[11] - mode[16]));
 
-  n_b.vd[5 * para->number_of_nodes + x + para->dim_x * y +
-         para->dim_x * para->dim_y * ((z + 1) % para->dim_z)] =
+  n_b.populations[5 * para->number_of_nodes + x + para->dim_x * y +
+                  para->dim_x * para->dim_y * ((z + 1) % para->dim_z)] =
       1.0f / 18.0f *
       (mode[0] + mode[3] - 2.0f * (mode[6] + mode[12] + mode[16] - mode[18]));
 
-  n_b.vd[6 * para->number_of_nodes + x + para->dim_x * y +
-         para->dim_x * para->dim_y * ((para->dim_z + z - 1) % para->dim_z)] =
+  n_b.populations[6 * para->number_of_nodes + x + para->dim_x * y +
+                  para->dim_x * para->dim_y *
+                      ((para->dim_z + z - 1) % para->dim_z)] =
       1.0f / 18.0f *
       (mode[0] - mode[3] - 2.0f * (mode[6] - mode[12] + mode[16] - mode[18]));
 
-  n_b.vd[7 * para->number_of_nodes + (x + 1) % para->dim_x +
-         para->dim_x * ((y + 1) % para->dim_y) +
-         para->dim_x * para->dim_y * z] =
+  n_b.populations[7 * para->number_of_nodes + (x + 1) % para->dim_x +
+                  para->dim_x * ((y + 1) % para->dim_y) +
+                  para->dim_x * para->dim_y * z] =
       1.0f / 36.0f *
       (mode[0] + mode[1] + mode[2] + mode[4] + 2.0f * mode[6] + mode[7] +
        mode[10] + mode[11] + mode[13] + mode[14] + mode[16] + 2.0f * mode[18]);
 
-  n_b.vd[8 * para->number_of_nodes + (para->dim_x + x - 1) % para->dim_x +
-         para->dim_x * ((para->dim_y + y - 1) % para->dim_y) +
-         para->dim_x * para->dim_y * z] =
+  n_b.populations[8 * para->number_of_nodes +
+                  (para->dim_x + x - 1) % para->dim_x +
+                  para->dim_x * ((para->dim_y + y - 1) % para->dim_y) +
+                  para->dim_x * para->dim_y * z] =
       1.0f / 36.0f *
       (mode[0] - mode[1] - mode[2] + mode[4] + 2.0f * mode[6] + mode[7] -
        mode[10] - mode[11] - mode[13] - mode[14] + mode[16] + 2.0f * mode[18]);
 
-  n_b.vd[9 * para->number_of_nodes + (x + 1) % para->dim_x +
-         para->dim_x * ((para->dim_y + y - 1) % para->dim_y) +
-         para->dim_x * para->dim_y * z] =
+  n_b.populations[9 * para->number_of_nodes + (x + 1) % para->dim_x +
+                  para->dim_x * ((para->dim_y + y - 1) % para->dim_y) +
+                  para->dim_x * para->dim_y * z] =
       1.0f / 36.0f *
       (mode[0] + mode[1] - mode[2] + mode[4] + 2.0f * mode[6] - mode[7] +
        mode[10] - mode[11] + mode[13] - mode[14] + mode[16] + 2.0f * mode[18]);
 
-  n_b.vd[10 * para->number_of_nodes + (para->dim_x + x - 1) % para->dim_x +
-         para->dim_x * ((y + 1) % para->dim_y) +
-         para->dim_x * para->dim_y * z] =
+  n_b.populations[10 * para->number_of_nodes +
+                  (para->dim_x + x - 1) % para->dim_x +
+                  para->dim_x * ((y + 1) % para->dim_y) +
+                  para->dim_x * para->dim_y * z] =
       1.0f / 36.0f *
       (mode[0] - mode[1] + mode[2] + mode[4] + 2.0f * mode[6] - mode[7] -
        mode[10] + mode[11] - mode[13] + mode[14] + mode[16] + 2.0f * mode[18]);
 
-  n_b.vd[11 * para->number_of_nodes + (x + 1) % para->dim_x + para->dim_x * y +
-         para->dim_x * para->dim_y * ((z + 1) % para->dim_z)] =
+  n_b.populations[11 * para->number_of_nodes + (x + 1) % para->dim_x +
+                  para->dim_x * y +
+                  para->dim_x * para->dim_y * ((z + 1) % para->dim_z)] =
       1.0f / 36.0f *
       (mode[0] + mode[1] + mode[3] + mode[4] + mode[5] - mode[6] + mode[8] +
        mode[10] + mode[12] - mode[13] + mode[15] + mode[16] + mode[17] -
        mode[18]);
 
-  n_b.vd[12 * para->number_of_nodes + (para->dim_x + x - 1) % para->dim_x +
-         para->dim_x * y +
-         para->dim_x * para->dim_y * ((para->dim_z + z - 1) % para->dim_z)] =
+  n_b.populations[12 * para->number_of_nodes +
+                  (para->dim_x + x - 1) % para->dim_x + para->dim_x * y +
+                  para->dim_x * para->dim_y *
+                      ((para->dim_z + z - 1) % para->dim_z)] =
       1.0f / 36.0f *
       (mode[0] - mode[1] - mode[3] + mode[4] + mode[5] - mode[6] + mode[8] -
        mode[10] - mode[12] + mode[13] - mode[15] + mode[16] + mode[17] -
        mode[18]);
 
-  n_b.vd[13 * para->number_of_nodes + (x + 1) % para->dim_x + para->dim_x * y +
-         para->dim_x * para->dim_y * ((para->dim_z + z - 1) % para->dim_z)] =
+  n_b.populations[13 * para->number_of_nodes + (x + 1) % para->dim_x +
+                  para->dim_x * y +
+                  para->dim_x * para->dim_y *
+                      ((para->dim_z + z - 1) % para->dim_z)] =
       1.0f / 36.0f *
       (mode[0] + mode[1] - mode[3] + mode[4] + mode[5] - mode[6] - mode[8] +
        mode[10] - mode[12] - mode[13] - mode[15] + mode[16] + mode[17] -
        mode[18]);
 
-  n_b.vd[14 * para->number_of_nodes + (para->dim_x + x - 1) % para->dim_x +
-         para->dim_x * y +
-         para->dim_x * para->dim_y * ((z + 1) % para->dim_z)] =
+  n_b.populations[14 * para->number_of_nodes +
+                  (para->dim_x + x - 1) % para->dim_x + para->dim_x * y +
+                  para->dim_x * para->dim_y * ((z + 1) % para->dim_z)] =
       1.0f / 36.0f *
       (mode[0] - mode[1] + mode[3] + mode[4] + mode[5] - mode[6] - mode[8] -
        mode[10] + mode[12] + mode[13] + mode[15] + mode[16] + mode[17] -
        mode[18]);
 
-  n_b.vd[15 * para->number_of_nodes + x +
-         para->dim_x * ((y + 1) % para->dim_y) +
-         para->dim_x * para->dim_y * ((z + 1) % para->dim_z)] =
+  n_b.populations[15 * para->number_of_nodes + x +
+                  para->dim_x * ((y + 1) % para->dim_y) +
+                  para->dim_x * para->dim_y * ((z + 1) % para->dim_z)] =
       1.0f / 36.0f *
       (mode[0] + mode[2] + mode[3] + mode[4] - mode[5] - mode[6] + mode[9] +
        mode[11] + mode[12] - mode[14] - mode[15] + mode[16] - mode[17] -
        mode[18]);
 
-  n_b.vd[16 * para->number_of_nodes + x +
-         para->dim_x * ((para->dim_y + y - 1) % para->dim_y) +
-         para->dim_x * para->dim_y * ((para->dim_z + z - 1) % para->dim_z)] =
+  n_b.populations[16 * para->number_of_nodes + x +
+                  para->dim_x * ((para->dim_y + y - 1) % para->dim_y) +
+                  para->dim_x * para->dim_y *
+                      ((para->dim_z + z - 1) % para->dim_z)] =
       1.0f / 36.0f *
       (mode[0] - mode[2] - mode[3] + mode[4] - mode[5] - mode[6] + mode[9] -
        mode[11] - mode[12] + mode[14] + mode[15] + mode[16] - mode[17] -
        mode[18]);
 
-  n_b.vd[17 * para->number_of_nodes + x +
-         para->dim_x * ((y + 1) % para->dim_y) +
-         para->dim_x * para->dim_y * ((para->dim_z + z - 1) % para->dim_z)] =
+  n_b.populations[17 * para->number_of_nodes + x +
+                  para->dim_x * ((y + 1) % para->dim_y) +
+                  para->dim_x * para->dim_y *
+                      ((para->dim_z + z - 1) % para->dim_z)] =
       1.0f / 36.0f *
       (mode[0] + mode[2] - mode[3] + mode[4] - mode[5] - mode[6] - mode[9] +
        mode[11] - mode[12] - mode[14] + mode[15] + mode[16] - mode[17] -
        mode[18]);
 
-  n_b.vd[18 * para->number_of_nodes + x +
-         para->dim_x * ((para->dim_y + y - 1) % para->dim_y) +
-         para->dim_x * para->dim_y * ((z + 1) % para->dim_z)] =
+  n_b.populations[18 * para->number_of_nodes + x +
+                  para->dim_x * ((para->dim_y + y - 1) % para->dim_y) +
+                  para->dim_x * para->dim_y * ((z + 1) % para->dim_z)] =
       1.0f / 36.0f *
       (mode[0] - mode[2] + mode[3] + mode[4] - mode[5] - mode[6] - mode[9] -
        mode[11] + mode[12] + mode[14] - mode[15] + mode[16] - mode[17] -
@@ -804,7 +820,7 @@ __device__ void bounce_back_boundaries(LB_nodes_gpu n_curr,
   float shift, weight, pop_to_bounce_back;
   float boundary_force[3] = {0.0f, 0.0f, 0.0f};
   size_t to_index, to_index_x, to_index_y, to_index_z;
-  int population, inverse;
+  unsigned population, inverse;
 
   if (boundaries.index[index] != 0) {
     auto const v = boundaries.velocity[index];
@@ -815,7 +831,7 @@ __device__ void bounce_back_boundaries(LB_nodes_gpu n_curr,
     unsigned int y = xyz.y;
     unsigned int z = xyz.z;
 
-    /* store vd temporary in second lattice to avoid race conditions */
+    /* store populations temporarily in second lattice to avoid data races */
 
     // TODO : PUT IN EQUILIBRIUM CONTRIBUTION TO THE BOUNCE-BACK DENSITY FOR THE
     // BOUNDARY FORCE
@@ -827,10 +843,11 @@ __device__ void bounce_back_boundaries(LB_nodes_gpu n_curr,
   shift = 2.0f / para->agrid * para->rho * 3.0f * weight * para->tau *         \
           (v[0] * static_cast<float>(c[0]) + v[1] * static_cast<float>(c[1]) + \
            v[2] * static_cast<float>(c[2]));                                   \
-  pop_to_bounce_back = n_curr.vd[population * para->number_of_nodes + index];  \
-  to_index_x = (x + c[0] + para->dim_x) % para->dim_x;                         \
-  to_index_y = (y + c[1] + para->dim_y) % para->dim_y;                         \
-  to_index_z = (z + c[2] + para->dim_z) % para->dim_z;                         \
+  pop_to_bounce_back =                                                         \
+      n_curr.populations[population * para->number_of_nodes + index];          \
+  to_index_x = (x + static_cast<unsigned>(c[0]) + para->dim_x) % para->dim_x;  \
+  to_index_y = (y + static_cast<unsigned>(c[1]) + para->dim_y) % para->dim_y;  \
+  to_index_z = (z + static_cast<unsigned>(c[2]) + para->dim_z) % para->dim_z;  \
   to_index = to_index_x + para->dim_x * to_index_y +                           \
              para->dim_x * para->dim_y * to_index_z;                           \
   if (n_curr.boundary[to_index] == 0) {                                        \
@@ -840,7 +857,7 @@ __device__ void bounce_back_boundaries(LB_nodes_gpu n_curr,
         (2.0f * pop_to_bounce_back + shift) * static_cast<float>(c[1]);        \
     boundary_force[2] +=                                                       \
         (2.0f * pop_to_bounce_back + shift) * static_cast<float>(c[2]);        \
-    n_curr.vd[inverse * para->number_of_nodes + to_index] =                    \
+    n_curr.populations[inverse * para->number_of_nodes + to_index] =           \
         pop_to_bounce_back + shift;                                            \
   }
 
@@ -1063,9 +1080,82 @@ __device__ void apply_forces(unsigned int index, Utils::Array<float, 19> &mode,
   reset_LB_force_densities(index, node_f);
 }
 
+/** Average the stress modes over the pre- and post-collision states.
+ *  @param[in] rho_v  Local density and velocity
+ *  @param[in] modes  Local register values modes
+ *  @return Copy of @p modes with modes 4 to 9 replaced by averaged values
+ */
+__device__ Utils::Array<float, 19>
+stress_modes(LB_rho_v_gpu const &rho_v, const Utils::Array<float, 19> &modes) {
+  /* note that rho_v.v[] already includes the 1/2 f term, accounting
+   * for the pre- and post-collisional average */
+  auto const density = rho_v.rho;
+  Utils::Array<float, 3> j{density * rho_v.v[0], density * rho_v.v[1],
+                           density * rho_v.v[2]};
+  // equilibrium part of the stress modes, which comes from
+  // the equality between modes and stress tensor components
+  /* m4 = trace(pi) - rho
+     m5 = pi_xx - pi_yy
+     m6 = trace(pi) - 3 pi_zz
+     m7 = pi_xy
+     m8 = pi_xz
+     m9 = pi_yz */
+  // and plugging in the Euler stress for the equilibrium:
+  // pi_eq = rho_0*c_s^2*I3 + (j \otimes j)/rho
+  // with I3 the 3D identity matrix and
+  // rho = \trace(rho_0*c_s^2*I3), which yields
+  /* m4_from_pi_eq = j.j
+     m5_from_pi_eq = j_x*j_x - j_y*j_y
+     m6_from_pi_eq = j.j - 3*j_z*j_z
+     m7_from_pi_eq = j_x*j_y
+     m8_from_pi_eq = j_x*j_z
+     m9_from_pi_eq = j_y*j_z */
+  // where the / density term has been dropped above. We thus obtain:
+  Utils::Array<float, 6> modes_from_pi_eq{
+      (j[0] * j[0] + j[1] * j[1] + j[2] * j[2]) / density,
+      (j[0] * j[0] - j[1] * j[1]) / density,
+      (j[0] * j[0] + j[1] * j[1] + j[2] * j[2] - 3.0f * j[2] * j[2]) / density,
+      j[0] * j[1] / density,
+      j[0] * j[2] / density,
+      j[1] * j[2] / density};
+  /* Now we must predict the outcome of the next collision */
+  /* We immediately average pre- and post-collision. */
+  /* TODO: need a reference for this. */
+  auto res = modes;
+  res[4] = modes_from_pi_eq[0] +
+           (0.5f + 0.5f * para->gamma_bulk) * (modes[4] - modes_from_pi_eq[0]);
+  res[5] = modes_from_pi_eq[1] +
+           (0.5f + 0.5f * para->gamma_shear) * (modes[5] - modes_from_pi_eq[1]);
+  res[6] = modes_from_pi_eq[2] +
+           (0.5f + 0.5f * para->gamma_shear) * (modes[6] - modes_from_pi_eq[2]);
+  res[7] = modes_from_pi_eq[3] +
+           (0.5f + 0.5f * para->gamma_shear) * (modes[7] - modes_from_pi_eq[3]);
+  res[8] = modes_from_pi_eq[4] +
+           (0.5f + 0.5f * para->gamma_shear) * (modes[8] - modes_from_pi_eq[4]);
+  res[9] = modes_from_pi_eq[5] +
+           (0.5f + 0.5f * para->gamma_shear) * (modes[9] - modes_from_pi_eq[5]);
+  return res;
+}
+
+/** Transform the stress tensor components according to the modes that
+ *  correspond to those used by U. Schiller. In terms of populations this
+ *  expression then corresponds exactly to those in eq. (116)-(121) in
+ *  @cite dunweg07a, when these are written out in populations.
+ *  But to ensure this, the expression in Schiller's modes has to be
+ *  different! Returned component order: xx, xy, yy, xz, yz, zz. */
+__device__ Utils::Array<float, 6>
+stress_from_stress_modes(Utils::Array<float, 19> const &modes) {
+  return {(2.0f * (modes[0] + modes[4]) + modes[6] + 3.0f * modes[5]) / 6.0f,
+          modes[7],
+          (2.0f * (modes[0] + modes[4]) + modes[6] - 3.0f * modes[5]) / 6.0f,
+          modes[8],
+          modes[9],
+          (modes[0] + modes[4] - modes[6]) / 3.0f};
+}
+
 /** Calculate hydrodynamic fields in LB units
  *  @param[in]  n_a     Local node residing in array a for boundary flag
- *  @param[out] mode    Local register values mode
+ *  @param[in]  modes   Local register values modes
  *  @param[out] d_p_v   Local print values
  *  @param[out] d_v     Local device values
  *  @param[in]  node_f  Local node force
@@ -1074,111 +1164,26 @@ __device__ void apply_forces(unsigned int index, Utils::Array<float, 19> &mode,
  *  TODO: code duplication with \ref calc_values_from_m
  */
 __device__ void
-calc_values_in_LB_units(LB_nodes_gpu n_a, Utils::Array<float, 19> &mode,
+calc_values_in_LB_units(LB_nodes_gpu n_a, Utils::Array<float, 19> const &modes,
                         LB_rho_v_pi_gpu *d_p_v, LB_rho_v_gpu *d_v,
                         LB_node_force_density_gpu node_f, unsigned int index,
                         unsigned int print_index) {
-  Utils::Array<float, 3> j{};
-  Utils::Array<float, 6> modes_from_pi_eq{};
-  Utils::Array<float, 6> pi{};
 
   if (n_a.boundary[index] == 0) {
     /* Ensure we are working with the current values of d_v */
-
-    update_rho_v(mode, index, node_f, d_v);
+    update_rho_v(modes, index, node_f, d_v);
 
     d_p_v[print_index].rho = d_v[index].rho;
 
-    d_p_v[print_index].v[0] = d_v[index].v[0];
-    d_p_v[print_index].v[1] = d_v[index].v[1];
-    d_p_v[print_index].v[2] = d_v[index].v[2];
-    /* stress calculation */
-    float Rho = d_v[index].rho;
+    d_p_v[print_index].v = d_v[index].v;
+    auto const modes_tmp = stress_modes(d_v[index], modes);
 
-    /* note that d_v[index].v[] already includes the 1/2 f term, accounting
-     * for the pre- and post-collisional average
-     */
+    d_p_v[print_index].pi = stress_from_stress_modes(modes_tmp);
 
-    j[0] = Rho * d_v[index].v[0];
-    j[1] = Rho * d_v[index].v[1];
-    j[2] = Rho * d_v[index].v[2];
-
-    // equilibrium part of the stress modes, which comes from
-    // the equality between modes and stress tensor components
-
-    /* m4 = trace(pi) - rho
-       m5 = pi_xx - pi_yy
-       m6 = trace(pi) - 3 pi_zz
-       m7 = pi_xy
-       m8 = pi_xz
-       m9 = pi_yz */
-
-    // and plugging in the Euler stress for the equilibrium:
-    // pi_eq = rho_0*c_s^2*I3 + (j \otimes j)/rho
-    // with I3 the 3D identity matrix and
-    // rho = \trace(rho_0*c_s^2*I3), which yields
-
-    /* m4_from_pi_eq = j.j
-       m5_from_pi_eq = j_x*j_x - j_y*j_y
-       m6_from_pi_eq = j.j - 3*j_z*j_z
-       m7_from_pi_eq = j_x*j_y
-       m8_from_pi_eq = j_x*j_z
-       m9_from_pi_eq = j_y*j_z */
-
-    // where the / Rho term has been dropped. We thus obtain:
-
-    modes_from_pi_eq[0] = (j[0] * j[0] + j[1] * j[1] + j[2] * j[2]) / Rho;
-    modes_from_pi_eq[1] = (j[0] * j[0] - j[1] * j[1]) / Rho;
-    modes_from_pi_eq[2] =
-        (j[0] * j[0] + j[1] * j[1] + j[2] * j[2] - 3.0f * j[2] * j[2]) / Rho;
-    modes_from_pi_eq[3] = j[0] * j[1] / Rho;
-    modes_from_pi_eq[4] = j[0] * j[2] / Rho;
-    modes_from_pi_eq[5] = j[1] * j[2] / Rho;
-
-    /* Now we must predict the outcome of the next collision */
-    /* We immediately average pre- and post-collision. */
-    /* TODO: need a reference for this. */
-
-    mode[4] = modes_from_pi_eq[0] + (0.5f + 0.5f * para->gamma_bulk) *
-                                        (mode[4] - modes_from_pi_eq[0]);
-    mode[5] = modes_from_pi_eq[1] + (0.5f + 0.5f * para->gamma_shear) *
-                                        (mode[5] - modes_from_pi_eq[1]);
-    mode[6] = modes_from_pi_eq[2] + (0.5f + 0.5f * para->gamma_shear) *
-                                        (mode[6] - modes_from_pi_eq[2]);
-    mode[7] = modes_from_pi_eq[3] + (0.5f + 0.5f * para->gamma_shear) *
-                                        (mode[7] - modes_from_pi_eq[3]);
-    mode[8] = modes_from_pi_eq[4] + (0.5f + 0.5f * para->gamma_shear) *
-                                        (mode[8] - modes_from_pi_eq[4]);
-    mode[9] = modes_from_pi_eq[5] + (0.5f + 0.5f * para->gamma_shear) *
-                                        (mode[9] - modes_from_pi_eq[5]);
-
-    // Transform the stress tensor components according to the modes that
-    // correspond to those used by U. Schiller. In terms of populations this
-    // expression then corresponds exactly to those in eq. (116)-(121) in
-    // @cite dunweg07a, when these are written out in populations.
-    // But to ensure this, the expression in Schiller's modes has to be
-    // different!
-
-    pi[0] +=
-        (2.0f * (mode[0] + mode[4]) + mode[6] + 3.0f * mode[5]) / 6.0f; // xx
-    pi[1] += mode[7];                                                   // xy
-    pi[2] +=
-        (2.0f * (mode[0] + mode[4]) + mode[6] - 3.0f * mode[5]) / 6.0f; // yy
-    pi[3] += mode[8];                                                   // xz
-    pi[4] += mode[9];                                                   // yz
-    pi[5] += (mode[0] + mode[4] - mode[6]) / 3.0f;                      // zz
-
-    for (int i = 0; i < 6; i++) {
-      d_p_v[print_index].pi[i] = pi[i];
-    }
   } else {
     d_p_v[print_index].rho = 0.0f;
-
-    for (auto &val : d_p_v[print_index].v)
-      val = 0.0f;
-
-    for (auto &val : d_p_v[print_index].pi)
-      val = 0.0f;
+    d_p_v[print_index].v = {};
+    d_p_v[print_index].pi = {};
   }
 }
 
@@ -1189,76 +1194,20 @@ calc_values_in_LB_units(LB_nodes_gpu n_a, Utils::Array<float, 19> &mode,
  *  @param[out] j_out         Momentum
  *  @param[out] pi_out        Pressure tensor
  */
-__device__ void calc_values_from_m(Utils::Array<float, 19> &mode_single,
-                                   LB_rho_v_gpu *d_v_single, float *rho_out,
-                                   float *j_out, float *pi_out) {
-  Utils::Array<float, 6> modes_from_pi_eq{};
-  Utils::Array<float, 6> j{};
-  float Rho;
-
-  // stress calculation
-
-  // Set the rho output value
-
-  Rho = d_v_single->rho;
-  *rho_out = d_v_single->rho;
-
-  // note that d_v_single->v[] already includes the 1/2 f term,
-  // accounting for the pre- and post-collisional average
-
-  j[0] = Rho * d_v_single->v[0];
-  j[1] = Rho * d_v_single->v[1];
-  j[2] = Rho * d_v_single->v[2];
-
-  j_out[3] = j[0];
-  j_out[3] = j[1];
-  j_out[3] = j[2];
-
-  // equilibrium part of the stress modes, which comes from
-  // the equality between modes and stress tensor components
-
-  modes_from_pi_eq[0] = (j[0] * j[0] + j[1] * j[1] + j[2] * j[2]) / Rho;
-  modes_from_pi_eq[1] = (j[0] * j[0] - j[1] * j[1]) / Rho;
-  modes_from_pi_eq[2] =
-      (j[0] * j[0] + j[1] * j[1] + j[2] * j[2] - 3.0f * j[2] * j[2]) / Rho;
-  modes_from_pi_eq[3] = j[0] * j[1] / Rho;
-  modes_from_pi_eq[4] = j[0] * j[2] / Rho;
-  modes_from_pi_eq[5] = j[1] * j[2] / Rho;
+__device__ void calc_values_from_m(Utils::Array<float, 19> const &mode_single,
+                                   LB_rho_v_gpu const &d_v_single,
+                                   float *rho_out, float *j_out,
+                                   Utils::Array<float, 6> &pi_out) {
+  *rho_out = d_v_single.rho;
+  float Rho = d_v_single.rho;
+  j_out[0] = Rho * d_v_single.v[0];
+  j_out[1] = Rho * d_v_single.v[1];
+  j_out[2] = Rho * d_v_single.v[2];
 
   // Now we must predict the outcome of the next collision
   // We immediately average pre- and post-collision.
-
-  mode_single[4] =
-      modes_from_pi_eq[0] +
-      (0.5f + 0.5f * para->gamma_bulk) * (mode_single[4] - modes_from_pi_eq[0]);
-  mode_single[5] =
-      modes_from_pi_eq[1] + (0.5f + 0.5f * para->gamma_shear) *
-                                (mode_single[5] - modes_from_pi_eq[1]);
-  mode_single[6] =
-      modes_from_pi_eq[2] + (0.5f + 0.5f * para->gamma_shear) *
-                                (mode_single[6] - modes_from_pi_eq[2]);
-  mode_single[7] =
-      modes_from_pi_eq[3] + (0.5f + 0.5f * para->gamma_shear) *
-                                (mode_single[7] - modes_from_pi_eq[3]);
-  mode_single[8] =
-      modes_from_pi_eq[4] + (0.5f + 0.5f * para->gamma_shear) *
-                                (mode_single[8] - modes_from_pi_eq[4]);
-  mode_single[9] =
-      modes_from_pi_eq[5] + (0.5f + 0.5f * para->gamma_shear) *
-                                (mode_single[9] - modes_from_pi_eq[5]);
-
   // Transform the stress tensor components according to the mode_singles.
-
-  pi_out[0] = (2.0f * (mode_single[0] + mode_single[4]) + mode_single[6] +
-               3.0f * mode_single[5]) /
-              6.0f;           // xx
-  pi_out[1] = mode_single[7]; // xy
-  pi_out[2] = (2.0f * (mode_single[0] + mode_single[4]) + mode_single[6] -
-               3.0f * mode_single[5]) /
-              6.0f;                                                      // yy
-  pi_out[3] = mode_single[8];                                            // xz
-  pi_out[4] = mode_single[9];                                            // yz
-  pi_out[5] = (mode_single[0] + mode_single[4] - mode_single[6]) / 3.0f; // zz
+  pi_out = stress_from_stress_modes(stress_modes(d_v_single, mode_single));
 }
 
 /**
@@ -1269,59 +1218,59 @@ __device__ void calc_values_from_m(Utils::Array<float, 19> &mode_single,
 __device__ void calc_mode(Utils::Array<float, 4> &mode, LB_nodes_gpu n_a,
                           unsigned int node_index) {
   /* mass mode */
-  mode[0] = n_a.vd[0 * para->number_of_nodes + node_index] +
-            n_a.vd[1 * para->number_of_nodes + node_index] +
-            n_a.vd[2 * para->number_of_nodes + node_index] +
-            n_a.vd[3 * para->number_of_nodes + node_index] +
-            n_a.vd[4 * para->number_of_nodes + node_index] +
-            n_a.vd[5 * para->number_of_nodes + node_index] +
-            n_a.vd[6 * para->number_of_nodes + node_index] +
-            n_a.vd[7 * para->number_of_nodes + node_index] +
-            n_a.vd[8 * para->number_of_nodes + node_index] +
-            n_a.vd[9 * para->number_of_nodes + node_index] +
-            n_a.vd[10 * para->number_of_nodes + node_index] +
-            n_a.vd[11 * para->number_of_nodes + node_index] +
-            n_a.vd[12 * para->number_of_nodes + node_index] +
-            n_a.vd[13 * para->number_of_nodes + node_index] +
-            n_a.vd[14 * para->number_of_nodes + node_index] +
-            n_a.vd[15 * para->number_of_nodes + node_index] +
-            n_a.vd[16 * para->number_of_nodes + node_index] +
-            n_a.vd[17 * para->number_of_nodes + node_index] +
-            n_a.vd[18 * para->number_of_nodes + node_index];
+  mode[0] = n_a.populations[0 * para->number_of_nodes + node_index] +
+            n_a.populations[1 * para->number_of_nodes + node_index] +
+            n_a.populations[2 * para->number_of_nodes + node_index] +
+            n_a.populations[3 * para->number_of_nodes + node_index] +
+            n_a.populations[4 * para->number_of_nodes + node_index] +
+            n_a.populations[5 * para->number_of_nodes + node_index] +
+            n_a.populations[6 * para->number_of_nodes + node_index] +
+            n_a.populations[7 * para->number_of_nodes + node_index] +
+            n_a.populations[8 * para->number_of_nodes + node_index] +
+            n_a.populations[9 * para->number_of_nodes + node_index] +
+            n_a.populations[10 * para->number_of_nodes + node_index] +
+            n_a.populations[11 * para->number_of_nodes + node_index] +
+            n_a.populations[12 * para->number_of_nodes + node_index] +
+            n_a.populations[13 * para->number_of_nodes + node_index] +
+            n_a.populations[14 * para->number_of_nodes + node_index] +
+            n_a.populations[15 * para->number_of_nodes + node_index] +
+            n_a.populations[16 * para->number_of_nodes + node_index] +
+            n_a.populations[17 * para->number_of_nodes + node_index] +
+            n_a.populations[18 * para->number_of_nodes + node_index];
 
   /* momentum modes */
-  mode[1] = (n_a.vd[1 * para->number_of_nodes + node_index] -
-             n_a.vd[2 * para->number_of_nodes + node_index]) +
-            (n_a.vd[7 * para->number_of_nodes + node_index] -
-             n_a.vd[8 * para->number_of_nodes + node_index]) +
-            (n_a.vd[9 * para->number_of_nodes + node_index] -
-             n_a.vd[10 * para->number_of_nodes + node_index]) +
-            (n_a.vd[11 * para->number_of_nodes + node_index] -
-             n_a.vd[12 * para->number_of_nodes + node_index]) +
-            (n_a.vd[13 * para->number_of_nodes + node_index] -
-             n_a.vd[14 * para->number_of_nodes + node_index]);
-
-  mode[2] = (n_a.vd[3 * para->number_of_nodes + node_index] -
-             n_a.vd[4 * para->number_of_nodes + node_index]) +
-            (n_a.vd[7 * para->number_of_nodes + node_index] -
-             n_a.vd[8 * para->number_of_nodes + node_index]) -
-            (n_a.vd[9 * para->number_of_nodes + node_index] -
-             n_a.vd[10 * para->number_of_nodes + node_index]) +
-            (n_a.vd[15 * para->number_of_nodes + node_index] -
-             n_a.vd[16 * para->number_of_nodes + node_index]) +
-            (n_a.vd[17 * para->number_of_nodes + node_index] -
-             n_a.vd[18 * para->number_of_nodes + node_index]);
-
-  mode[3] = (n_a.vd[5 * para->number_of_nodes + node_index] -
-             n_a.vd[6 * para->number_of_nodes + node_index]) +
-            (n_a.vd[11 * para->number_of_nodes + node_index] -
-             n_a.vd[12 * para->number_of_nodes + node_index]) -
-            (n_a.vd[13 * para->number_of_nodes + node_index] -
-             n_a.vd[14 * para->number_of_nodes + node_index]) +
-            (n_a.vd[15 * para->number_of_nodes + node_index] -
-             n_a.vd[16 * para->number_of_nodes + node_index]) -
-            (n_a.vd[17 * para->number_of_nodes + node_index] -
-             n_a.vd[18 * para->number_of_nodes + node_index]);
+  mode[1] = (n_a.populations[1 * para->number_of_nodes + node_index] -
+             n_a.populations[2 * para->number_of_nodes + node_index]) +
+            (n_a.populations[7 * para->number_of_nodes + node_index] -
+             n_a.populations[8 * para->number_of_nodes + node_index]) +
+            (n_a.populations[9 * para->number_of_nodes + node_index] -
+             n_a.populations[10 * para->number_of_nodes + node_index]) +
+            (n_a.populations[11 * para->number_of_nodes + node_index] -
+             n_a.populations[12 * para->number_of_nodes + node_index]) +
+            (n_a.populations[13 * para->number_of_nodes + node_index] -
+             n_a.populations[14 * para->number_of_nodes + node_index]);
+
+  mode[2] = (n_a.populations[3 * para->number_of_nodes + node_index] -
+             n_a.populations[4 * para->number_of_nodes + node_index]) +
+            (n_a.populations[7 * para->number_of_nodes + node_index] -
+             n_a.populations[8 * para->number_of_nodes + node_index]) -
+            (n_a.populations[9 * para->number_of_nodes + node_index] -
+             n_a.populations[10 * para->number_of_nodes + node_index]) +
+            (n_a.populations[15 * para->number_of_nodes + node_index] -
+             n_a.populations[16 * para->number_of_nodes + node_index]) +
+            (n_a.populations[17 * para->number_of_nodes + node_index] -
+             n_a.populations[18 * para->number_of_nodes + node_index]);
+
+  mode[3] = (n_a.populations[5 * para->number_of_nodes + node_index] -
+             n_a.populations[6 * para->number_of_nodes + node_index]) +
+            (n_a.populations[11 * para->number_of_nodes + node_index] -
+             n_a.populations[12 * para->number_of_nodes + node_index]) -
+            (n_a.populations[13 * para->number_of_nodes + node_index] -
+             n_a.populations[14 * para->number_of_nodes + node_index]) +
+            (n_a.populations[15 * para->number_of_nodes + node_index] -
+             n_a.populations[16 * para->number_of_nodes + node_index]) -
+            (n_a.populations[17 * para->number_of_nodes + node_index] -
+             n_a.populations[18 * para->number_of_nodes + node_index]);
 }
 
 /** Calculate temperature of the fluid kernel
@@ -1368,10 +1317,9 @@ __device__ __inline__ float three_point_polynomial_larger_than_half(float u) {
 
 /**
  * @brief Get velocity of at index.
- *
  */
 __device__ __inline__ float3 node_velocity(float rho_eq, LB_nodes_gpu n_a,
-                                           int index) {
+                                           unsigned index) {
   auto const boundary_index = n_a.boundary[index];
 
   if (boundary_index) {
@@ -1394,7 +1342,7 @@ velocity_interpolation(LB_nodes_gpu n_a, float const *particle_position,
   Utils::Array<int, 3> center_node_index{};
   Utils::Array<float3, 3> temp_delta{};
 
-  for (int i = 0; i < 3; ++i) {
+  for (unsigned i = 0; i < 3; ++i) {
     // position of particle in units of agrid.
     auto const scaled_pos = particle_position[i] / para->agrid - 0.5f;
     center_node_index[i] = static_cast<int>(rint(scaled_pos));
@@ -1431,7 +1379,7 @@ velocity_interpolation(LB_nodes_gpu n_a, float const *particle_position,
     return ind;
   };
 
-  int cnt = 0;
+  unsigned cnt = 0;
   float3 interpolated_u{0.0f, 0.0f, 0.0f};
 #pragma unroll 1
   for (int i = 0; i < 3; ++i) {
@@ -1446,7 +1394,7 @@ velocity_interpolation(LB_nodes_gpu n_a, float const *particle_position,
         auto const z = fold_if_necessary(center_node_index[2] - 1 + k,
                                          static_cast<int>(para->dim_z));
         delta[cnt] = temp_delta[i].x * temp_delta[j].y * temp_delta[k].z;
-        auto const index = xyz_to_index(x, y, z);
+        auto const index = static_cast<unsigned>(xyz_to_index(x, y, z));
         node_indices[cnt] = index;
 
         auto const node_u = node_velocity(para->rho, n_a, index);
@@ -1477,7 +1425,7 @@ velocity_interpolation(LB_nodes_gpu n_a, float const *particle_position,
   Utils::Array<float, 6> temp_delta;
   // Eq. (10) and (11) in @cite ahlrichs99a page 8227
 #pragma unroll
-  for (int i = 0; i < 3; ++i) {
+  for (unsigned i = 0; i < 3; ++i) {
     auto const scaledpos = particle_position[i] / para->agrid - 0.5f;
     left_node_index[i] = static_cast<int>(floorf(scaledpos));
     temp_delta[3 + i] = scaledpos - static_cast<float>(left_node_index[i]);
@@ -1501,26 +1449,23 @@ velocity_interpolation(LB_nodes_gpu n_a, float const *particle_position,
                 static_cast<int>(para->dim_y);
   int const z = (left_node_index[2] + static_cast<int>(para->dim_z)) %
                 static_cast<int>(para->dim_z);
-  auto xp1 = x + 1;
-  auto yp1 = y + 1;
-  auto zp1 = z + 1;
   auto fold_if_necessary = [](int ind, int dim) {
     return ind >= dim ? ind % dim : ind;
   };
-  xp1 = fold_if_necessary(xp1, static_cast<int>(para->dim_x));
-  yp1 = fold_if_necessary(yp1, static_cast<int>(para->dim_y));
-  zp1 = fold_if_necessary(zp1, static_cast<int>(para->dim_z));
-  node_index[0] = xyz_to_index(x, y, z);
-  node_index[1] = xyz_to_index(xp1, y, z);
-  node_index[2] = xyz_to_index(x, yp1, z);
-  node_index[3] = xyz_to_index(xp1, yp1, z);
-  node_index[4] = xyz_to_index(x, y, zp1);
-  node_index[5] = xyz_to_index(xp1, y, zp1);
-  node_index[6] = xyz_to_index(x, yp1, zp1);
-  node_index[7] = xyz_to_index(xp1, yp1, zp1);
+  auto const xp1 = fold_if_necessary(x + 1, static_cast<int>(para->dim_x));
+  auto const yp1 = fold_if_necessary(y + 1, static_cast<int>(para->dim_y));
+  auto const zp1 = fold_if_necessary(z + 1, static_cast<int>(para->dim_z));
+  node_index[0] = static_cast<unsigned>(xyz_to_index(x, y, z));
+  node_index[1] = static_cast<unsigned>(xyz_to_index(xp1, y, z));
+  node_index[2] = static_cast<unsigned>(xyz_to_index(x, yp1, z));
+  node_index[3] = static_cast<unsigned>(xyz_to_index(xp1, yp1, z));
+  node_index[4] = static_cast<unsigned>(xyz_to_index(x, y, zp1));
+  node_index[5] = static_cast<unsigned>(xyz_to_index(xp1, y, zp1));
+  node_index[6] = static_cast<unsigned>(xyz_to_index(x, yp1, zp1));
+  node_index[7] = static_cast<unsigned>(xyz_to_index(xp1, yp1, zp1));
 
   float3 interpolated_u{0.0f, 0.0f, 0.0f};
-  for (int i = 0; i < 8; ++i) {
+  for (unsigned i = 0; i < 8; ++i) {
     auto const node_u = node_velocity(para->rho, n_a, node_index[i]);
     interpolated_u.x += delta[i] * node_u.x;
     interpolated_u.y += delta[i] * node_u.y;
@@ -1541,6 +1486,7 @@ velocity_interpolation(LB_nodes_gpu n_a, float const *particle_position,
  *  @param[in]  d_v                Local device values
  *  @param[in]  flag_cs            Determine if we are at the centre (0,
  *                                 typical) or at the source (1, swimmer only)
+ *  @param[in]  philox_counter     Counter passed to the Philox RNG
  *  @param[in]  friction           Friction constant for the particle coupling
  *  @tparam no_of_neighbours       The number of neighbours to consider for
  *                                 interpolation
@@ -1635,7 +1581,8 @@ __device__ void calc_viscous_force(
   if (para->kT > 0.0) {
     /* add stochastic force of zero mean (eq. (15) @cite ahlrichs99a) */
     float4 random_floats = random_wrapper_philox(
-        particle_data[part_index].identity, LBQ * 32, philox_counter);
+        static_cast<unsigned>(particle_data[part_index].identity), LBQ * 32,
+        philox_counter);
     /* lb_coupl_pref is stored in MD units (force).
      * Eq. (16) @cite ahlrichs99a.
      * The factor 12 comes from the fact that we use random numbers
@@ -1697,8 +1644,8 @@ calc_node_force(Utils::Array<float, no_of_neighbours> const &delta,
                 float const *delta_j,
                 Utils::Array<unsigned int, no_of_neighbours> const &node_index,
                 LB_node_force_density_gpu node_f) {
-  for (int node = 0; node < no_of_neighbours; ++node) {
-    for (int i = 0; i < 3; ++i) {
+  for (std::size_t node = 0; node < no_of_neighbours; ++node) {
+    for (unsigned i = 0; i < 3; ++i) {
       atomicAdd(
           &(node_f.force_density[i * para->number_of_nodes + node_index[node]]),
           delta[node] * delta_j[i]);
@@ -1723,7 +1670,7 @@ calc_node_force(Utils::Array<float, no_of_neighbours> const &delta,
  */
 __global__ void calc_n_from_rho_j_pi(LB_nodes_gpu n_a, LB_rho_v_gpu *d_v,
                                      LB_node_force_density_gpu node_f,
-                                     int *gpu_check) {
+                                     bool *gpu_check) {
   /* TODO: this can handle only a uniform density, something similar, but local,
            has to be called every time the fields are set by the user ! */
   unsigned int index = blockIdx.y * gridDim.x * blockDim.x +
@@ -1731,9 +1678,9 @@ __global__ void calc_n_from_rho_j_pi(LB_nodes_gpu n_a, LB_rho_v_gpu *d_v,
   if (index < para->number_of_nodes) {
     Utils::Array<float, 19> mode;
 
-    /* default values for fields in lattice units */
-    gpu_check[0] = 1;
+    gpu_check[0] = true;
 
+    /* default values for fields in lattice units */
     float Rho = para->rho;
     Utils::Array<float, 3> v{};
     Utils::Array<float, 6> pi = {{Rho * D3Q19::c_sound_sq<float>, 0.0f,
@@ -1765,28 +1712,28 @@ __global__ void calc_n_from_rho_j_pi(LB_nodes_gpu n_a, LB_rho_v_gpu *d_v,
     float tmp1, tmp2;
 
     /* update the q=0 sublattice */
-    n_a.vd[(0) * para->number_of_nodes + index] =
+    n_a.populations[(0) * para->number_of_nodes + index] =
         1.0f / 3.0f * (local_rho - avg_rho) - 1.0f / 2.0f * trace;
 
     /* update the q=1 sublattice */
     rho_times_coeff = 1.0f / 18.0f * (local_rho - avg_rho);
 
-    n_a.vd[(1) * para->number_of_nodes + index] =
+    n_a.populations[(1) * para->number_of_nodes + index] =
         rho_times_coeff + 1.0f / 6.0f * local_j[0] + 1.0f / 4.0f * local_pi[0] -
         1.0f / 12.0f * trace;
-    n_a.vd[(2) * para->number_of_nodes + index] =
+    n_a.populations[(2) * para->number_of_nodes + index] =
         rho_times_coeff - 1.0f / 6.0f * local_j[0] + 1.0f / 4.0f * local_pi[0] -
         1.0f / 12.0f * trace;
-    n_a.vd[(3) * para->number_of_nodes + index] =
+    n_a.populations[(3) * para->number_of_nodes + index] =
         rho_times_coeff + 1.0f / 6.0f * local_j[1] + 1.0f / 4.0f * local_pi[2] -
         1.0f / 12.0f * trace;
-    n_a.vd[(4) * para->number_of_nodes + index] =
+    n_a.populations[(4) * para->number_of_nodes + index] =
         rho_times_coeff - 1.0f / 6.0f * local_j[1] + 1.0f / 4.0f * local_pi[2] -
         1.0f / 12.0f * trace;
-    n_a.vd[(5) * para->number_of_nodes + index] =
+    n_a.populations[(5) * para->number_of_nodes + index] =
         rho_times_coeff + 1.0f / 6.0f * local_j[2] + 1.0f / 4.0f * local_pi[5] -
         1.0f / 12.0f * trace;
-    n_a.vd[(6) * para->number_of_nodes + index] =
+    n_a.populations[(6) * para->number_of_nodes + index] =
         rho_times_coeff - 1.0f / 6.0f * local_j[2] + 1.0f / 4.0f * local_pi[5] -
         1.0f / 12.0f * trace;
 
@@ -1795,48 +1742,48 @@ __global__ void calc_n_from_rho_j_pi(LB_nodes_gpu n_a, LB_rho_v_gpu *d_v,
 
     tmp1 = local_pi[0] + local_pi[2];
     tmp2 = 2.0f * local_pi[1];
-    n_a.vd[(7) * para->number_of_nodes + index] =
+    n_a.populations[(7) * para->number_of_nodes + index] =
         rho_times_coeff + 1.0f / 12.0f * (local_j[0] + local_j[1]) +
         1.0f / 8.0f * (tmp1 + tmp2) - 1.0f / 24.0f * trace;
-    n_a.vd[(8) * para->number_of_nodes + index] =
+    n_a.populations[(8) * para->number_of_nodes + index] =
         rho_times_coeff - 1.0f / 12.0f * (local_j[0] + local_j[1]) +
         1.0f / 8.0f * (tmp1 + tmp2) - 1.0f / 24.0f * trace;
-    n_a.vd[(9) * para->number_of_nodes + index] =
+    n_a.populations[(9) * para->number_of_nodes + index] =
         rho_times_coeff + 1.0f / 12.0f * (local_j[0] - local_j[1]) +
         1.0f / 8.0f * (tmp1 - tmp2) - 1.0f / 24.0f * trace;
-    n_a.vd[(10) * para->number_of_nodes + index] =
+    n_a.populations[(10) * para->number_of_nodes + index] =
         rho_times_coeff - 1.0f / 12.0f * (local_j[0] - local_j[1]) +
         1.0f / 8.0f * (tmp1 - tmp2) - 1.0f / 24.0f * trace;
 
     tmp1 = local_pi[0] + local_pi[5];
     tmp2 = 2.0f * local_pi[3];
 
-    n_a.vd[(11) * para->number_of_nodes + index] =
+    n_a.populations[(11) * para->number_of_nodes + index] =
         rho_times_coeff + 1.0f / 12.0f * (local_j[0] + local_j[2]) +
         1.0f / 8.0f * (tmp1 + tmp2) - 1.0f / 24.0f * trace;
-    n_a.vd[(12) * para->number_of_nodes + index] =
+    n_a.populations[(12) * para->number_of_nodes + index] =
         rho_times_coeff - 1.0f / 12.0f * (local_j[0] + local_j[2]) +
         1.0f / 8.0f * (tmp1 + tmp2) - 1.0f / 24.0f * trace;
-    n_a.vd[(13) * para->number_of_nodes + index] =
+    n_a.populations[(13) * para->number_of_nodes + index] =
         rho_times_coeff + 1.0f / 12.0f * (local_j[0] - local_j[2]) +
         1.0f / 8.0f * (tmp1 - tmp2) - 1.0f / 24.0f * trace;
-    n_a.vd[(14) * para->number_of_nodes + index] =
+    n_a.populations[(14) * para->number_of_nodes + index] =
         rho_times_coeff - 1.0f / 12.0f * (local_j[0] - local_j[2]) +
         1.0f / 8.0f * (tmp1 - tmp2) - 1.0f / 24.0f * trace;
 
     tmp1 = local_pi[2] + local_pi[5];
     tmp2 = 2.0f * local_pi[4];
 
-    n_a.vd[(15) * para->number_of_nodes + index] =
+    n_a.populations[(15) * para->number_of_nodes + index] =
         rho_times_coeff + 1.0f / 12.0f * (local_j[1] + local_j[2]) +
         1.0f / 8.0f * (tmp1 + tmp2) - 1.0f / 24.0f * trace;
-    n_a.vd[(16) * para->number_of_nodes + index] =
+    n_a.populations[(16) * para->number_of_nodes + index] =
         rho_times_coeff - 1.0f / 12.0f * (local_j[1] + local_j[2]) +
         1.0f / 8.0f * (tmp1 + tmp2) - 1.0f / 24.0f * trace;
-    n_a.vd[(17) * para->number_of_nodes + index] =
+    n_a.populations[(17) * para->number_of_nodes + index] =
         rho_times_coeff + 1.0f / 12.0f * (local_j[1] - local_j[2]) +
         1.0f / 8.0f * (tmp1 - tmp2) - 1.0f / 24.0f * trace;
-    n_a.vd[(18) * para->number_of_nodes + index] =
+    n_a.populations[(18) * para->number_of_nodes + index] =
         rho_times_coeff - 1.0f / 12.0f * (local_j[1] - local_j[2]) +
         1.0f / 8.0f * (tmp1 - tmp2) - 1.0f / 24.0f * trace;
 
@@ -1845,7 +1792,7 @@ __global__ void calc_n_from_rho_j_pi(LB_nodes_gpu n_a, LB_rho_v_gpu *d_v,
   }
 }
 
-__global__ void set_force_density(int single_nodeindex,
+__global__ void set_force_density(unsigned single_nodeindex,
                                   float const *force_density,
                                   LB_node_force_density_gpu node_f) {
   unsigned int index = blockIdx.y * gridDim.x * blockDim.x +
@@ -1874,7 +1821,7 @@ __global__ void set_force_density(int single_nodeindex,
  *  @param[out] d_v               Local device values
  *  @param[in]  node_f            Node forces
  */
-__global__ void set_u_from_rho_v_pi(LB_nodes_gpu n_a, int single_nodeindex,
+__global__ void set_u_from_rho_v_pi(LB_nodes_gpu n_a, unsigned single_nodeindex,
                                     float const *velocity, LB_rho_v_gpu *d_v,
                                     LB_node_force_density_gpu node_f) {
   unsigned int index = blockIdx.y * gridDim.x * blockDim.x +
@@ -1891,7 +1838,7 @@ __global__ void set_u_from_rho_v_pi(LB_nodes_gpu n_a, int single_nodeindex,
     Utils::Array<float, 19> mode_for_pi;
     float rho_from_m;
     float j_from_m[3];
-    float pi_from_m[6];
+    Utils::Array<float, 6> pi_from_m;
 
     // Calculate the modes for this node
 
@@ -1904,7 +1851,7 @@ __global__ void set_u_from_rho_v_pi(LB_nodes_gpu n_a, int single_nodeindex,
     // Calculate the density, velocity, and pressure tensor
     // in LB unit for this node
 
-    calc_values_from_m(mode_for_pi, &d_v[single_nodeindex], &rho_from_m,
+    calc_values_from_m(mode_for_pi, d_v[single_nodeindex], &rho_from_m,
                        j_from_m, pi_from_m);
 
     // Take LB component density and calculate the equilibrium part
@@ -1929,29 +1876,29 @@ __global__ void set_u_from_rho_v_pi(LB_nodes_gpu n_a, int single_nodeindex,
 
     // update the q=0 sublattice
 
-    n_a.vd[(0) * para->number_of_nodes + single_nodeindex] =
+    n_a.populations[(0) * para->number_of_nodes + single_nodeindex] =
         1.0f / 3.0f * (local_rho - avg_rho) - 1.0f / 2.0f * trace;
 
     // update the q=1 sublattice
 
     rho_times_coeff = 1.0f / 18.0f * (local_rho - avg_rho);
 
-    n_a.vd[(1) * para->number_of_nodes + single_nodeindex] =
+    n_a.populations[(1) * para->number_of_nodes + single_nodeindex] =
         rho_times_coeff + 1.0f / 6.0f * local_j[0] + 1.0f / 4.0f * local_pi[0] -
         1.0f / 12.0f * trace;
-    n_a.vd[(2) * para->number_of_nodes + single_nodeindex] =
+    n_a.populations[(2) * para->number_of_nodes + single_nodeindex] =
         rho_times_coeff - 1.0f / 6.0f * local_j[0] + 1.0f / 4.0f * local_pi[0] -
         1.0f / 12.0f * trace;
-    n_a.vd[(3) * para->number_of_nodes + single_nodeindex] =
+    n_a.populations[(3) * para->number_of_nodes + single_nodeindex] =
         rho_times_coeff + 1.0f / 6.0f * local_j[1] + 1.0f / 4.0f * local_pi[2] -
         1.0f / 12.0f * trace;
-    n_a.vd[(4) * para->number_of_nodes + single_nodeindex] =
+    n_a.populations[(4) * para->number_of_nodes + single_nodeindex] =
         rho_times_coeff - 1.0f / 6.0f * local_j[1] + 1.0f / 4.0f * local_pi[2] -
         1.0f / 12.0f * trace;
-    n_a.vd[(5) * para->number_of_nodes + single_nodeindex] =
+    n_a.populations[(5) * para->number_of_nodes + single_nodeindex] =
         rho_times_coeff + 1.0f / 6.0f * local_j[2] + 1.0f / 4.0f * local_pi[5] -
         1.0f / 12.0f * trace;
-    n_a.vd[(6) * para->number_of_nodes + single_nodeindex] =
+    n_a.populations[(6) * para->number_of_nodes + single_nodeindex] =
         rho_times_coeff - 1.0f / 6.0f * local_j[2] + 1.0f / 4.0f * local_pi[5] -
         1.0f / 12.0f * trace;
 
@@ -1962,48 +1909,48 @@ __global__ void set_u_from_rho_v_pi(LB_nodes_gpu n_a, int single_nodeindex,
     tmp1 = local_pi[0] + local_pi[2];
     tmp2 = 2.0f * local_pi[1];
 
-    n_a.vd[(7) * para->number_of_nodes + single_nodeindex] =
+    n_a.populations[(7) * para->number_of_nodes + single_nodeindex] =
         rho_times_coeff + 1.0f / 12.0f * (local_j[0] + local_j[1]) +
         1.0f / 8.0f * (tmp1 + tmp2) - 1.0f / 24.0f * trace;
-    n_a.vd[(8) * para->number_of_nodes + single_nodeindex] =
+    n_a.populations[(8) * para->number_of_nodes + single_nodeindex] =
         rho_times_coeff - 1.0f / 12.0f * (local_j[0] + local_j[1]) +
         1.0f / 8.0f * (tmp1 + tmp2) - 1.0f / 24.0f * trace;
-    n_a.vd[(9) * para->number_of_nodes + single_nodeindex] =
+    n_a.populations[(9) * para->number_of_nodes + single_nodeindex] =
         rho_times_coeff + 1.0f / 12.0f * (local_j[0] - local_j[1]) +
         1.0f / 8.0f * (tmp1 - tmp2) - 1.0f / 24.0f * trace;
-    n_a.vd[(10) * para->number_of_nodes + single_nodeindex] =
+    n_a.populations[(10) * para->number_of_nodes + single_nodeindex] =
         rho_times_coeff - 1.0f / 12.0f * (local_j[0] - local_j[1]) +
         1.0f / 8.0f * (tmp1 - tmp2) - 1.0f / 24.0f * trace;
 
     tmp1 = local_pi[0] + local_pi[5];
     tmp2 = 2.0f * local_pi[3];
 
-    n_a.vd[(11) * para->number_of_nodes + single_nodeindex] =
+    n_a.populations[(11) * para->number_of_nodes + single_nodeindex] =
         rho_times_coeff + 1.0f / 12.0f * (local_j[0] + local_j[2]) +
         1.0f / 8.0f * (tmp1 + tmp2) - 1.0f / 24.0f * trace;
-    n_a.vd[(12) * para->number_of_nodes + single_nodeindex] =
+    n_a.populations[(12) * para->number_of_nodes + single_nodeindex] =
         rho_times_coeff - 1.0f / 12.0f * (local_j[0] + local_j[2]) +
         1.0f / 8.0f * (tmp1 + tmp2) - 1.0f / 24.0f * trace;
-    n_a.vd[(13) * para->number_of_nodes + single_nodeindex] =
+    n_a.populations[(13) * para->number_of_nodes + single_nodeindex] =
         rho_times_coeff + 1.0f / 12.0f * (local_j[0] - local_j[2]) +
         1.0f / 8.0f * (tmp1 - tmp2) - 1.0f / 24.0f * trace;
-    n_a.vd[(14) * para->number_of_nodes + single_nodeindex] =
+    n_a.populations[(14) * para->number_of_nodes + single_nodeindex] =
         rho_times_coeff - 1.0f / 12.0f * (local_j[0] - local_j[2]) +
         1.0f / 8.0f * (tmp1 - tmp2) - 1.0f / 24.0f * trace;
 
     tmp1 = local_pi[2] + local_pi[5];
     tmp2 = 2.0f * local_pi[4];
 
-    n_a.vd[(15) * para->number_of_nodes + single_nodeindex] =
+    n_a.populations[(15) * para->number_of_nodes + single_nodeindex] =
         rho_times_coeff + 1.0f / 12.0f * (local_j[1] + local_j[2]) +
         1.0f / 8.0f * (tmp1 + tmp2) - 1.0f / 24.0f * trace;
-    n_a.vd[(16) * para->number_of_nodes + single_nodeindex] =
+    n_a.populations[(16) * para->number_of_nodes + single_nodeindex] =
         rho_times_coeff - 1.0f / 12.0f * (local_j[1] + local_j[2]) +
         1.0f / 8.0f * (tmp1 + tmp2) - 1.0f / 24.0f * trace;
-    n_a.vd[(17) * para->number_of_nodes + single_nodeindex] =
+    n_a.populations[(17) * para->number_of_nodes + single_nodeindex] =
         rho_times_coeff + 1.0f / 12.0f * (local_j[1] - local_j[2]) +
         1.0f / 8.0f * (tmp1 - tmp2) - 1.0f / 24.0f * trace;
-    n_a.vd[(18) * para->number_of_nodes + single_nodeindex] =
+    n_a.populations[(18) * para->number_of_nodes + single_nodeindex] =
         rho_times_coeff - 1.0f / 12.0f * (local_j[1] - local_j[2]) +
         1.0f / 8.0f * (tmp1 - tmp2) - 1.0f / 24.0f * trace;
 
@@ -2060,7 +2007,7 @@ __global__ void reinit_node_force(LB_node_force_density_gpu node_f) {
  *  @param[in] d_v               Local modes
  */
 __global__ void set_rho(LB_nodes_gpu n_a, LB_rho_v_gpu *d_v,
-                        int single_nodeindex, float rho) {
+                        unsigned single_nodeindex, float rho) {
   unsigned int index = blockIdx.y * gridDim.x * blockDim.x +
                        blockDim.x * blockIdx.x + threadIdx.x;
   /* Note: this sets the velocities to zero */
@@ -2071,43 +2018,43 @@ __global__ void set_rho(LB_nodes_gpu n_a, LB_rho_v_gpu *d_v,
     local_rho = (rho - para->rho);
     d_v[single_nodeindex].rho = rho;
 
-    n_a.vd[0 * para->number_of_nodes + single_nodeindex] =
+    n_a.populations[0 * para->number_of_nodes + single_nodeindex] =
         1.0f / 3.0f * local_rho;
-    n_a.vd[1 * para->number_of_nodes + single_nodeindex] =
+    n_a.populations[1 * para->number_of_nodes + single_nodeindex] =
         1.0f / 18.0f * local_rho;
-    n_a.vd[2 * para->number_of_nodes + single_nodeindex] =
+    n_a.populations[2 * para->number_of_nodes + single_nodeindex] =
         1.0f / 18.0f * local_rho;
-    n_a.vd[3 * para->number_of_nodes + single_nodeindex] =
+    n_a.populations[3 * para->number_of_nodes + single_nodeindex] =
         1.0f / 18.0f * local_rho;
-    n_a.vd[4 * para->number_of_nodes + single_nodeindex] =
+    n_a.populations[4 * para->number_of_nodes + single_nodeindex] =
         1.0f / 18.0f * local_rho;
-    n_a.vd[5 * para->number_of_nodes + single_nodeindex] =
+    n_a.populations[5 * para->number_of_nodes + single_nodeindex] =
         1.0f / 18.0f * local_rho;
-    n_a.vd[6 * para->number_of_nodes + single_nodeindex] =
+    n_a.populations[6 * para->number_of_nodes + single_nodeindex] =
         1.0f / 18.0f * local_rho;
-    n_a.vd[7 * para->number_of_nodes + single_nodeindex] =
+    n_a.populations[7 * para->number_of_nodes + single_nodeindex] =
         1.0f / 36.0f * local_rho;
-    n_a.vd[8 * para->number_of_nodes + single_nodeindex] =
+    n_a.populations[8 * para->number_of_nodes + single_nodeindex] =
         1.0f / 36.0f * local_rho;
-    n_a.vd[9 * para->number_of_nodes + single_nodeindex] =
+    n_a.populations[9 * para->number_of_nodes + single_nodeindex] =
         1.0f / 36.0f * local_rho;
-    n_a.vd[10 * para->number_of_nodes + single_nodeindex] =
+    n_a.populations[10 * para->number_of_nodes + single_nodeindex] =
         1.0f / 36.0f * local_rho;
-    n_a.vd[11 * para->number_of_nodes + single_nodeindex] =
+    n_a.populations[11 * para->number_of_nodes + single_nodeindex] =
         1.0f / 36.0f * local_rho;
-    n_a.vd[12 * para->number_of_nodes + single_nodeindex] =
+    n_a.populations[12 * para->number_of_nodes + single_nodeindex] =
         1.0f / 36.0f * local_rho;
-    n_a.vd[13 * para->number_of_nodes + single_nodeindex] =
+    n_a.populations[13 * para->number_of_nodes + single_nodeindex] =
         1.0f / 36.0f * local_rho;
-    n_a.vd[14 * para->number_of_nodes + single_nodeindex] =
+    n_a.populations[14 * para->number_of_nodes + single_nodeindex] =
         1.0f / 36.0f * local_rho;
-    n_a.vd[15 * para->number_of_nodes + single_nodeindex] =
+    n_a.populations[15 * para->number_of_nodes + single_nodeindex] =
         1.0f / 36.0f * local_rho;
-    n_a.vd[16 * para->number_of_nodes + single_nodeindex] =
+    n_a.populations[16 * para->number_of_nodes + single_nodeindex] =
         1.0f / 36.0f * local_rho;
-    n_a.vd[17 * para->number_of_nodes + single_nodeindex] =
+    n_a.populations[17 * para->number_of_nodes + single_nodeindex] =
         1.0f / 36.0f * local_rho;
-    n_a.vd[18 * para->number_of_nodes + single_nodeindex] =
+    n_a.populations[18 * para->number_of_nodes + single_nodeindex] =
         1.0f / 36.0f * local_rho;
   }
 }
@@ -2115,12 +2062,14 @@ __global__ void set_rho(LB_nodes_gpu n_a, LB_rho_v_gpu *d_v,
 /** Set the boundary flag for all boundary nodes
  *  @param[in]  boundary_node_list    Indices of the boundary nodes
  *  @param[in]  boundary_index_list   Flag for the corresponding boundary
+ *  @param[in]  boundary_velocities   Boundary velocities
  *  @param[in]  number_of_boundnodes  Number of boundary nodes
+ *  @param[out] boundaries            Boundary flags and velocities to set
  */
 __global__ void init_boundaries(int const *boundary_node_list,
                                 int const *boundary_index_list,
                                 float const *boundary_velocities,
-                                int number_of_boundnodes,
+                                unsigned number_of_boundnodes,
                                 LB_boundaries_gpu boundaries) {
   unsigned int index = blockIdx.y * gridDim.x * blockDim.x +
                        blockDim.x * blockIdx.x + threadIdx.x;
@@ -2134,7 +2083,7 @@ __global__ void init_boundaries(int const *boundary_node_list,
         boundary_velocities[3 * (boundary_index - 1) + 1],
         boundary_velocities[3 * (boundary_index - 1) + 2]};
 
-    boundaries.index[node_index] = boundary_index;
+    boundaries.index[node_index] = static_cast<unsigned>(boundary_index);
     boundaries.velocity[node_index] = v;
   }
 }
@@ -2153,10 +2102,11 @@ __global__ void reset_boundaries(LB_boundaries_gpu boundaries) {
  *  @param[out]    n_b     Local node residing in array b
  *  @param[in,out] d_v     Local device values
  *  @param[in,out] node_f  Local node force density
+ *  @param[in]     philox_counter  Philox counter
  */
 __global__ void integrate(LB_nodes_gpu n_a, LB_nodes_gpu n_b, LB_rho_v_gpu *d_v,
                           LB_node_force_density_gpu node_f,
-                          unsigned int philox_counter) {
+                          uint64_t philox_counter) {
   /* every node is connected to a thread via the index */
   unsigned int index = blockIdx.y * gridDim.x * blockDim.x +
                        blockDim.x * blockIdx.x + threadIdx.x;
@@ -2202,9 +2152,11 @@ __global__ void integrate(LB_nodes_gpu n_a, LB_nodes_gpu n_b, LB_rho_v_gpu *d_v,
  *  @param[in,out]  particle_force  Particle force
  *  @param[out] node_f              Local node force
  *  @param[in]  d_v                 Local device values
+ *  @param[in]  couple_virtual      If true, virtual particles are also coupled
  *  @param[in]  friction            Friction constant for the particle coupling
+ *  @param[in]  philox_counter      Philox counter
  *  @tparam     no_of_neighbours    The number of neighbours to consider for
- * interpolation
+ *                                  interpolation
  */
 template <std::size_t no_of_neighbours>
 __global__ void calc_fluid_particle_ia(
@@ -2299,8 +2251,9 @@ __global__ void lb_get_boundaries(LB_nodes_gpu n_a,
  *  @param[out] d_v     Local device values
  *  @param[in]  node_f  Local node force
  */
-__global__ void lb_print_node(int single_nodeindex, LB_rho_v_pi_gpu *d_p_v,
-                              LB_nodes_gpu n_a, LB_rho_v_gpu *d_v,
+__global__ void lb_print_node(unsigned int single_nodeindex,
+                              LB_rho_v_pi_gpu *d_p_v, LB_nodes_gpu n_a,
+                              LB_rho_v_gpu *d_v,
                               LB_node_force_density_gpu node_f) {
   Utils::Array<float, 19> mode;
   unsigned int index = blockIdx.y * gridDim.x * blockDim.x +
@@ -2348,7 +2301,7 @@ __global__ void momentum(LB_nodes_gpu n_a, LB_rho_v_gpu *d_v,
  *  @param[out] device_flag       Result
  *  @param[in]  n_a               Local node residing in array a
  */
-__global__ void lb_get_boundary_flag(int single_nodeindex,
+__global__ void lb_get_boundary_flag(unsigned int single_nodeindex,
                                      unsigned int *device_flag,
                                      LB_nodes_gpu n_a) {
   unsigned int index = blockIdx.y * gridDim.x * blockDim.x +
@@ -2362,33 +2315,28 @@ __global__ void lb_get_boundary_flag(int single_nodeindex,
 /* Host functions to setup and call kernels*/
 /**********************************************************************/
 
-void lb_get_para_pointer(LB_parameters_gpu **pointeradress) {
-  if (cudaGetSymbolAddress((void **)pointeradress, para) != cudaSuccess) {
-    fprintf(stderr,
-            "Trouble getting address of LB parameters.\n"); // TODO give proper
-                                                            // error message
+void lb_get_para_pointer(LB_parameters_gpu **pointer_address) {
+  auto const error = cudaGetSymbolAddress((void **)pointer_address, para);
+  if (error != cudaSuccess) {
+    fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(error));
     errexit();
   }
 }
 
-void lb_get_lbpar_pointer(LB_parameters_gpu **pointeradress) {
-  *pointeradress = &lbpar_gpu;
-}
-
-void lb_get_boundary_force_pointer(float **pointeradress) {
+void lb_get_boundary_force_pointer(float **pointer_address) {
 #ifdef LB_BOUNDARIES_GPU
-  *pointeradress = lb_boundary_force;
+  *pointer_address = lb_boundary_force;
 #endif
 }
 
-void lb_get_device_values_pointer(LB_rho_v_gpu **pointeradress) {
-  *pointeradress = device_rho_v;
+void lb_get_device_values_pointer(LB_rho_v_gpu **pointer_address) {
+  *pointer_address = device_rho_v;
 }
 
 /** Initialization for the lb gpu fluid called from host
  *  @param lbpar_gpu   Pointer to parameters to setup the lb field
  */
-void lb_init_GPU(LB_parameters_gpu *lbpar_gpu) {
+void lb_init_GPU(const LB_parameters_gpu &lbpar_gpu) {
 #define free_realloc_and_clear(var, size)                                      \
   {                                                                            \
     if ((var) != nullptr)                                                      \
@@ -2397,8 +2345,8 @@ void lb_init_GPU(LB_parameters_gpu *lbpar_gpu) {
     cudaMemset(var, 0, size);                                                  \
   }
 
-  size_of_rho_v = lbpar_gpu->number_of_nodes * sizeof(LB_rho_v_gpu);
-  size_of_rho_v_pi = lbpar_gpu->number_of_nodes * sizeof(LB_rho_v_pi_gpu);
+  size_of_rho_v = lbpar_gpu.number_of_nodes * sizeof(LB_rho_v_gpu);
+  size_of_rho_v_pi = lbpar_gpu.number_of_nodes * sizeof(LB_rho_v_pi_gpu);
 
   /* Allocate structs in device memory*/
   free_realloc_and_clear(device_rho_v, size_of_rho_v);
@@ -2406,61 +2354,51 @@ void lb_init_GPU(LB_parameters_gpu *lbpar_gpu) {
   /* TODO: this is almost a copy of device_rho_v; think about eliminating
    * it, and maybe pi can be added to device_rho_v in this case */
   free_realloc_and_clear(print_rho_v_pi, size_of_rho_v_pi);
-  free_realloc_and_clear(nodes_a.vd,
-                         lbpar_gpu->number_of_nodes * 19 * sizeof(float));
-  free_realloc_and_clear(nodes_b.vd,
-                         lbpar_gpu->number_of_nodes * 19 * sizeof(float));
+  free_realloc_and_clear(nodes_a.populations,
+                         lbpar_gpu.number_of_nodes * 19 * sizeof(float));
+  free_realloc_and_clear(nodes_b.populations,
+                         lbpar_gpu.number_of_nodes * 19 * sizeof(float));
   free_realloc_and_clear(node_f.force_density,
-                         lbpar_gpu->number_of_nodes * 3 * sizeof(lbForceFloat));
+                         lbpar_gpu.number_of_nodes * 3 * sizeof(lbForceFloat));
 #if defined(VIRTUAL_SITES_INERTIALESS_TRACERS) || defined(EK_DEBUG)
   free_realloc_and_clear(node_f.force_density_buf,
-                         lbpar_gpu->number_of_nodes * 3 * sizeof(lbForceFloat));
+                         lbpar_gpu.number_of_nodes * 3 * sizeof(lbForceFloat));
 #endif
   free_realloc_and_clear(boundaries.index,
-                         lbpar_gpu->number_of_nodes * sizeof(unsigned int));
+                         lbpar_gpu.number_of_nodes * sizeof(unsigned int));
   free_realloc_and_clear(boundaries.velocity,
-                         lbpar_gpu->number_of_nodes *
+                         lbpar_gpu.number_of_nodes *
                              sizeof(Utils::Array<float, 3>));
 
   nodes_a.boundary = nodes_b.boundary = boundaries.index;
   nodes_a.boundary_velocity = nodes_b.boundary_velocity = boundaries.velocity;
 
-  /*write parameters in const memory*/
-  cuda_safe_mem(cudaMemcpyToSymbol(para, lbpar_gpu, sizeof(LB_parameters_gpu)));
-
-  /*check flag if lb gpu init works*/
-  free_realloc_and_clear(gpu_check, sizeof(int));
+  /* write parameters in const memory */
+  cuda_safe_mem(
+      cudaMemcpyToSymbol(para, &lbpar_gpu, sizeof(LB_parameters_gpu)));
 
-  if (h_gpu_check != nullptr)
-    free(h_gpu_check);
+  free_realloc_and_clear(device_gpu_lb_initialized, sizeof(bool));
 
-  h_gpu_check = (int *)Utils::malloc(sizeof(int));
-
-  /* values for the kernel call */
-  int threads_per_block = 64;
-  int blocks_per_grid_y = 4;
-  auto blocks_per_grid_x = static_cast<int>(
-      (lbpar_gpu->number_of_nodes + threads_per_block * blocks_per_grid_y - 1) /
-      (threads_per_block * blocks_per_grid_y));
-  dim3 dim_grid = make_uint3(blocks_per_grid_x, blocks_per_grid_y, 1);
+  dim3 dim_grid =
+      calculate_dim_grid(lbpar_gpu.number_of_nodes, 4, threads_per_block);
 
   KERNELCALL(reset_boundaries, dim_grid, threads_per_block, boundaries);
 
-  /* calc of velocitydensities from given parameters and initialize the
+  /* calc of velocity densities from given parameters and initialize the
    * Node_Force array with zero */
   KERNELCALL(reinit_node_force, dim_grid, threads_per_block, (node_f));
   KERNELCALL(calc_n_from_rho_j_pi, dim_grid, threads_per_block, nodes_a,
-             device_rho_v, node_f, gpu_check);
+             device_rho_v, node_f, device_gpu_lb_initialized);
 
   intflag = true;
   current_nodes = &nodes_a;
-  h_gpu_check[0] = 0;
-  cuda_safe_mem(
-      cudaMemcpy(h_gpu_check, gpu_check, sizeof(int), cudaMemcpyDeviceToHost));
+  bool host_gpu_lb_initialized = false;
+  cuda_safe_mem(cudaMemcpy(&host_gpu_lb_initialized, device_gpu_lb_initialized,
+                           sizeof(bool), cudaMemcpyDeviceToHost));
   cudaDeviceSynchronize();
 
-  if (!h_gpu_check[0]) {
-    fprintf(stderr, "initialization of lb gpu code failed! \n");
+  if (!host_gpu_lb_initialized) {
+    fprintf(stderr, "initialization of LB GPU code failed!\n");
     errexit();
   }
 }
@@ -2472,18 +2410,13 @@ void lb_reinit_GPU(LB_parameters_gpu *lbpar_gpu) {
   /* write parameters in const memory */
   cuda_safe_mem(cudaMemcpyToSymbol(para, lbpar_gpu, sizeof(LB_parameters_gpu)));
 
-  /* values for the kernel call */
-  int threads_per_block = 64;
-  int blocks_per_grid_y = 4;
-  auto blocks_per_grid_x = static_cast<int>(
-      (lbpar_gpu->number_of_nodes + threads_per_block * blocks_per_grid_y - 1) /
-      (threads_per_block * blocks_per_grid_y));
-  dim3 dim_grid = make_uint3(blocks_per_grid_x, blocks_per_grid_y, 1);
+  dim3 dim_grid =
+      calculate_dim_grid(lbpar_gpu->number_of_nodes, 4, threads_per_block);
 
   /* calc of velocity densities from given parameters and initialize the
    * Node_Force array with zero */
   KERNELCALL(calc_n_from_rho_j_pi, dim_grid, threads_per_block, nodes_a,
-             device_rho_v, node_f, gpu_check);
+             device_rho_v, node_f, device_gpu_lb_initialized);
 }
 
 #ifdef LB_BOUNDARIES_GPU
@@ -2496,7 +2429,8 @@ void lb_reinit_GPU(LB_parameters_gpu *lbpar_gpu) {
  *  @param host_lb_boundary_velocity   The constant velocity at the boundary,
  *                                     set by the user
  */
-void lb_init_boundaries_GPU(int host_n_lb_boundaries, int number_of_boundnodes,
+void lb_init_boundaries_GPU(std::size_t host_n_lb_boundaries,
+                            unsigned number_of_boundnodes,
                             int *host_boundary_node_list,
                             int *host_boundary_index_list,
                             float *host_lb_boundary_velocity) {
@@ -2524,12 +2458,8 @@ void lb_init_boundaries_GPU(int host_n_lb_boundaries, int number_of_boundnodes,
                  cudaMemcpyHostToDevice));
 
   /* values for the kernel call */
-  int threads_per_block = 64;
-  int blocks_per_grid_y = 4;
-  auto blocks_per_grid_x = static_cast<int>(
-      (lbpar_gpu.number_of_nodes + threads_per_block * blocks_per_grid_y - 1) /
-      (threads_per_block * blocks_per_grid_y));
-  dim3 dim_grid = make_uint3(blocks_per_grid_x, blocks_per_grid_y, 1);
+  dim3 dim_grid =
+      calculate_dim_grid(lbpar_gpu.number_of_nodes, 4, threads_per_block);
 
   KERNELCALL(reset_boundaries, dim_grid, threads_per_block, boundaries);
 
@@ -2542,16 +2472,10 @@ void lb_init_boundaries_GPU(int host_n_lb_boundaries, int number_of_boundnodes,
     fprintf(stderr,
             "WARNING: boundary cmd executed but no boundary node found!\n");
   } else {
-    int threads_per_block_bound = 64;
-    int blocks_per_grid_bound_y = 4;
-    int blocks_per_grid_bound_x =
-        (number_of_boundnodes +
-         threads_per_block_bound * blocks_per_grid_bound_y - 1) /
-        (threads_per_block_bound * blocks_per_grid_bound_y);
     dim3 dim_grid_bound =
-        make_uint3(blocks_per_grid_bound_x, blocks_per_grid_bound_y, 1);
+        calculate_dim_grid(number_of_boundnodes, 4, threads_per_block);
 
-    KERNELCALL(init_boundaries, dim_grid_bound, threads_per_block_bound,
+    KERNELCALL(init_boundaries, dim_grid_bound, threads_per_block,
                boundary_node_list, boundary_index_list, boundary_velocity,
                number_of_boundnodes, boundaries);
   }
@@ -2569,50 +2493,39 @@ void lb_init_boundaries_GPU(int host_n_lb_boundaries, int number_of_boundnodes,
 void lb_reinit_extern_nodeforce_GPU(LB_parameters_gpu *lbpar_gpu) {
   cuda_safe_mem(cudaMemcpyToSymbol(para, lbpar_gpu, sizeof(LB_parameters_gpu)));
 
-  /* values for the kernel call */
-  int threads_per_block = 64;
-  int blocks_per_grid_y = 4;
-  auto blocks_per_grid_x = static_cast<int>(
-      (lbpar_gpu->number_of_nodes + threads_per_block * blocks_per_grid_y - 1) /
-      (threads_per_block * blocks_per_grid_y));
-  dim3 dim_grid = make_uint3(blocks_per_grid_x, blocks_per_grid_y, 1);
+  dim3 dim_grid =
+      calculate_dim_grid(lbpar_gpu->number_of_nodes, 4, threads_per_block);
 
   KERNELCALL(reinit_node_force, dim_grid, threads_per_block, node_f);
 }
 
 /** Setup and call particle kernel from the host
  *  @tparam no_of_neighbours       The number of neighbours to consider for
- * interpolation
+ *                                 interpolation
  */
 template <std::size_t no_of_neighbours>
 void lb_calc_particle_lattice_ia_gpu(bool couple_virtual, double friction) {
   auto device_particles = gpu_get_particle_pointer();
 
-  if (not device_particles.empty()) {
-    /* call of the particle kernel */
-    /* values for the particle kernel */
-    int threads_per_block_particles = 64;
-    int blocks_per_grid_particles_y = 4;
-    auto blocks_per_grid_particles_x = static_cast<int>(
-        (device_particles.size() +
-         threads_per_block_particles * blocks_per_grid_particles_y - 1) /
-        (threads_per_block_particles * blocks_per_grid_particles_y));
-    dim3 dim_grid_particles =
-        make_uint3(blocks_per_grid_particles_x, blocks_per_grid_particles_y, 1);
-    if (lbpar_gpu.kT > 0.0) {
-      assert(rng_counter_coupling_gpu);
-      KERNELCALL(calc_fluid_particle_ia<no_of_neighbours>, dim_grid_particles,
-                 threads_per_block_particles, *current_nodes, device_particles,
-                 gpu_get_particle_force_pointer(), node_f, device_rho_v,
-                 couple_virtual, rng_counter_coupling_gpu->value(),
-                 static_cast<float>(friction));
-    } else {
-      // We use a dummy value for the RNG counter if no temperature is set.
-      KERNELCALL(calc_fluid_particle_ia<no_of_neighbours>, dim_grid_particles,
-                 threads_per_block_particles, *current_nodes, device_particles,
-                 gpu_get_particle_force_pointer(), node_f, device_rho_v,
-                 couple_virtual, 0, static_cast<float>(friction));
-    }
+  if (device_particles.empty()) {
+    return;
+  }
+
+  dim3 dim_grid = calculate_dim_grid(
+      static_cast<unsigned>(device_particles.size()), 4, threads_per_block);
+  if (lbpar_gpu.kT > 0.0) {
+    assert(rng_counter_coupling_gpu);
+    KERNELCALL(calc_fluid_particle_ia<no_of_neighbours>, dim_grid,
+               threads_per_block, *current_nodes, device_particles,
+               gpu_get_particle_force_pointer(), node_f, device_rho_v,
+               couple_virtual, rng_counter_coupling_gpu->value(),
+               static_cast<float>(friction));
+  } else {
+    // We use a dummy value for the RNG counter if no temperature is set.
+    KERNELCALL(calc_fluid_particle_ia<no_of_neighbours>, dim_grid,
+               threads_per_block, *current_nodes, device_particles,
+               gpu_get_particle_force_pointer(), node_f, device_rho_v,
+               couple_virtual, 0, static_cast<float>(friction));
   }
 }
 template void lb_calc_particle_lattice_ia_gpu<8>(bool couple_virtual,
@@ -2624,13 +2537,8 @@ template void lb_calc_particle_lattice_ia_gpu<27>(bool couple_virtual,
  *  @param host_values   struct to save the gpu values
  */
 void lb_get_values_GPU(LB_rho_v_pi_gpu *host_values) {
-  /* values for the kernel call */
-  int threads_per_block = 64;
-  int blocks_per_grid_y = 4;
-  auto blocks_per_grid_x = static_cast<int>(
-      (lbpar_gpu.number_of_nodes + threads_per_block * blocks_per_grid_y - 1) /
-      (threads_per_block * blocks_per_grid_y));
-  dim3 dim_grid = make_uint3(blocks_per_grid_x, blocks_per_grid_y, 1);
+  dim3 dim_grid =
+      calculate_dim_grid(lbpar_gpu.number_of_nodes, 4, threads_per_block);
 
   KERNELCALL(get_mesoscopic_values_in_LB_units, dim_grid, threads_per_block,
              *current_nodes, print_rho_v_pi, device_rho_v, node_f);
@@ -2645,13 +2553,9 @@ void lb_get_boundary_flags_GPU(unsigned int *host_bound_array) {
   unsigned int *device_bound_array;
   cuda_safe_mem(cudaMalloc((void **)&device_bound_array,
                            lbpar_gpu.number_of_nodes * sizeof(unsigned int)));
-  /* values for the kernel call */
-  int threads_per_block = 64;
-  int blocks_per_grid_y = 4;
-  auto blocks_per_grid_x = static_cast<int>(
-      (lbpar_gpu.number_of_nodes + threads_per_block * blocks_per_grid_y - 1) /
-      (threads_per_block * blocks_per_grid_y));
-  dim3 dim_grid = make_uint3(blocks_per_grid_x, blocks_per_grid_y, 1);
+
+  dim3 dim_grid =
+      calculate_dim_grid(lbpar_gpu.number_of_nodes, 4, threads_per_block);
 
   KERNELCALL(lb_get_boundaries, dim_grid, threads_per_block, *current_nodes,
              device_bound_array);
@@ -2666,14 +2570,14 @@ void lb_get_boundary_flags_GPU(unsigned int *host_bound_array) {
 /** Setup and call kernel for getting macroscopic fluid values of a single
  *  node
  */
-void lb_print_node_GPU(int single_nodeindex,
+void lb_print_node_GPU(unsigned single_nodeindex,
                        LB_rho_v_pi_gpu *host_print_values) {
   LB_rho_v_pi_gpu *device_print_values;
   cuda_safe_mem(
       cudaMalloc((void **)&device_print_values, sizeof(LB_rho_v_pi_gpu)));
-  int threads_per_block_print = 1;
-  int blocks_per_grid_print_y = 1;
-  int blocks_per_grid_print_x = 1;
+  unsigned threads_per_block_print = 1;
+  unsigned blocks_per_grid_print_y = 1;
+  unsigned blocks_per_grid_print_x = 1;
   dim3 dim_grid_print =
       make_uint3(blocks_per_grid_print_x, blocks_per_grid_print_y, 1);
 
@@ -2696,13 +2600,8 @@ void lb_calc_fluid_mass_GPU(double *mass) {
   cuda_safe_mem(
       cudaMemcpy(tot_mass, &cpu_mass, sizeof(float), cudaMemcpyHostToDevice));
 
-  /* values for the kernel call */
-  int threads_per_block = 64;
-  int blocks_per_grid_y = 4;
-  auto blocks_per_grid_x = static_cast<int>(
-      (lbpar_gpu.number_of_nodes + threads_per_block * blocks_per_grid_y - 1) /
-      (threads_per_block * blocks_per_grid_y));
-  dim3 dim_grid = make_uint3(blocks_per_grid_x, blocks_per_grid_y, 1);
+  dim3 dim_grid =
+      calculate_dim_grid(lbpar_gpu.number_of_nodes, 4, threads_per_block);
 
   KERNELCALL(calc_mass, dim_grid, threads_per_block, *current_nodes, tot_mass);
 
@@ -2723,13 +2622,8 @@ void lb_calc_fluid_momentum_GPU(double *host_mom) {
   cuda_safe_mem(cudaMemcpy(tot_momentum, host_momentum, 3 * sizeof(float),
                            cudaMemcpyHostToDevice));
 
-  /* values for the kernel call */
-  int threads_per_block = 64;
-  int blocks_per_grid_y = 4;
-  auto blocks_per_grid_x = static_cast<int>(
-      (lbpar_gpu.number_of_nodes + threads_per_block * blocks_per_grid_y - 1) /
-      (threads_per_block * blocks_per_grid_y));
-  dim3 dim_grid = make_uint3(blocks_per_grid_x, blocks_per_grid_y, 1);
+  dim3 dim_grid =
+      calculate_dim_grid(lbpar_gpu.number_of_nodes, 4, threads_per_block);
 
   KERNELCALL(momentum, dim_grid, threads_per_block, *current_nodes,
              device_rho_v, node_f, tot_momentum);
@@ -2747,7 +2641,7 @@ void lb_calc_fluid_momentum_GPU(double *host_mom) {
  *  @param[out] host_checkpoint_vd   LB populations
  */
 void lb_save_checkpoint_GPU(float *const host_checkpoint_vd) {
-  cuda_safe_mem(cudaMemcpy(host_checkpoint_vd, current_nodes->vd,
+  cuda_safe_mem(cudaMemcpy(host_checkpoint_vd, current_nodes->populations,
                            lbpar_gpu.number_of_nodes * 19 * sizeof(float),
                            cudaMemcpyDeviceToHost));
 }
@@ -2759,7 +2653,7 @@ void lb_load_checkpoint_GPU(float const *const host_checkpoint_vd) {
   current_nodes = &nodes_a;
   intflag = true;
 
-  cuda_safe_mem(cudaMemcpy(current_nodes->vd, host_checkpoint_vd,
+  cuda_safe_mem(cudaMemcpy(current_nodes->populations, host_checkpoint_vd,
                            lbpar_gpu.number_of_nodes * 19 * sizeof(float),
                            cudaMemcpyHostToDevice));
 }
@@ -2768,12 +2662,13 @@ void lb_load_checkpoint_GPU(float const *const host_checkpoint_vd) {
  *  @param single_nodeindex   number of the node to get the flag for
  *  @param host_flag          here goes the value of the boundary flag
  */
-void lb_get_boundary_flag_GPU(int single_nodeindex, unsigned int *host_flag) {
+void lb_get_boundary_flag_GPU(unsigned int single_nodeindex,
+                              unsigned int *host_flag) {
   unsigned int *device_flag;
   cuda_safe_mem(cudaMalloc((void **)&device_flag, sizeof(unsigned int)));
-  int threads_per_block_flag = 1;
-  int blocks_per_grid_flag_y = 1;
-  int blocks_per_grid_flag_x = 1;
+  unsigned threads_per_block_flag = 1;
+  unsigned blocks_per_grid_flag_y = 1;
+  unsigned blocks_per_grid_flag_x = 1;
   dim3 dim_grid_flag =
       make_uint3(blocks_per_grid_flag_x, blocks_per_grid_flag_y, 1);
 
@@ -2790,10 +2685,10 @@ void lb_get_boundary_flag_GPU(int single_nodeindex, unsigned int *host_flag) {
  *  @param single_nodeindex   the node to set the velocity for
  *  @param host_rho           the density to set
  */
-void lb_set_node_rho_GPU(int single_nodeindex, float host_rho) {
-  int threads_per_block_flag = 1;
-  int blocks_per_grid_flag_y = 1;
-  int blocks_per_grid_flag_x = 1;
+void lb_set_node_rho_GPU(unsigned single_nodeindex, float host_rho) {
+  unsigned threads_per_block_flag = 1;
+  unsigned blocks_per_grid_flag_y = 1;
+  unsigned blocks_per_grid_flag_x = 1;
   dim3 dim_grid_flag =
       make_uint3(blocks_per_grid_flag_x, blocks_per_grid_flag_y, 1);
   KERNELCALL(set_rho, dim_grid_flag, threads_per_block_flag, *current_nodes,
@@ -2804,14 +2699,14 @@ void lb_set_node_rho_GPU(int single_nodeindex, float host_rho) {
  *  @param single_nodeindex   the node to set the velocity for
  *  @param host_velocity      the velocity to set
  */
-void lb_set_node_velocity_GPU(int single_nodeindex, float *host_velocity) {
+void lb_set_node_velocity_GPU(unsigned single_nodeindex, float *host_velocity) {
   float *device_velocity;
   cuda_safe_mem(cudaMalloc((void **)&device_velocity, 3 * sizeof(float)));
   cuda_safe_mem(cudaMemcpy(device_velocity, host_velocity, 3 * sizeof(float),
                            cudaMemcpyHostToDevice));
-  int threads_per_block_flag = 1;
-  int blocks_per_grid_flag_y = 1;
-  int blocks_per_grid_flag_x = 1;
+  unsigned threads_per_block_flag = 1;
+  unsigned blocks_per_grid_flag_y = 1;
+  unsigned blocks_per_grid_flag_x = 1;
   dim3 dim_grid_flag =
       make_uint3(blocks_per_grid_flag_x, blocks_per_grid_flag_y, 1);
 
@@ -2839,13 +2734,8 @@ void reinit_parameters_GPU(LB_parameters_gpu *lbpar_gpu) {
 
 /** Integration kernel for the lb gpu fluid update called from host */
 void lb_integrate_GPU() {
-  /* values for the kernel call */
-  int threads_per_block = 64;
-  int blocks_per_grid_y = 4;
-  auto blocks_per_grid_x = static_cast<int>(
-      (lbpar_gpu.number_of_nodes + threads_per_block * blocks_per_grid_y - 1) /
-      (threads_per_block * blocks_per_grid_y));
-  dim3 dim_grid = make_uint3(blocks_per_grid_x, blocks_per_grid_y, 1);
+  dim3 dim_grid =
+      calculate_dim_grid(lbpar_gpu.number_of_nodes, 4, threads_per_block);
 #ifdef LB_BOUNDARIES_GPU
   if (!LBBoundaries::lbboundaries.empty()) {
     cuda_safe_mem(
@@ -2887,19 +2777,14 @@ void lb_integrate_GPU() {
 #endif
 }
 
-void lb_gpu_get_boundary_forces(double *forces) {
+void lb_gpu_get_boundary_forces(std::vector<double> &forces) {
 #ifdef LB_BOUNDARIES_GPU
-  auto *temp = (float *)Utils::malloc(3 * LBBoundaries::lbboundaries.size() *
-                                      sizeof(float));
-  cuda_safe_mem(
-      cudaMemcpy(temp, lb_boundary_force,
-                 3 * LBBoundaries::lbboundaries.size() * sizeof(float),
-                 cudaMemcpyDeviceToHost));
-
-  for (int i = 0; i < 3 * LBBoundaries::lbboundaries.size(); i++) {
-    forces[i] = -(double)temp[i];
-  }
-  free(temp);
+  std::vector<float> temp(3 * LBBoundaries::lbboundaries.size());
+  cuda_safe_mem(cudaMemcpy(temp.data(), lb_boundary_force,
+                           temp.size() * sizeof(float),
+                           cudaMemcpyDeviceToHost));
+  std::transform(temp.begin(), temp.end(), forces.begin(),
+                 [](float val) { return -static_cast<double>(val); });
 #endif
 }
 
@@ -2923,10 +2808,10 @@ struct lb_lbfluid_mass_of_particle {
 __global__ void lb_lbfluid_set_population_kernel(LB_nodes_gpu n_a,
                                                  float const population[LBQ],
                                                  int x, int y, int z) {
-  auto const index = xyz_to_index(x, y, z);
+  auto const index = static_cast<unsigned>(xyz_to_index(x, y, z));
 
-  for (int i = 0; i < LBQ; ++i) {
-    n_a.vd[i * para->number_of_nodes + index] = population[i];
+  for (unsigned i = 0; i < LBQ; ++i) {
+    n_a.populations[i * para->number_of_nodes + index] = population[i];
   }
 }
 
@@ -2958,10 +2843,10 @@ void lb_lbfluid_set_population(const Utils::Vector3i &xyz,
 __global__ void lb_lbfluid_get_population_kernel(LB_nodes_gpu n_a,
                                                  float population[LBQ], int x,
                                                  int y, int z) {
-  auto const index = xyz_to_index(x, y, z);
+  auto const index = static_cast<unsigned>(xyz_to_index(x, y, z));
 
-  for (int i = 0; i < LBQ; ++i) {
-    population[i] = n_a.vd[i * para->number_of_nodes + index];
+  for (unsigned i = 0; i < LBQ; ++i) {
+    population[i] = n_a.populations[i * para->number_of_nodes + index];
   }
 }
 
@@ -2986,8 +2871,8 @@ void lb_lbfluid_get_population(const Utils::Vector3i &xyz,
 
 /**
  * @brief Velocity interpolation functor
- * @tparam no_of_neighbours The number of neighbours to consider for
- * interpolation
+ * @tparam no_of_neighbours     The number of neighbours to consider for
+ *                              interpolation
  */
 template <std::size_t no_of_neighbours> struct interpolation {
   LB_nodes_gpu current_nodes_gpu;
@@ -3006,21 +2891,22 @@ template <std::size_t no_of_neighbours> struct interpolation {
 template <std::size_t no_of_neighbours>
 void lb_get_interpolated_velocity_gpu(double const *positions,
                                       double *velocities, int length) {
-  thrust::host_vector<float3> positions_host(length);
-  for (int p = 0; p < 3 * length; p += 3) {
+  auto const size = static_cast<unsigned>(length);
+  thrust::host_vector<float3> positions_host(size);
+  for (unsigned p = 0; p < 3 * size; p += 3) {
     // Cast double coming from python to float.
     positions_host[p / 3].x = static_cast<float>(positions[p]);
     positions_host[p / 3].y = static_cast<float>(positions[p + 1]);
     positions_host[p / 3].z = static_cast<float>(positions[p + 2]);
   }
   thrust::device_vector<float3> positions_device = positions_host;
-  thrust::device_vector<float3> velocities_device(length);
+  thrust::device_vector<float3> velocities_device(size);
   thrust::transform(
       positions_device.begin(), positions_device.end(),
       velocities_device.begin(),
       interpolation<no_of_neighbours>(*current_nodes, device_rho_v));
   thrust::host_vector<float3> velocities_host = velocities_device;
-  int index = 0;
+  unsigned index = 0;
   for (auto v : velocities_host) {
     velocities[index] = static_cast<double>(v.x);
     velocities[index + 1] = static_cast<double>(v.y);
diff --git a/src/core/immersed_boundary/ImmersedBoundaries.cpp b/src/core/immersed_boundary/ImmersedBoundaries.cpp
index 882e93183d7..1fa15773244 100644
--- a/src/core/immersed_boundary/ImmersedBoundaries.cpp
+++ b/src/core/immersed_boundary/ImmersedBoundaries.cpp
@@ -97,6 +97,9 @@ static const IBM_VolCons_Parameters *vol_cons_parameters(Particle const &p1) {
  */
 void ImmersedBoundaries::calc_volumes(CellStructure &cs) {
 
+  if (!BoundariesFound)
+    return;
+
   // Partial volumes for each soft particle, to be summed up
   std::vector<double> tempVol(IBM_MAX_NUM);
 
@@ -153,6 +156,9 @@ void ImmersedBoundaries::calc_volumes(CellStructure &cs) {
 
 /** Calculate and add the volume force to each node */
 void ImmersedBoundaries::calc_volume_force(CellStructure &cs) {
+  if (!BoundariesFound)
+    return;
+
   cs.bond_loop(
       [this](Particle &p1, int bond_id, Utils::Span<Particle *> partners) {
         if (boost::get<IBM_Triel_Parameters>(&bonded_ia_params[bond_id]) !=
diff --git a/src/core/immersed_boundary/ImmersedBoundaries.hpp b/src/core/immersed_boundary/ImmersedBoundaries.hpp
index 0406532935b..ba6d815b0a4 100644
--- a/src/core/immersed_boundary/ImmersedBoundaries.hpp
+++ b/src/core/immersed_boundary/ImmersedBoundaries.hpp
@@ -27,7 +27,7 @@
 
 class ImmersedBoundaries {
 public:
-  ImmersedBoundaries() : VolumeInitDone(false) {
+  ImmersedBoundaries() : VolumeInitDone(false), BoundariesFound(false) {
     VolumesCurrent.resize(IBM_MAX_NUM);
   }
   void init_volume_conservation(CellStructure &cs);
@@ -38,8 +38,8 @@ class ImmersedBoundaries {
   void calc_volume_force(CellStructure &cs);
 
   std::vector<double> VolumesCurrent;
-  bool VolumeInitDone = false;
-  bool BoundariesFound = false;
+  bool VolumeInitDone;
+  bool BoundariesFound;
 };
 
 #endif
diff --git a/src/core/immersed_boundary/ibm_tribend.cpp b/src/core/immersed_boundary/ibm_tribend.cpp
index 23b98bb259c..6c8c7c82e77 100644
--- a/src/core/immersed_boundary/ibm_tribend.cpp
+++ b/src/core/immersed_boundary/ibm_tribend.cpp
@@ -25,6 +25,7 @@
 
 #include <utils/Vector.hpp>
 
+#include <algorithm>
 #include <cmath>
 #include <tuple>
 
@@ -52,9 +53,7 @@ IBM_Tribend_Parameters::calc_forces(Particle const &p1, Particle const &p2,
   n2 /= Aj;
 
   // Get the prefactor for the force term
-  auto sc = n1 * n2;
-  if (sc > 1.0)
-    sc = 1.0;
+  auto const sc = std::min(1.0, n1 * n2);
 
   // Get theta as angle between normals
   auto theta = acos(sc);
@@ -119,9 +118,7 @@ IBM_Tribend_Parameters::IBM_Tribend_Parameters(const int ind1, const int ind2,
     auto const n2 = n2l / n2l.norm();
 
     // calculate theta0 by taking the acos of the scalar n1*n2
-    auto sc = n1 * n2;
-    if (sc > 1.0)
-      sc = 1.0;
+    auto const sc = std::min(1.0, n1 * n2);
 
     theta0 = acos(sc);
 
diff --git a/src/core/integrators/velocity_verlet_inline.hpp b/src/core/integrators/velocity_verlet_inline.hpp
index 23a3e6f95a1..8d739c66e4e 100644
--- a/src/core/integrators/velocity_verlet_inline.hpp
+++ b/src/core/integrators/velocity_verlet_inline.hpp
@@ -40,7 +40,7 @@ inline void velocity_verlet_propagate_vel_pos(const ParticleRange &particles) {
   auto const skin2 = Utils::sqr(0.5 * skin);
   for (auto &p : particles) {
 #ifdef ROTATION
-    propagate_omega_quat_particle(p);
+    propagate_omega_quat_particle(p, time_step);
 #endif
 
     // Don't propagate translational degrees of freedom of vs
@@ -91,7 +91,7 @@ inline void velocity_verlet_step_1(const ParticleRange &particles) {
 inline void velocity_verlet_step_2(const ParticleRange &particles) {
   velocity_verlet_propagate_vel_final(particles);
 #ifdef ROTATION
-  convert_torques_propagate_omega(particles);
+  convert_torques_propagate_omega(particles, time_step);
 #endif
 }
 
diff --git a/src/core/integrators/velocity_verlet_npt.cpp b/src/core/integrators/velocity_verlet_npt.cpp
index 13b782679ee..815e5896ad8 100644
--- a/src/core/integrators/velocity_verlet_npt.cpp
+++ b/src/core/integrators/velocity_verlet_npt.cpp
@@ -167,7 +167,7 @@ void velocity_verlet_npt_propagate_vel(const ParticleRange &particles) {
 
   for (auto &p : particles) {
 #ifdef ROTATION
-    propagate_omega_quat_particle(p);
+    propagate_omega_quat_particle(p, time_step);
 #endif
 
     // Don't propagate translational degrees of freedom of vs
@@ -198,7 +198,7 @@ void velocity_verlet_npt_step_1(const ParticleRange &particles) {
 void velocity_verlet_npt_step_2(const ParticleRange &particles) {
   velocity_verlet_npt_propagate_vel_final(particles);
 #ifdef ROTATION
-  convert_torques_propagate_omega(particles);
+  convert_torques_propagate_omega(particles, time_step);
 #endif
   velocity_verlet_npt_finalize_p_inst();
 }
diff --git a/src/core/io/mpiio/mpiio.hpp b/src/core/io/mpiio/mpiio.hpp
index 40ae48a66ef..be678886bd5 100644
--- a/src/core/io/mpiio/mpiio.hpp
+++ b/src/core/io/mpiio/mpiio.hpp
@@ -44,6 +44,7 @@ enum MPIIOOutputFields : unsigned int {
  *
  * \param filename A null-terminated filename prefix.
  * \param fields Output specifier which fields to dump.
+ * \param particles range of particles to serialize.
  */
 void mpi_mpiio_common_write(const char *filename, unsigned fields,
                             const ParticleRange &particles);
diff --git a/src/core/observables/CylindricalDensityProfile.hpp b/src/core/observables/CylindricalDensityProfile.hpp
index d18ea8baeac..a7d1de6e312 100644
--- a/src/core/observables/CylindricalDensityProfile.hpp
+++ b/src/core/observables/CylindricalDensityProfile.hpp
@@ -41,7 +41,9 @@ class CylindricalDensityProfile : public CylindricalPidProfileObservable {
 
     for (auto p : particles) {
       histogram.update(Utils::transform_coordinate_cartesian_to_cylinder(
-          folded_position(traits.position(p), box_geo) - center, axis));
+          folded_position(traits.position(p), box_geo) -
+              transform_params->center(),
+          transform_params->axis(), transform_params->orientation()));
     }
 
     histogram.normalize();
diff --git a/src/core/observables/CylindricalFluxDensityProfile.hpp b/src/core/observables/CylindricalFluxDensityProfile.hpp
index 13ef33b9178..73b65f08c81 100644
--- a/src/core/observables/CylindricalFluxDensityProfile.hpp
+++ b/src/core/observables/CylindricalFluxDensityProfile.hpp
@@ -43,11 +43,13 @@ class CylindricalFluxDensityProfile : public CylindricalPidProfileObservable {
 
     // Write data to the histogram
     for (auto p : particles) {
-      auto const pos = folded_position(traits.position(p), box_geo) - center;
+      auto const pos = folded_position(traits.position(p), box_geo) -
+                       transform_params->center();
       histogram.update(
-          Utils::transform_coordinate_cartesian_to_cylinder(pos, axis),
-          Utils::transform_vector_cartesian_to_cylinder(traits.velocity(p),
-                                                        axis, pos));
+          Utils::transform_coordinate_cartesian_to_cylinder(
+              pos, transform_params->axis(), transform_params->orientation()),
+          Utils::transform_vector_cartesian_to_cylinder(
+              traits.velocity(p), transform_params->axis(), pos));
     }
     histogram.normalize();
     return histogram.get_histogram();
diff --git a/src/core/observables/CylindricalLBFluxDensityProfileAtParticlePositions.cpp b/src/core/observables/CylindricalLBFluxDensityProfileAtParticlePositions.cpp
index de51d4a9e10..5ef211f40db 100644
--- a/src/core/observables/CylindricalLBFluxDensityProfileAtParticlePositions.cpp
+++ b/src/core/observables/CylindricalLBFluxDensityProfileAtParticlePositions.cpp
@@ -42,13 +42,24 @@ CylindricalLBFluxDensityProfileAtParticlePositions::evaluate(
     auto const pos = folded_position(traits.position(p), box_geo);
     auto const v = lb_lbfluid_get_interpolated_velocity(pos) *
                    lb_lbfluid_get_lattice_speed();
+    auto const flux_dens = lb_lbfluid_get_interpolated_density(pos) * v;
 
-    histogram.update(
-        Utils::transform_coordinate_cartesian_to_cylinder(pos - center, axis),
-        Utils::transform_vector_cartesian_to_cylinder(v, axis, pos - center));
+    histogram.update(Utils::transform_coordinate_cartesian_to_cylinder(
+                         pos - transform_params->center(),
+                         transform_params->axis(),
+                         transform_params->orientation()),
+                     Utils::transform_vector_cartesian_to_cylinder(
+                         flux_dens, transform_params->axis(),
+                         pos - transform_params->center()));
   }
 
-  histogram.normalize();
-  return histogram.get_histogram();
+  // normalize by number of hits per bin
+  auto hist_tmp = histogram.get_histogram();
+  auto tot_count = histogram.get_tot_count();
+  std::transform(hist_tmp.begin(), hist_tmp.end(), tot_count.begin(),
+                 hist_tmp.begin(), [](auto hi, auto ci) {
+                   return ci > 0 ? hi / static_cast<double>(ci) : 0.;
+                 });
+  return hist_tmp;
 }
 } // namespace Observables
diff --git a/src/core/observables/CylindricalLBProfileObservable.hpp b/src/core/observables/CylindricalLBProfileObservable.hpp
index c1d5eea4d7e..df4e66e5936 100644
--- a/src/core/observables/CylindricalLBProfileObservable.hpp
+++ b/src/core/observables/CylindricalLBProfileObservable.hpp
@@ -21,6 +21,7 @@
 
 #include "CylindricalProfileObservable.hpp"
 
+#include <utility>
 #include <utils/Vector.hpp>
 #include <utils/math/coordinate_transformation.hpp>
 #include <utils/math/vec_rotate.hpp>
@@ -30,15 +31,15 @@ namespace Observables {
 
 class CylindricalLBProfileObservable : public CylindricalProfileObservable {
 public:
-  CylindricalLBProfileObservable(Utils::Vector3d const &center,
-                                 Utils::Vector3d const &axis, int n_r_bins,
-                                 int n_phi_bins, int n_z_bins, double min_r,
-                                 double max_r, double min_phi, double max_phi,
-                                 double min_z, double max_z,
-                                 double sampling_density)
-      : CylindricalProfileObservable(center, axis, n_r_bins, n_phi_bins,
-                                     n_z_bins, min_r, max_r, min_phi, max_phi,
-                                     min_z, max_z),
+  CylindricalLBProfileObservable(
+      std::shared_ptr<Utils::CylindricalTransformationParameters>
+          transform_params,
+      int n_r_bins, int n_phi_bins, int n_z_bins, double min_r, double max_r,
+      double min_phi, double max_phi, double min_z, double max_z,
+      double sampling_density)
+      : CylindricalProfileObservable(std::move(transform_params), n_r_bins,
+                                     n_phi_bins, n_z_bins, min_r, max_r,
+                                     min_phi, max_phi, min_z, max_z),
         sampling_density(sampling_density) {
     calculate_sampling_positions();
   }
@@ -47,17 +48,16 @@ class CylindricalLBProfileObservable : public CylindricalProfileObservable {
         limits[0], limits[1], limits[2], n_bins[0], n_bins[1], n_bins[2],
         sampling_density);
     for (auto &p : sampling_positions) {
-      double theta;
-      Utils::Vector3d rotation_axis;
-      auto p_cart = Utils::transform_coordinate_cylinder_to_cartesian(
-          p, Utils::Vector3d{{0.0, 0.0, 1.0}});
+      auto p_cart = Utils::transform_coordinate_cylinder_to_cartesian(p);
       // We have to rotate the coordinates since the utils function assumes
       // z-axis symmetry.
-      std::tie(theta, rotation_axis) =
-          Utils::rotation_params(Utils::Vector3d{{0.0, 0.0, 1.0}}, axis);
+      constexpr Utils::Vector3d z_axis{{0.0, 0.0, 1.0}};
+      auto const theta = Utils::angle_between(z_axis, transform_params->axis());
+      auto const rot_axis =
+          Utils::vector_product(z_axis, transform_params->axis()).normalize();
       if (theta > std::numeric_limits<double>::epsilon())
-        p_cart = Utils::vec_rotate(rotation_axis, theta, p_cart);
-      p = p_cart + center;
+        p_cart = Utils::vec_rotate(rot_axis, theta, p_cart);
+      p = p_cart + transform_params->center();
     }
   }
   std::vector<Utils::Vector3d> sampling_positions;
diff --git a/src/core/observables/CylindricalLBVelocityProfile.cpp b/src/core/observables/CylindricalLBVelocityProfile.cpp
index 28148f5d493..791417859b8 100644
--- a/src/core/observables/CylindricalLBVelocityProfile.cpp
+++ b/src/core/observables/CylindricalLBVelocityProfile.cpp
@@ -35,11 +35,12 @@ std::vector<double> CylindricalLBVelocityProfile::operator()() const {
   for (auto const &p : sampling_positions) {
     auto const velocity = lb_lbfluid_get_interpolated_velocity(p) *
                           lb_lbfluid_get_lattice_speed();
-    auto const pos_shifted = p - center;
-    auto const pos_cyl =
-        Utils::transform_coordinate_cartesian_to_cylinder(pos_shifted, axis);
-    histogram.update(pos_cyl, Utils::transform_vector_cartesian_to_cylinder(
-                                  velocity, axis, pos_shifted));
+    auto const pos_shifted = p - transform_params->center();
+    auto const pos_cyl = Utils::transform_coordinate_cartesian_to_cylinder(
+        pos_shifted, transform_params->axis(), transform_params->orientation());
+    histogram.update(pos_cyl,
+                     Utils::transform_vector_cartesian_to_cylinder(
+                         velocity, transform_params->axis(), pos_shifted));
   }
   auto hist_data = histogram.get_histogram();
   auto const tot_count = histogram.get_tot_count();
diff --git a/src/core/observables/CylindricalLBVelocityProfileAtParticlePositions.cpp b/src/core/observables/CylindricalLBVelocityProfileAtParticlePositions.cpp
index e1106574cfa..9650c7dda6f 100644
--- a/src/core/observables/CylindricalLBVelocityProfileAtParticlePositions.cpp
+++ b/src/core/observables/CylindricalLBVelocityProfileAtParticlePositions.cpp
@@ -41,17 +41,20 @@ std::vector<double> CylindricalLBVelocityProfileAtParticlePositions::evaluate(
                    lb_lbfluid_get_lattice_speed();
 
     histogram.update(
-        Utils::transform_coordinate_cartesian_to_cylinder(pos - center, axis),
-        Utils::transform_vector_cartesian_to_cylinder(v, axis, pos - center));
+        Utils::transform_coordinate_cartesian_to_cylinder(
+            pos - transform_params->center(), transform_params->axis(),
+            transform_params->orientation()),
+        Utils::transform_vector_cartesian_to_cylinder(
+            v, transform_params->axis(), pos - transform_params->center()));
   }
 
+  // normalize by number of hits per bin
   auto hist_tmp = histogram.get_histogram();
   auto tot_count = histogram.get_tot_count();
-  for (size_t ind = 0; ind < hist_tmp.size(); ++ind) {
-    if (tot_count[ind] > 0) {
-      hist_tmp[ind] /= static_cast<double>(tot_count[ind]);
-    }
-  }
+  std::transform(hist_tmp.begin(), hist_tmp.end(), tot_count.begin(),
+                 hist_tmp.begin(), [](auto hi, auto ci) {
+                   return ci > 0 ? hi / static_cast<double>(ci) : 0.;
+                 });
   return hist_tmp;
 }
 
diff --git a/src/core/observables/CylindricalPidProfileObservable.hpp b/src/core/observables/CylindricalPidProfileObservable.hpp
index bbece2be6e5..8a3ae7886e1 100644
--- a/src/core/observables/CylindricalPidProfileObservable.hpp
+++ b/src/core/observables/CylindricalPidProfileObservable.hpp
@@ -19,6 +19,8 @@
 #ifndef OBSERVABLES_CYLINDRICALPIDPROFILEOBSERVABLE_HPP
 #define OBSERVABLES_CYLINDRICALPIDPROFILEOBSERVABLE_HPP
 
+#include <utility>
+
 #include "CylindricalProfileObservable.hpp"
 #include "PidObservable.hpp"
 
@@ -27,16 +29,16 @@ namespace Observables {
 class CylindricalPidProfileObservable : public PidObservable,
                                         public CylindricalProfileObservable {
 public:
-  CylindricalPidProfileObservable(std::vector<int> const &ids,
-                                  Utils::Vector3d const &center,
-                                  Utils::Vector3d const &axis, int n_r_bins,
-                                  int n_phi_bins, int n_z_bins, double min_r,
-                                  double max_r, double min_phi, double max_phi,
-                                  double min_z, double max_z)
+  CylindricalPidProfileObservable(
+      std::vector<int> const &ids,
+      std::shared_ptr<Utils::CylindricalTransformationParameters>
+          transform_params,
+      int n_r_bins, int n_phi_bins, int n_z_bins, double min_r, double max_r,
+      double min_phi, double max_phi, double min_z, double max_z)
       : PidObservable(ids),
-        CylindricalProfileObservable(center, axis, n_r_bins, n_phi_bins,
-                                     n_z_bins, min_r, max_r, min_phi, max_phi,
-                                     min_z, max_z) {}
+        CylindricalProfileObservable(std::move(transform_params), n_r_bins,
+                                     n_phi_bins, n_z_bins, min_r, max_r,
+                                     min_phi, max_phi, min_z, max_z) {}
 };
 
 } // Namespace Observables
diff --git a/src/core/observables/CylindricalProfileObservable.hpp b/src/core/observables/CylindricalProfileObservable.hpp
index c28669a4e62..224856f0cfd 100644
--- a/src/core/observables/CylindricalProfileObservable.hpp
+++ b/src/core/observables/CylindricalProfileObservable.hpp
@@ -22,12 +22,17 @@
 #include "ProfileObservable.hpp"
 
 #include <utils/Vector.hpp>
+#include <utils/math/abs.hpp>
+#include <utils/math/cylindrical_transformation_parameters.hpp>
 #include <utils/math/make_lin_space.hpp>
 
 #include <boost/range/algorithm.hpp>
 
 #include <array>
 #include <cstddef>
+#include <limits>
+#include <memory>
+#include <utility>
 #include <vector>
 
 namespace Observables {
@@ -35,16 +40,16 @@ namespace Observables {
 /** Cylindrical profile observable */
 class CylindricalProfileObservable : public ProfileObservable {
 public:
-  CylindricalProfileObservable(Utils::Vector3d const &center,
-                               Utils::Vector3d const &axis, int n_r_bins,
-                               int n_phi_bins, int n_z_bins, double min_r,
-                               double max_r, double min_phi, double max_phi,
-                               double min_z, double max_z)
+  CylindricalProfileObservable(
+      std::shared_ptr<Utils::CylindricalTransformationParameters>
+          transform_params,
+      int n_r_bins, int n_phi_bins, int n_z_bins, double min_r, double max_r,
+      double min_phi, double max_phi, double min_z, double max_z)
       : ProfileObservable(n_r_bins, n_phi_bins, n_z_bins, min_r, max_r, min_phi,
                           max_phi, min_z, max_z),
-        center(center), axis(axis) {}
-  Utils::Vector3d center;
-  Utils::Vector3d axis;
+        transform_params(std::move(transform_params)) {}
+
+  std::shared_ptr<Utils::CylindricalTransformationParameters> transform_params;
 };
 
 } // Namespace Observables
diff --git a/src/core/observables/CylindricalVelocityProfile.hpp b/src/core/observables/CylindricalVelocityProfile.hpp
index 58dfa0bc5fc..6b70dd2feaf 100644
--- a/src/core/observables/CylindricalVelocityProfile.hpp
+++ b/src/core/observables/CylindricalVelocityProfile.hpp
@@ -43,11 +43,13 @@ class CylindricalVelocityProfile : public CylindricalPidProfileObservable {
     Utils::CylindricalHistogram<double, 3> histogram(n_bins, 3, limits);
 
     for (auto p : particles) {
-      auto const pos = folded_position(traits.position(p), box_geo) - center;
+      auto const pos = folded_position(traits.position(p), box_geo) -
+                       transform_params->center();
       histogram.update(
-          Utils::transform_coordinate_cartesian_to_cylinder(pos, axis),
-          Utils::transform_vector_cartesian_to_cylinder(traits.velocity(p),
-                                                        axis, pos));
+          Utils::transform_coordinate_cartesian_to_cylinder(
+              pos, transform_params->axis(), transform_params->orientation()),
+          Utils::transform_vector_cartesian_to_cylinder(
+              traits.velocity(p), transform_params->axis(), pos));
     }
 
     auto hist_tmp = histogram.get_histogram();
diff --git a/src/core/observables/ParticleAngularVelocities.hpp b/src/core/observables/ParticleAngularVelocities.hpp
index 03065a21811..ffaae395840 100644
--- a/src/core/observables/ParticleAngularVelocities.hpp
+++ b/src/core/observables/ParticleAngularVelocities.hpp
@@ -20,7 +20,6 @@
 #define OBSERVABLES_PARTICLEANGULARVELOCITIES_HPP
 
 #include "PidObservable.hpp"
-#include "integrate.hpp"
 #include "rotation.hpp"
 
 #include <utils/Span.hpp>
diff --git a/src/core/observables/ParticleBodyAngularVelocities.hpp b/src/core/observables/ParticleBodyAngularVelocities.hpp
index e082d37ea25..22af41b07a6 100644
--- a/src/core/observables/ParticleBodyAngularVelocities.hpp
+++ b/src/core/observables/ParticleBodyAngularVelocities.hpp
@@ -20,7 +20,6 @@
 #define OBSERVABLES_PARTICLEBODYANGULARVELOCITIES_HPP
 
 #include "PidObservable.hpp"
-#include "integrate.hpp"
 
 #include <utils/Span.hpp>
 
diff --git a/src/core/observables/ParticleBodyVelocities.hpp b/src/core/observables/ParticleBodyVelocities.hpp
index 772139c12b4..e95c016fbf4 100644
--- a/src/core/observables/ParticleBodyVelocities.hpp
+++ b/src/core/observables/ParticleBodyVelocities.hpp
@@ -20,7 +20,6 @@
 #define OBSERVABLES_PARTICLEBODYVELOCITIES_HPP
 
 #include "PidObservable.hpp"
-#include "integrate.hpp"
 
 #include "rotation.hpp"
 
diff --git a/src/core/observables/ParticleForces.hpp b/src/core/observables/ParticleForces.hpp
index 2c953961d35..962531c8772 100644
--- a/src/core/observables/ParticleForces.hpp
+++ b/src/core/observables/ParticleForces.hpp
@@ -21,7 +21,6 @@
 
 #include "Particle.hpp"
 #include "PidObservable.hpp"
-#include "integrate.hpp"
 #include <cstddef>
 #include <vector>
 
diff --git a/src/core/particle_data.cpp b/src/core/particle_data.cpp
index 73db3218e78..a185fa71eff 100644
--- a/src/core/particle_data.cpp
+++ b/src/core/particle_data.cpp
@@ -1021,14 +1021,6 @@ void mpi_rescale_particles(int dir, double scale) {
  *  @param _delete if true, delete the exclusion instead of add
  */
 void local_change_exclusion(int part1, int part2, int _delete) {
-  if (part1 == -1 && part2 == -1) {
-    for (auto &p : cell_structure.local_particles()) {
-      p.exclusions().clear();
-    }
-
-    return;
-  }
-
   /* part1, if here */
   auto part = cell_structure.get_local_particle(part1);
   if (part) {
@@ -1088,8 +1080,6 @@ int change_exclusion(int part1, int part2, int _delete) {
   return ES_ERROR;
 }
 
-void remove_all_exclusions() { mpi_send_exclusion(-1, -1, 1); }
-
 void auto_exclusions(int distance) {
   /* partners is a list containing the currently found excluded particles for
      each particle, and their distance, as an interleaved list */
diff --git a/src/core/particle_data.hpp b/src/core/particle_data.hpp
index 8fab4251da9..6eeb12c4eb3 100644
--- a/src/core/particle_data.hpp
+++ b/src/core/particle_data.hpp
@@ -321,9 +321,6 @@ const std::vector<BondView> &get_particle_bonds(int part);
  *          exclusion set)
  */
 int change_exclusion(int part, int part2, int _delete);
-
-/** remove all exclusions. */
-void remove_all_exclusions();
 #endif
 
 /** Remove particle with a given identity. Also removes all bonds to the
diff --git a/src/core/polymer.hpp b/src/core/polymer.hpp
index d6cab6cfbcd..17152c2b6a6 100644
--- a/src/core/polymer.hpp
+++ b/src/core/polymer.hpp
@@ -36,14 +36,18 @@
 #include <vector>
 
 /** Determines valid polymer positions and returns them.
+ *  @param  partCfg           particle collection
  *  @param  n_polymers        how many polymers to create
  *  @param  beads_per_chain   monomers per chain
  *  @param  bond_length       length of the bonds between two monomers
- *  @param  seed              seed for RNG
+ *  @param  start_positions   starting positions of each polymer
  *  @param  min_distance      minimum distance between all particles
  *  @param  max_tries         how often a monomer/polymer should be reset if
  *                            current position collides with a previous particle
+ *  @param  use_bond_angle    whether to use the @p bond_angle argument
  *  @param  bond_angle        desired bond-angle to be fixed
+ *  @param  respect_constraints  whether to respect constraints
+ *  @param  seed              seed for RNG
  */
 std::vector<std::vector<Utils::Vector3d>>
 draw_polymer_positions(PartCfg &partCfg, int n_polymers, int beads_per_chain,
diff --git a/src/core/rattle.cpp b/src/core/rattle.cpp
index 45dee5a6663..cfce8583142 100644
--- a/src/core/rattle.cpp
+++ b/src/core/rattle.cpp
@@ -37,39 +37,6 @@
 
 #include <cmath>
 
-/** \name Private functions */
-/************************************************************/
-/**@{*/
-
-/** Positional Corrections are added to the current particle positions. Invoked
- * from \ref correct_pos_shake() */
-static void app_pos_correction(const ParticleRange &particles);
-
-/** Transfers temporarily the current forces from f.f[3] of the \ref Particle
-    structure to r.p_old[3] location and also initializes velocity correction
-    vector. Invoked from \ref correct_vel_shake()*/
-static void transfer_force_init_vel(const ParticleRange &particles,
-                                    const ParticleRange &ghost_particles);
-
-/** Calculates corrections of the  current particle velocities according to
-   RATTLE
-    algorithm. Invoked from \ref correct_vel_shake()*/
-static void compute_vel_corr_vec(int *repeat_, CellStructure &cs);
-
-/** Velocity corrections are added to the current particle velocities. Invoked
-   from
-    \ref correct_vel_shake()*/
-static void apply_vel_corr(const ParticleRange &particles);
-
-/**Invoked from \ref correct_vel_shake(). Put back the forces from r.p_old to
- * f.f*/
-static void revert_force(const ParticleRange &particles,
-                         const ParticleRange &ghost_particles);
-
-/**@}*/
-
-/*Initialize old positions (particle positions at previous time step)
-  of the particles*/
 void save_old_pos(const ParticleRange &particles,
                   const ParticleRange &ghost_particles) {
   auto save_pos = [](Particle &p) {
@@ -84,8 +51,9 @@ void save_old_pos(const ParticleRange &particles,
     save_pos(p);
 }
 
-/**Initialize the correction vector. The correction vector is stored in f.f of
- * particle structure. */
+/** Initialize the velocity correction vectors. The correction vectors are
+ *  stored in @ref ParticleForce::f "Particle::f::f".
+ */
 static void init_correction_vector(const ParticleRange &local_particles,
                                    const ParticleRange &ghost_particles) {
   auto reset_force = [](Particle &p) {
@@ -131,7 +99,8 @@ static bool add_pos_corr_vec(Rigid_bond_parameters const &ia_params,
 
   return false;
 }
-/**Compute positional corrections*/
+
+/** Compute position corrections */
 static void compute_pos_corr_vec(int *repeat_, CellStructure &cs) {
   cs.bond_loop(
       [repeat_](Particle &p1, int bond_id, Utils::Span<Particle *> partners) {
@@ -148,20 +117,16 @@ static void compute_pos_corr_vec(int *repeat_, CellStructure &cs) {
       });
 }
 
-/**Apply corrections to each particle**/
+/** Apply position corrections */
 static void app_pos_correction(const ParticleRange &particles) {
-  /*Apply corrections*/
   for (auto &p : particles) {
     for (int j = 0; j < 3; j++) {
       p.r.p[j] += p.f.f[j];
       p.m.v[j] += p.f.f[j];
     }
-    /**Completed for one particle*/
-  } // for i loop
+  }
 }
 
-/** Calculates the corrections required for each of the particle coordinates
-    according to the RATTLE algorithm. Invoked from \ref correct_pos_shake()*/
 void correct_pos_shake(CellStructure &cs) {
   cells_update_ghosts(Cells::DATA_PART_POSITION | Cells::DATA_PART_PROPERTIES);
 
@@ -178,7 +143,7 @@ void correct_pos_shake(CellStructure &cs) {
     cell_structure.ghosts_reduce_forces();
 
     app_pos_correction(particles);
-    /**Ghost Positions Update*/
+    /* Ghost Positions Update */
     cs.ghosts_update(Cells::DATA_PART_POSITION | Cells::DATA_PART_MOMENTUM);
 
     repeat = boost::mpi::all_reduce(comm_cart, (repeat_ > 0),
@@ -194,11 +159,10 @@ void correct_pos_shake(CellStructure &cs) {
   check_resort_particles();
 }
 
-/**The forces are transferred temporarily from f.f member of particle structure
-   to r.p_old,
-    which is idle now and initialize the velocity correction vector to zero at
-   f.f[3]
-    of Particle structure*/
+/** Transfer the current forces from @ref ParticleForce::f "Particle::f::f"
+ *  to @ref ParticlePosition::p_old "Particle::r::p_old" and reset the
+ *  velocity correction vectors at @ref ParticleForce::f "Particle::f::f".
+ */
 static void transfer_force_init_vel(const ParticleRange &particles,
                                     const ParticleRange &ghost_particles) {
   auto copy_reset = [](Particle &p) {
@@ -246,7 +210,7 @@ static bool add_vel_corr_vec(Rigid_bond_parameters const &ia_params,
   return false;
 }
 
-/** Velocity correction vectors are computed*/
+/** Compute velocity correction vectors */
 static void compute_vel_corr_vec(int *repeat_, CellStructure &cs) {
   cs.bond_loop(
       [repeat_](Particle &p1, int bond_id, Utils::Span<Particle *> partners) {
@@ -258,23 +222,21 @@ static void compute_vel_corr_vec(int *repeat_, CellStructure &cs) {
             *repeat_ += 1;
         }
 
-        /* Rigid bonds can not break */
+        /* Rigid bonds cannot break */
         return false;
       });
 }
 
-/**Apply velocity corrections*/
+/** Apply velocity corrections */
 static void apply_vel_corr(const ParticleRange &particles) {
-  /*Apply corrections*/
   for (auto &p : particles) {
     for (int j = 0; j < 3; j++) {
       p.m.v[j] += p.f.f[j];
     }
-    /**Completed for one particle*/
-  } // for i loop
+  }
 }
 
-/**Put back the forces from r.p_old to f.f*/
+/** Put back the forces from r.p_old to f.f */
 static void revert_force(const ParticleRange &particles,
                          const ParticleRange &ghost_particles) {
   auto revert = [](Particle &p) {
@@ -292,9 +254,9 @@ static void revert_force(const ParticleRange &particles,
 void correct_vel_shake(CellStructure &cs) {
   cs.ghosts_update(Cells::DATA_PART_POSITION | Cells::DATA_PART_MOMENTUM);
 
-  /**transfer the current forces to r.p_old of the particle structure so that
-  velocity corrections can be stored temporarily at the f.f[3] of the particle
-  structure  */
+  /* Transfer the current forces to r.p_old of the particle structure so that
+   * velocity corrections can be stored temporarily in the f.f member of the
+   * particle structure. */
   auto particles = cs.local_particles();
   auto ghost_particles = cs.ghost_particles();
 
@@ -322,7 +284,6 @@ void correct_vel_shake(CellStructure &cs) {
             this_node, cnt);
     errexit();
   }
-  /**Puts back the forces from r.p_old to f.f[3]*/
   revert_force(particles, ghost_particles);
 }
 
diff --git a/src/core/rattle.hpp b/src/core/rattle.hpp
index 88bf09b978b..9d193ee83a9 100644
--- a/src/core/rattle.hpp
+++ b/src/core/rattle.hpp
@@ -34,8 +34,9 @@
 
 #ifdef BOND_CONSTRAINT
 
-/** Transfers the current particle positions from r.p[3] to r.p_pold[3]
-    of the \ref Particle structure. Invoked from \ref correct_pos_shake() */
+/** Transfer the current particle positions from @ref ParticlePosition::p
+ *  "Particle::r::p" to @ref ParticlePosition::p_old "Particle::r::p_old"
+ */
 void save_old_pos(const ParticleRange &particles,
                   const ParticleRange &ghost_particles);
 
diff --git a/src/core/rotation.cpp b/src/core/rotation.cpp
index c185e23cc26..86223f6c15a 100644
--- a/src/core/rotation.cpp
+++ b/src/core/rotation.cpp
@@ -34,7 +34,6 @@
 #include "rotation.hpp"
 
 #ifdef ROTATION
-#include "integrate.hpp"
 
 #include <utils/Vector.hpp>
 #include <utils/mask.hpp>
@@ -116,13 +115,13 @@ static void define_Qdd(Particle const &p, Utils::Quaternion<double> &Qd,
  *  notation for quaternions, while @cite omelyan98a uses scalar-last
  *  notation.
  *
- *  For very high angular velocities (e.g. if the product of @ref time_step
+ *  For very high angular velocities (e.g. if the product of @p time_step
  *  with the largest component of @ref ParticleMomentum::omega "p.m.omega"
  *  is superior to ~2.0), the calculation might fail.
  *
  *  \todo implement for fixed_coord_flag
  */
-void propagate_omega_quat_particle(Particle &p) {
+void propagate_omega_quat_particle(Particle &p, double time_step) {
 
   // If rotation for the particle is disabled entirely, return early.
   if (p.p.rotation == ROTATION_FIXED)
@@ -159,7 +158,8 @@ void propagate_omega_quat_particle(Particle &p) {
   }
 }
 
-void convert_torques_propagate_omega(const ParticleRange &particles) {
+void convert_torques_propagate_omega(const ParticleRange &particles,
+                                     double time_step) {
   for (auto &p : particles) {
     // Skip particle if rotation is turned off entirely for it.
     if (p.p.rotation == ROTATION_FIXED)
diff --git a/src/core/rotation.hpp b/src/core/rotation.hpp
index c8f6d135946..ada1217f5e3 100644
--- a/src/core/rotation.hpp
+++ b/src/core/rotation.hpp
@@ -43,12 +43,13 @@
 /** @brief Propagate angular velocities and update quaternions on a
  *  particle.
  */
-void propagate_omega_quat_particle(Particle &p);
+void propagate_omega_quat_particle(Particle &p, double time_step);
 
 /** @brief Convert torques to the body-fixed frame and propagate
  *  angular velocities.
  */
-void convert_torques_propagate_omega(const ParticleRange &particles);
+void convert_torques_propagate_omega(const ParticleRange &particles,
+                                     double time_step);
 
 /** Convert torques to the body-fixed frame before the integration loop. */
 void convert_initial_torques(const ParticleRange &particles);
diff --git a/src/core/short_range_loop.hpp b/src/core/short_range_loop.hpp
index b11131d60aa..fc52ee3eb03 100644
--- a/src/core/short_range_loop.hpp
+++ b/src/core/short_range_loop.hpp
@@ -31,7 +31,7 @@ namespace detail {
  *        any arguments.
  */
 struct True {
-  template <class... T> bool operator()(T...) const { return true; }
+  template <class... T> bool operator()(T &...) const { return true; }
 };
 } // namespace detail
 
diff --git a/src/core/statistics.hpp b/src/core/statistics.hpp
index 09c1290031e..d24c9a50f5a 100644
--- a/src/core/statistics.hpp
+++ b/src/core/statistics.hpp
@@ -32,6 +32,7 @@
 
 /** Calculate the minimal distance of two particles with types in set1 resp.
  *  set2.
+ *  @param partCfg particle collection
  *  @param set1 types of particles
  *  @param set2 types of particles
  *  @return the minimal distance of two particles
@@ -68,6 +69,7 @@ double distto(PartCfg &partCfg, const Utils::Vector3d &pos, int pid = -1);
  *  into @p r_bins bins which are either equidistant (@p log_flag==false) or
  *  logarithmically equidistant (@p log_flag==true). The result is stored
  *  in the @p array dist.
+ *  @param partCfg  particle collection.
  *  @param p1_types list with types of particles to find the distribution for.
  *  @param p2_types list with types of particles the others are distributed
  *                  around.
@@ -96,6 +98,7 @@ void calc_part_distribution(PartCfg &partCfg, std::vector<int> const &p1_types,
  *  and sf[1]=1. For q=7, there are no possible wave vectors, so
  *  sf[2*(7-1)]=sf[2*(7-1)+1]=0.
  *
+ *  @param partCfg   particle collection
  *  @param p_types   list with types of particles to be analyzed
  *  @param order     the maximum wave vector length in 2PI/L
  */
diff --git a/src/core/stokesian_dynamics/sd_interface.cpp b/src/core/stokesian_dynamics/sd_interface.cpp
index 626bbdc3fa6..86c4bcdfa1c 100644
--- a/src/core/stokesian_dynamics/sd_interface.cpp
+++ b/src/core/stokesian_dynamics/sd_interface.cpp
@@ -119,8 +119,6 @@ void set_sd_viscosity(double eta) {
   sd_viscosity = eta;
 }
 
-double get_sd_viscosity() { return sd_viscosity; }
-
 void set_sd_radius_dict(std::unordered_map<int, double> const &x) {
   /* Check that radii are positive */
   for (auto const &kv : x) {
@@ -134,8 +132,6 @@ void set_sd_radius_dict(std::unordered_map<int, double> const &x) {
   radius_dict = x;
 }
 
-std::unordered_map<int, double> get_sd_radius_dict() { return radius_dict; }
-
 void set_sd_kT(double kT) {
   if (kT < 0.0) {
     throw std::runtime_error("kT has an invalid value: " + std::to_string(kT));
@@ -148,8 +144,6 @@ double get_sd_kT() { return sd_kT; }
 
 void set_sd_flags(int flg) { sd_flags = flg; }
 
-int get_sd_flags() { return sd_flags; }
-
 void propagate_vel_pos_sd(const ParticleRange &particles,
                           const boost::mpi::communicator &comm,
                           const double time_step) {
diff --git a/src/core/stokesian_dynamics/sd_interface.hpp b/src/core/stokesian_dynamics/sd_interface.hpp
index 01219147538..d9107a95bdd 100644
--- a/src/core/stokesian_dynamics/sd_interface.hpp
+++ b/src/core/stokesian_dynamics/sd_interface.hpp
@@ -35,16 +35,13 @@
 #include <unordered_map>
 
 void set_sd_viscosity(double eta);
-double get_sd_viscosity();
 
 void set_sd_radius_dict(std::unordered_map<int, double> const &x);
-std::unordered_map<int, double> get_sd_radius_dict();
 
 void set_sd_kT(double kT);
 double get_sd_kT();
 
 void set_sd_flags(int flg);
-int get_sd_flags();
 
 /** Takes the forces and torques on all particles and computes their
  *  velocities. Acts globally on particles on all nodes; i.e. particle data
diff --git a/src/core/unit_tests/Particle_test.cpp b/src/core/unit_tests/Particle_test.cpp
index 1f1fe77b6cc..a73eb0ad252 100644
--- a/src/core/unit_tests/Particle_test.cpp
+++ b/src/core/unit_tests/Particle_test.cpp
@@ -122,3 +122,76 @@ BOOST_AUTO_TEST_CASE(properties_serialization) {
     BOOST_CHECK_EQUAL(out.identity, prop.identity);
   }
 }
+
+/** Compare forces element-wise (torques too when ROTATION is defined). */
+void check_particle_force(ParticleForce const &out, ParticleForce const &ref) {
+  BOOST_TEST(out.f == ref.f, boost::test_tools::per_element());
+#ifdef ROTATION
+  BOOST_TEST(out.torque == ref.torque, boost::test_tools::per_element());
+#endif
+}
+
+// Declare ParticleForce as statically serializable; presumably this is what
+// lets the memcpy archives below accept it -- TODO confirm.
+namespace Utils {
+template <>
+struct is_statically_serializable<ParticleForce> : std::true_type {};
+} // namespace Utils
+
+/** Round-trip a force through a byte buffer and compare with the source. */
+BOOST_AUTO_TEST_CASE(force_serialization) {
+  auto const expected_size =
+      Utils::MemcpyOArchive::packing_size<ParticleForce>();
+
+  // the packed form may not exceed the in-memory size of the struct
+  BOOST_CHECK_LE(expected_size, sizeof(ParticleForce));
+
+  std::vector<char> buf(expected_size);
+
+  auto pf = ParticleForce{{1, 2, 3}};
+#ifdef ROTATION
+  pf.torque = {4, 5, 6};
+#endif
+
+  // serialize: the archive must write exactly expected_size bytes
+  {
+    auto oa = Utils::MemcpyOArchive{Utils::make_span(buf)};
+
+    oa << pf;
+
+    BOOST_CHECK_EQUAL(oa.bytes_written(), expected_size);
+  }
+
+  // deserialize: the archive must read exactly expected_size bytes
+  {
+    auto ia = Utils::MemcpyIArchive{Utils::make_span(buf)};
+    ParticleForce out;
+
+    ia >> out;
+
+    BOOST_CHECK_EQUAL(ia.bytes_read(), expected_size);
+    check_particle_force(out, pf);
+  }
+}
+
+/** Check that copy construction and copy assignment preserve the force. */
+BOOST_AUTO_TEST_CASE(force_constructors) {
+
+  auto pf = ParticleForce{{1, 2, 3}};
+#ifdef ROTATION
+  pf.torque = {4, 5, 6};
+#endif
+
+  // check copy constructor
+  {
+    ParticleForce out(pf);
+    check_particle_force(out, pf);
+  }
+
+  // check copy assignment operator
+  {
+    ParticleForce out; // declared first so that '=' below is an assignment
+    out = pf;
+    check_particle_force(out, pf);
+  }
+}
diff --git a/src/core/virtual_sites/lb_inertialess_tracers_cuda.cu b/src/core/virtual_sites/lb_inertialess_tracers_cuda.cu
index 8aecba41972..9f6dddbf21b 100644
--- a/src/core/virtual_sites/lb_inertialess_tracers_cuda.cu
+++ b/src/core/virtual_sites/lb_inertialess_tracers_cuda.cu
@@ -17,7 +17,6 @@
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
 
-// *******
 // This is an internal file of the IMMERSED BOUNDARY implementation
 // It should not be included by any main ESPResSo routines
 // Functions to be exported for ESPResSo are in ibm_main.hpp
@@ -31,7 +30,7 @@
 
 #include "Particle.hpp"
 #include "cuda_interface.hpp"
-#include "cuda_utils.hpp"
+#include "cuda_utils.cuh"
 #include "grid_based_algorithms/lb_boundaries.hpp"
 #include "grid_based_algorithms/lbgpu.cuh"
 #include "grid_based_algorithms/lbgpu.hpp"
@@ -40,29 +39,31 @@
 
 #include <cstddef>
 
-// To avoid include of communication.hpp in cuda file
+// To avoid including communication.hpp
 extern int this_node;
 
-// ***** Other functions for internal use *****
-void InitCUDA_IBM(int numParticles);
+// Other functions for internal use
+void InitCUDA_IBM(std::size_t numParticles);
 
-// ***** Our own global variables ********
+// Our own global variables
 IBM_CUDA_ParticleDataInput *IBM_ParticleDataInput_device = nullptr;
 IBM_CUDA_ParticleDataOutput *IBM_ParticleDataOutput_device = nullptr;
-int IBM_numParticlesCache = -1; // To detect a change in particle number which
-                                // requires reallocation of memory
+bool IBM_initialized = false;
+std::size_t IBM_numParticlesCache = 0; // To detect a change in particle number
+                                       // which requires reallocation of memory
 
-// ****** These variables are defined in lbgpu_cuda.cu, but we also want them
-// here ****
+// These variables are defined in lbgpu_cuda.cu, but we also want them here
 extern LB_node_force_density_gpu node_f;
 extern LB_nodes_gpu *current_nodes;
 
-// ** These variables are static in lbgpu_cuda.cu, so we need to duplicate them
+// These variables are static in lbgpu_cuda.cu, so we need to duplicate them
 // here. They are initialized in ForcesIntoFluid. The pointers are on the host,
 // but point into device memory.
 LB_parameters_gpu *para_gpu = nullptr;
 float *lb_boundary_velocity_IBM = nullptr;
 
+static constexpr unsigned int threads_per_block = 64;
+
 /** @copybrief calc_m_from_n
  *
  *  This is a re-implementation of @ref calc_m_from_n. It does exactly the
@@ -73,60 +74,60 @@ __device__ void Calc_m_from_n_IBM(const LB_nodes_gpu n_a,
                                   const LB_parameters_gpu *const paraP) {
   const LB_parameters_gpu &para = *paraP;
   // mass mode
-  mode[0] = n_a.vd[0 * para.number_of_nodes + index] +
-            n_a.vd[1 * para.number_of_nodes + index] +
-            n_a.vd[2 * para.number_of_nodes + index] +
-            n_a.vd[3 * para.number_of_nodes + index] +
-            n_a.vd[4 * para.number_of_nodes + index] +
-            n_a.vd[5 * para.number_of_nodes + index] +
-            n_a.vd[6 * para.number_of_nodes + index] +
-            n_a.vd[7 * para.number_of_nodes + index] +
-            n_a.vd[8 * para.number_of_nodes + index] +
-            n_a.vd[9 * para.number_of_nodes + index] +
-            n_a.vd[10 * para.number_of_nodes + index] +
-            n_a.vd[11 * para.number_of_nodes + index] +
-            n_a.vd[12 * para.number_of_nodes + index] +
-            n_a.vd[13 * para.number_of_nodes + index] +
-            n_a.vd[14 * para.number_of_nodes + index] +
-            n_a.vd[15 * para.number_of_nodes + index] +
-            n_a.vd[16 * para.number_of_nodes + index] +
-            n_a.vd[17 * para.number_of_nodes + index] +
-            n_a.vd[18 * para.number_of_nodes + index];
+  mode[0] = n_a.populations[0 * para.number_of_nodes + index] +
+            n_a.populations[1 * para.number_of_nodes + index] +
+            n_a.populations[2 * para.number_of_nodes + index] +
+            n_a.populations[3 * para.number_of_nodes + index] +
+            n_a.populations[4 * para.number_of_nodes + index] +
+            n_a.populations[5 * para.number_of_nodes + index] +
+            n_a.populations[6 * para.number_of_nodes + index] +
+            n_a.populations[7 * para.number_of_nodes + index] +
+            n_a.populations[8 * para.number_of_nodes + index] +
+            n_a.populations[9 * para.number_of_nodes + index] +
+            n_a.populations[10 * para.number_of_nodes + index] +
+            n_a.populations[11 * para.number_of_nodes + index] +
+            n_a.populations[12 * para.number_of_nodes + index] +
+            n_a.populations[13 * para.number_of_nodes + index] +
+            n_a.populations[14 * para.number_of_nodes + index] +
+            n_a.populations[15 * para.number_of_nodes + index] +
+            n_a.populations[16 * para.number_of_nodes + index] +
+            n_a.populations[17 * para.number_of_nodes + index] +
+            n_a.populations[18 * para.number_of_nodes + index];
 
   // momentum modes
 
-  mode[1] = (n_a.vd[1 * para.number_of_nodes + index] -
-             n_a.vd[2 * para.number_of_nodes + index]) +
-            (n_a.vd[7 * para.number_of_nodes + index] -
-             n_a.vd[8 * para.number_of_nodes + index]) +
-            (n_a.vd[9 * para.number_of_nodes + index] -
-             n_a.vd[10 * para.number_of_nodes + index]) +
-            (n_a.vd[11 * para.number_of_nodes + index] -
-             n_a.vd[12 * para.number_of_nodes + index]) +
-            (n_a.vd[13 * para.number_of_nodes + index] -
-             n_a.vd[14 * para.number_of_nodes + index]);
-
-  mode[2] = (n_a.vd[3 * para.number_of_nodes + index] -
-             n_a.vd[4 * para.number_of_nodes + index]) +
-            (n_a.vd[7 * para.number_of_nodes + index] -
-             n_a.vd[8 * para.number_of_nodes + index]) -
-            (n_a.vd[9 * para.number_of_nodes + index] -
-             n_a.vd[10 * para.number_of_nodes + index]) +
-            (n_a.vd[15 * para.number_of_nodes + index] -
-             n_a.vd[16 * para.number_of_nodes + index]) +
-            (n_a.vd[17 * para.number_of_nodes + index] -
-             n_a.vd[18 * para.number_of_nodes + index]);
-
-  mode[3] = (n_a.vd[5 * para.number_of_nodes + index] -
-             n_a.vd[6 * para.number_of_nodes + index]) +
-            (n_a.vd[11 * para.number_of_nodes + index] -
-             n_a.vd[12 * para.number_of_nodes + index]) -
-            (n_a.vd[13 * para.number_of_nodes + index] -
-             n_a.vd[14 * para.number_of_nodes + index]) +
-            (n_a.vd[15 * para.number_of_nodes + index] -
-             n_a.vd[16 * para.number_of_nodes + index]) -
-            (n_a.vd[17 * para.number_of_nodes + index] -
-             n_a.vd[18 * para.number_of_nodes + index]);
+  mode[1] = (n_a.populations[1 * para.number_of_nodes + index] -
+             n_a.populations[2 * para.number_of_nodes + index]) +
+            (n_a.populations[7 * para.number_of_nodes + index] -
+             n_a.populations[8 * para.number_of_nodes + index]) +
+            (n_a.populations[9 * para.number_of_nodes + index] -
+             n_a.populations[10 * para.number_of_nodes + index]) +
+            (n_a.populations[11 * para.number_of_nodes + index] -
+             n_a.populations[12 * para.number_of_nodes + index]) +
+            (n_a.populations[13 * para.number_of_nodes + index] -
+             n_a.populations[14 * para.number_of_nodes + index]);
+
+  mode[2] = (n_a.populations[3 * para.number_of_nodes + index] -
+             n_a.populations[4 * para.number_of_nodes + index]) +
+            (n_a.populations[7 * para.number_of_nodes + index] -
+             n_a.populations[8 * para.number_of_nodes + index]) -
+            (n_a.populations[9 * para.number_of_nodes + index] -
+             n_a.populations[10 * para.number_of_nodes + index]) +
+            (n_a.populations[15 * para.number_of_nodes + index] -
+             n_a.populations[16 * para.number_of_nodes + index]) +
+            (n_a.populations[17 * para.number_of_nodes + index] -
+             n_a.populations[18 * para.number_of_nodes + index]);
+
+  mode[3] = (n_a.populations[5 * para.number_of_nodes + index] -
+             n_a.populations[6 * para.number_of_nodes + index]) +
+            (n_a.populations[11 * para.number_of_nodes + index] -
+             n_a.populations[12 * para.number_of_nodes + index]) -
+            (n_a.populations[13 * para.number_of_nodes + index] -
+             n_a.populations[14 * para.number_of_nodes + index]) +
+            (n_a.populations[15 * para.number_of_nodes + index] -
+             n_a.populations[16 * para.number_of_nodes + index]) -
+            (n_a.populations[17 * para.number_of_nodes + index] -
+             n_a.populations[18 * para.number_of_nodes + index]);
 }
 
 __global__ void
@@ -233,8 +234,8 @@ __global__ void ParticleVelocitiesFromLB_Kernel(
                     particles_input[particleIndex].pos[2]};
     float v[3] = {0};
 
-    // ***** This part is copied from get_interpolated_velocity
-    // ***** + we add the force + we consider boundaries
+    // This part is copied from get_interpolated_velocity
+    // + we add the force + we consider boundaries
 
     float temp_delta[6];
     float delta[8];
@@ -361,14 +362,8 @@ __global__ void ResetLBForces_Kernel(LB_node_force_density_gpu node_f,
 /** Call a kernel to reset the forces on the LB nodes to the external force. */
 void IBM_ResetLBForces_GPU() {
   if (this_node == 0) {
-    // Setup for kernel call
-    int threads_per_block = 64;
-    int blocks_per_grid_y = 4;
-    auto blocks_per_grid_x =
-        static_cast<int>((lbpar_gpu.number_of_nodes +
-                          threads_per_block * blocks_per_grid_y - 1) /
-                         (threads_per_block * blocks_per_grid_y));
-    dim3 dim_grid = make_uint3(blocks_per_grid_x, blocks_per_grid_y, 1);
+    dim3 dim_grid =
+        calculate_dim_grid(lbpar_gpu.number_of_nodes, 4, threads_per_block);
 
     KERNELCALL(ResetLBForces_Kernel, dim_grid, threads_per_block, node_f,
                para_gpu);
@@ -386,65 +381,56 @@ void IBM_ForcesIntoFluid_GPU(ParticleRange particles) {
   // (2) Copy forces to the GPU
   // (3) interpolate on the LBM grid and spread forces
 
-  const int numParticles = gpu_get_particle_pointer().size();
+  auto const numParticles = gpu_get_particle_pointer().size();
 
-  // Storage only needed on master and allocated only once at the first time
-  // step if ( IBM_ParticleDataInput_host == nullptr && this_node == 0 )
-  if (IBM_ParticleDataInput_host == nullptr ||
+  // Storage only needed on head node
+  if (IBM_ParticleDataInput_host.empty() || !IBM_initialized ||
       numParticles != IBM_numParticlesCache)
     InitCUDA_IBM(numParticles);
 
   // We gather particle positions and forces from all nodes
   IBM_cuda_mpi_get_particles(particles);
 
-  // ***** GPU stuff only on master *****
+  // GPU only on head node
   if (this_node == 0 && numParticles > 0) {
 
     // Copy data to device
     cuda_safe_mem(cudaMemcpy(IBM_ParticleDataInput_device,
-                             IBM_ParticleDataInput_host,
+                             IBM_ParticleDataInput_host.data(),
                              numParticles * sizeof(IBM_CUDA_ParticleDataInput),
                              cudaMemcpyHostToDevice));
 
     // Kernel call for spreading the forces on the LB grid
-    int threads_per_block_particles = 64;
-    int blocks_per_grid_particles_y = 4;
-    int blocks_per_grid_particles_x =
-        (numParticles +
-         threads_per_block_particles * blocks_per_grid_particles_y - 1) /
-        (threads_per_block_particles * blocks_per_grid_particles_y);
-    dim3 dim_grid_particles =
-        make_uint3(blocks_per_grid_particles_x, blocks_per_grid_particles_y, 1);
-
-    KERNELCALL(ForcesIntoFluid_Kernel, dim_grid_particles,
-               threads_per_block_particles, IBM_ParticleDataInput_device,
-               numParticles, node_f, para_gpu);
+    dim3 dim_grid = calculate_dim_grid(static_cast<unsigned>(numParticles), 4,
+                                       threads_per_block);
+    KERNELCALL(ForcesIntoFluid_Kernel, dim_grid, threads_per_block,
+               IBM_ParticleDataInput_device, numParticles, node_f, para_gpu);
   }
 }
 
-void InitCUDA_IBM(const int numParticles) {
+void InitCUDA_IBM(std::size_t const numParticles) {
 
-  if (this_node == 0) // GPU only on master
-  {
+  // GPU only on head node
+  if (this_node == 0) {
 
     // Check if we have to delete
-    if (IBM_ParticleDataInput_host != nullptr) {
-      delete[] IBM_ParticleDataInput_host;
-      delete[] IBM_ParticleDataOutput_host;
+    if (IBM_initialized) { // also set after an init with 0 particles
+      IBM_ParticleDataInput_host.clear();
+      IBM_ParticleDataOutput_host.clear();
       cuda_safe_mem(cudaFree(IBM_ParticleDataInput_device));
       cuda_safe_mem(cudaFree(IBM_ParticleDataOutput_device));
       cuda_safe_mem(cudaFree(lb_boundary_velocity_IBM));
     }
 
     // Back and forth communication of positions and velocities
-    IBM_ParticleDataInput_host = new IBM_CUDA_ParticleDataInput[numParticles];
+    IBM_ParticleDataInput_host.resize(numParticles);
+    IBM_ParticleDataOutput_host.resize(numParticles);
     cuda_safe_mem(
         cudaMalloc((void **)&IBM_ParticleDataInput_device,
                    numParticles * sizeof(IBM_CUDA_ParticleDataInput)));
     cuda_safe_mem(
         cudaMalloc((void **)&IBM_ParticleDataOutput_device,
                    numParticles * sizeof(IBM_CUDA_ParticleDataOutput)));
-    IBM_ParticleDataOutput_host = new IBM_CUDA_ParticleDataOutput[numParticles];
 
     // Use LB parameters
     lb_get_para_pointer(&para_gpu);
@@ -480,6 +466,7 @@ void InitCUDA_IBM(const int numParticles) {
 #endif
 
     IBM_numParticlesCache = numParticles;
+    IBM_initialized = true;
   }
 }
 
@@ -492,34 +479,26 @@ void ParticleVelocitiesFromLB_GPU(ParticleRange particles) {
   // (2) transfer velocities back to CPU
   // (3) spread velocities to local cells via MPI
 
-  const int numParticles = gpu_get_particle_pointer().size();
+  auto const numParticles = gpu_get_particle_pointer().size();
 
-  // **** GPU stuff only on master ****
+  // GPU only on head node
   if (this_node == 0 && numParticles > 0) {
     // Kernel call
-    int threads_per_block_particles = 64;
-    int blocks_per_grid_particles_y = 4;
-    int blocks_per_grid_particles_x =
-        (numParticles +
-         threads_per_block_particles * blocks_per_grid_particles_y - 1) /
-        (threads_per_block_particles * blocks_per_grid_particles_y);
-    dim3 dim_grid_particles =
-        make_uint3(blocks_per_grid_particles_x, blocks_per_grid_particles_y, 1);
-    KERNELCALL(ParticleVelocitiesFromLB_Kernel, dim_grid_particles,
-               threads_per_block_particles, *current_nodes,
-               IBM_ParticleDataInput_device, numParticles,
+    dim3 dim_grid = calculate_dim_grid(static_cast<unsigned>(numParticles), 4,
+                                       threads_per_block);
+    KERNELCALL(ParticleVelocitiesFromLB_Kernel, dim_grid, threads_per_block,
+               *current_nodes, IBM_ParticleDataInput_device, numParticles,
                IBM_ParticleDataOutput_device, node_f, lb_boundary_velocity_IBM,
                para_gpu);
 
     // Copy velocities from device to host
-    cuda_safe_mem(cudaMemcpy(IBM_ParticleDataOutput_host,
+    cuda_safe_mem(cudaMemcpy(IBM_ParticleDataOutput_host.data(),
                              IBM_ParticleDataOutput_device,
                              numParticles * sizeof(IBM_CUDA_ParticleDataOutput),
                              cudaMemcpyDeviceToHost));
   }
 
-  // ***** Back to all nodes ****
-  // Spread using MPI
+  // Scatter to all nodes
   IBM_cuda_mpi_send_velocities(particles);
 }
 
diff --git a/src/core/virtual_sites/lb_inertialess_tracers_cuda_interface.cpp b/src/core/virtual_sites/lb_inertialess_tracers_cuda_interface.cpp
index b8a4c70c9b1..a363e546c26 100644
--- a/src/core/virtual_sites/lb_inertialess_tracers_cuda_interface.cpp
+++ b/src/core/virtual_sites/lb_inertialess_tracers_cuda_interface.cpp
@@ -35,33 +35,32 @@
 #include <utils/mpi/gather_buffer.hpp>
 #include <utils/mpi/scatter_buffer.hpp>
 
+#include <vector>
+
 // Variables for communication
-IBM_CUDA_ParticleDataInput *IBM_ParticleDataInput_host = nullptr;
-IBM_CUDA_ParticleDataOutput *IBM_ParticleDataOutput_host = nullptr;
+std::vector<IBM_CUDA_ParticleDataInput> IBM_ParticleDataInput_host = {};
+std::vector<IBM_CUDA_ParticleDataOutput> IBM_ParticleDataOutput_host = {};
 
-namespace {
-void pack_particles(ParticleRange particles,
-                    IBM_CUDA_ParticleDataInput *buffer) {
-  int dummy[3] = {0, 0, 0};
+static void pack_particles(ParticleRange particles,
+                           std::vector<IBM_CUDA_ParticleDataInput> &buffer) {
 
   int i = 0;
   for (auto const &part : particles) {
-    Utils::Vector3d pos = folded_position(part.r.p, box_geo);
+    auto const pos = folded_position(part.r.p, box_geo);
 
-    buffer[i].pos[0] = (float)pos[0];
-    buffer[i].pos[1] = (float)pos[1];
-    buffer[i].pos[2] = (float)pos[2];
+    buffer[i].pos[0] = static_cast<float>(pos[0]);
+    buffer[i].pos[1] = static_cast<float>(pos[1]);
+    buffer[i].pos[2] = static_cast<float>(pos[2]);
 
-    buffer[i].f[0] = (float)part.f.f[0];
-    buffer[i].f[1] = (float)part.f.f[1];
-    buffer[i].f[2] = (float)part.f.f[2];
+    buffer[i].f[0] = static_cast<float>(part.f.f[0]);
+    buffer[i].f[1] = static_cast<float>(part.f.f[1]);
+    buffer[i].f[2] = static_cast<float>(part.f.f[2]);
 
     buffer[i].is_virtual = part.p.is_virtual;
 
     i++;
   }
 }
-} // namespace
 
 /** Gather particle positions on the master node in order to communicate them
  *  to GPU. We transfer all particles (real and virtual), but actually we would
@@ -75,30 +74,28 @@ void IBM_cuda_mpi_get_particles(ParticleRange particles) {
     static std::vector<IBM_CUDA_ParticleDataInput> buffer;
     buffer.resize(n_part);
     /* pack local parts into buffer */
-    pack_particles(particles, buffer.data());
+    pack_particles(particles, buffer);
 
-    Utils::Mpi::gather_buffer(buffer.data(), buffer.size(), comm_cart);
+    Utils::Mpi::gather_buffer(buffer, comm_cart);
   } else {
     /* Pack own particles */
     pack_particles(particles, IBM_ParticleDataInput_host);
 
-    Utils::Mpi::gather_buffer(IBM_ParticleDataInput_host, n_part, comm_cart);
+    Utils::Mpi::gather_buffer(IBM_ParticleDataInput_host, comm_cart);
   }
 }
 
-namespace {
-void set_velocities(ParticleRange particles,
-                    IBM_CUDA_ParticleDataOutput *buffer) {
+static void set_velocities(ParticleRange particles,
+                           std::vector<IBM_CUDA_ParticleDataOutput> &buffer) {
   int i = 0;
   for (auto &part : particles) {
-    if (part.p.is_virtual)
+    if (part.p.is_virtual) {
       for (int j = 0; j < 3; j++)
-        part.m.v[j] = buffer[i].v[j];
-
+        part.m.v[j] = static_cast<double>(buffer[i].v[j]);
+    }
     i++;
   }
 }
-} // namespace
 
 /** Particle velocities have been communicated from GPU, now transmit to all
  *  nodes. Analogous to @ref cuda_mpi_send_forces.
@@ -113,10 +110,11 @@ void IBM_cuda_mpi_send_velocities(ParticleRange particles) {
 
     Utils::Mpi::scatter_buffer(buffer.data(), n_part, comm_cart);
 
-    set_velocities(particles, buffer.data());
+    set_velocities(particles, buffer);
   } else {
     /* Scatter forces to slaves */
-    Utils::Mpi::scatter_buffer(IBM_ParticleDataOutput_host, n_part, comm_cart);
+    Utils::Mpi::scatter_buffer(IBM_ParticleDataOutput_host.data(), n_part,
+                               comm_cart);
 
     set_velocities(particles, IBM_ParticleDataOutput_host);
   }
diff --git a/src/core/virtual_sites/lb_inertialess_tracers_cuda_interface.hpp b/src/core/virtual_sites/lb_inertialess_tracers_cuda_interface.hpp
index 80e02c6709a..db948444aca 100644
--- a/src/core/virtual_sites/lb_inertialess_tracers_cuda_interface.hpp
+++ b/src/core/virtual_sites/lb_inertialess_tracers_cuda_interface.hpp
@@ -31,6 +31,8 @@
 
 #include "ParticleRange.hpp"
 
+#include <vector>
+
 // *********** Communication functions ********
 // Implemented in real C++, but called from the ibm_cuda.cu
 void IBM_cuda_mpi_send_velocities(ParticleRange particles);
@@ -50,8 +52,8 @@ typedef struct {
 } IBM_CUDA_ParticleDataOutput;
 
 // ******** global variables for CUDA and MPI communication ******
-extern IBM_CUDA_ParticleDataInput *IBM_ParticleDataInput_host;
-extern IBM_CUDA_ParticleDataOutput *IBM_ParticleDataOutput_host;
+extern std::vector<IBM_CUDA_ParticleDataInput> IBM_ParticleDataInput_host;
+extern std::vector<IBM_CUDA_ParticleDataOutput> IBM_ParticleDataOutput_host;
 
 #endif
 
diff --git a/src/python/CMakeLists.txt b/src/python/CMakeLists.txt
index 79372bbed62..170f14123bc 100644
--- a/src/python/CMakeLists.txt
+++ b/src/python/CMakeLists.txt
@@ -2,6 +2,19 @@
 set(PYTHON_DIR ${CMAKE_CURRENT_BINARY_DIR})
 
 set(PYTHON_FRONTEND ${PYTHON_EXECUTABLE})
+# On macOS, use the Python.app binary found under the interpreter's
+# exec_prefix as PYTHON_FRONTEND when it exists; otherwise keep the default.
+if(CMAKE_SYSTEM_NAME STREQUAL "Darwin")
+  execute_process(
+    COMMAND "${PYTHON_FRONTEND}" "-c"
+            "import sysconfig; print(sysconfig.get_config_var('exec_prefix'))"
+    OUTPUT_VARIABLE DARWIN_EXEC_PREFIX OUTPUT_STRIP_TRAILING_WHITESPACE)
+  set(PYTHON_FRONTEND_TMP
+      "${DARWIN_EXEC_PREFIX}/Resources/Python.app/Contents/MacOS/Python")
+  if(EXISTS "${PYTHON_FRONTEND_TMP}")
+    set(PYTHON_FRONTEND "${PYTHON_FRONTEND_TMP}")
+  endif()
+endif()
 configure_file(pypresso.cmakein ${CMAKE_BINARY_DIR}/pypresso @ONLY)
 
 if(IPYTHON_EXECUTABLE)
diff --git a/src/python/espressomd/actors.pyx b/src/python/espressomd/actors.pyx
index 35a1fceb599..fad742ce4bb 100644
--- a/src/python/espressomd/actors.pyx
+++ b/src/python/espressomd/actors.pyx
@@ -16,7 +16,7 @@
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 include "myconfig.pxi"
 from .highlander import ThereCanOnlyBeOne
-from .utils cimport handle_errors
+from .utils import handle_errors
 
 cdef class Actor:
 
@@ -79,7 +79,7 @@ cdef class Actor:
         if inter in Actor.active_list:
             if not Actor.active_list[inter]:
                 raise Exception(
-                    "Class not registered in Actor.active_list " + self.__class__.__bases__[0])
+                    "Class not registered in Actor.active_list: " + self.__class__.__bases__[0].__name__)
             Actor.active_list[inter] = False
 
     def is_valid(self):
diff --git a/src/python/espressomd/analyze.pyx b/src/python/espressomd/analyze.pyx
index 66343b96c4f..7f3bad50c93 100644
--- a/src/python/espressomd/analyze.pyx
+++ b/src/python/espressomd/analyze.pyx
@@ -31,9 +31,9 @@ from .globals import Globals
 
 from collections import OrderedDict
 from .system import System
-from .utils import array_locked, is_valid_type
+from .utils import array_locked, is_valid_type, handle_errors
 from .utils cimport Vector3i, Vector3d, Vector9d
-from .utils cimport handle_errors, check_type_or_throw_except
+from .utils cimport check_type_or_throw_except
 from .utils cimport create_nparray_from_double_array
 from .particle_data cimport get_n_part
 
diff --git a/src/python/espressomd/cellsystem.pyx b/src/python/espressomd/cellsystem.pyx
index 101d73b4d49..ecb96ba253f 100644
--- a/src/python/espressomd/cellsystem.pyx
+++ b/src/python/espressomd/cellsystem.pyx
@@ -25,7 +25,8 @@ from .globals cimport verlet_reuse, skin
 from .globals cimport mpi_bcast_parameter
 from libcpp.vector cimport vector
 from .cellsystem cimport cell_structure
-from .utils cimport handle_errors, Vector3i, check_type_or_throw_except
+from .utils import handle_errors
+from .utils cimport Vector3i, check_type_or_throw_except
 
 
 cdef class CellSystem:
diff --git a/src/python/espressomd/collision_detection.pyx b/src/python/espressomd/collision_detection.pyx
index 4674f84b789..5797a0647ff 100644
--- a/src/python/espressomd/collision_detection.pyx
+++ b/src/python/espressomd/collision_detection.pyx
@@ -16,7 +16,7 @@
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 from .script_interface import ScriptInterfaceHelper, script_interface_register
 from .utils import to_str
-from .utils cimport handle_errors
+from .utils import handle_errors
 from .interactions import BondedInteraction, BondedInteractions
 
 
@@ -117,7 +117,7 @@ class CollisionDetection(ScriptInterfaceHelper):
 
         """
 
-        if not ("mode" in kwargs):
+        if "mode" not in kwargs:
             raise Exception(
                 "Collision mode must be specified via the mode keyword argument")
 
diff --git a/src/python/espressomd/constraints.py b/src/python/espressomd/constraints.py
index d5cdbe5f400..91cb63a8f69 100644
--- a/src/python/espressomd/constraints.py
+++ b/src/python/espressomd/constraints.py
@@ -342,10 +342,11 @@ class ForceField(_Interpolated):
         Spacing of the grid points.
     default_scale : :obj:`float`
         Scaling factor for particles that have no individual scaling factor.
-    particle_scales : array_like of (:obj:`int`, :obj:`float`)
-        A list of tuples of ids and scaling factors. For
-        particles in the list the interaction is scaled with
-        their individual scaling factor before it is applied.
+    particle_scales : :obj:`dict`
+        A dictionary mapping particle ids to scaling factors.
+        For these particles, the interaction is scaled with
+        their individual scaling factor. Other particles are
+        scaled with the default scaling factor.
 
     """
 
@@ -373,10 +374,11 @@ class PotentialField(_Interpolated):
         Spacing of the grid points.
     default_scale : :obj:`float`
         Scaling factor for particles that have no individual scaling factor.
-    particle_scales : array_like (:obj:`int`, :obj:`float`)
-        A list of tuples of ids and scaling factors. For
-        particles in the list the interaction is scaled with
-        their individual scaling factor before it is applied.
+    particle_scales : :obj:`dict`
+        A dictionary mapping particle ids to scaling factors.
+        For these particles, the interaction is scaled with
+        their individual scaling factor. Other particles are
+        scaled with the default scaling factor.
 
     """
 
diff --git a/src/python/espressomd/cuda_init.pxd b/src/python/espressomd/cuda_init.pxd
index 34d7a6c05d7..34c3e703475 100644
--- a/src/python/espressomd/cuda_init.pxd
+++ b/src/python/espressomd/cuda_init.pxd
@@ -16,9 +16,24 @@
 # You should have received a copy of the GNU General Public License
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 #
+
+from libcpp.vector cimport vector
+
 cdef extern from "cuda_init.hpp":
-    int cuda_set_device(int dev) except +
-    int cuda_get_device()
-    cdef int cuda_get_n_gpus()
-    void cuda_get_gpu_name(int dev, char name[64])
-#    int getdevicelist(int* devl, char* devname)
+    # Fields of the C++ struct EspressoGpuDevice that are exposed to Cython.
+    cdef struct EspressoGpuDevice:
+        int id
+        char name[64]
+        char proc_name[64]
+        int node
+        int compute_capability_major
+        int compute_capability_minor
+        size_t total_memory
+        int n_cores
+
+    # `except +` lets Cython translate C++ exceptions into Python exceptions.
+    void cuda_set_device(int dev) except +
+    int cuda_get_device() except +
+    int cuda_get_n_gpus() except +
+    void cuda_get_gpu_name(int dev, char name[64]) except +
+    vector[EspressoGpuDevice] cuda_gather_gpus() except +
diff --git a/src/python/espressomd/cuda_init.pyx b/src/python/espressomd/cuda_init.pyx
index 7d611bd769f..8e70aab0e4d 100644
--- a/src/python/espressomd/cuda_init.pyx
+++ b/src/python/espressomd/cuda_init.pyx
@@ -18,6 +18,7 @@
 #
 include "myconfig.pxi"
 from . cimport cuda_init
+from . import utils
 
 cdef class CudaInitHandle:
     def __init__(self):
@@ -37,12 +38,10 @@ cdef class CudaInitHandle:
 
             """
             dev = cuda_get_device()
-            if dev == -1:
-                raise Exception("cuda device get error")
             return dev
 
         @device.setter
-        def device(self, int _dev):
+        def device(self, int dev):
             """
             Specify which device to use.
 
@@ -52,35 +51,72 @@ cdef class CudaInitHandle:
                 Set the device id of the graphics card to use.
 
             """
-            cuda_set_device(_dev)
+            cuda_set_device(dev)
 
-    IF CUDA == 1:
-        @property
-        def device_list(self):
+        def list_devices(self):
             """
             List devices.
 
             Returns
             -------
-            :obj:`list` :
+            :obj:`dict` :
                 List of available CUDA devices.
 
             """
             cdef char gpu_name_buffer[4 + 64]
+            n_gpus = 0
+            try:
+                n_gpus = cuda_get_n_gpus()
+            except RuntimeError:
+                pass
             devices = dict()
-            for i in range(cuda_get_n_gpus()):
-                cuda_get_gpu_name(i, gpu_name_buffer)
-                devices[i] = gpu_name_buffer
+            for i in range(n_gpus):
+                try:
+                    cuda_get_gpu_name(i, gpu_name_buffer)
+                except RuntimeError:
+                    continue
+                devices[i] = utils.to_str(gpu_name_buffer)
             return devices
 
-        @device_list.setter
-        def device_list(self, dict _dev_dict):
-            raise Exception("cuda device list is read only")
+        def list_devices_properties(self):
+            """
+            List devices with their properties on each host machine.
 
+            Returns
+            -------
+            :obj:`dict` :
+                Device properties, keyed by hostname, then by device id.
+
+            """
+            cdef vector[EspressoGpuDevice] devices
+            cdef EspressoGpuDevice dev
+            try:
+                devices = cuda_gather_gpus()
+            except RuntimeError:
+                pass  # leave ``devices`` empty so an empty dict is returned
+            resources = dict()
+            for i in range(devices.size()):
+                dev = devices[i]
+                hostname = utils.to_str(dev.proc_name)
+                if hostname not in resources:
+                    resources[hostname] = {}
+                resources[hostname][dev.id] = {
+                    'name': utils.to_str(dev.name),
+                    'compute_capability': (
+                        dev.compute_capability_major,
+                        dev.compute_capability_minor
+                    ),
+                    'cores': dev.n_cores,
+                    'total_memory': dev.total_memory,
+                }
+            return resources
 
 IF CUDA:
     def gpu_available():
-        return cuda_get_n_gpus() > 0
+        try:
+            return cuda_get_n_gpus() > 0
+        except RuntimeError:
+            return False
 ELSE:
     def gpu_available():
         return False
diff --git a/src/python/espressomd/electrokinetics.pxd b/src/python/espressomd/electrokinetics.pxd
index bdbf7cc34ce..7be18e61e6f 100644
--- a/src/python/espressomd/electrokinetics.pxd
+++ b/src/python/espressomd/electrokinetics.pxd
@@ -28,53 +28,103 @@ IF ELECTROKINETICS and CUDA:
         DEF MAX_NUMBER_OF_SPECIES = 10
 
         # EK data struct
-        ctypedef struct EK_parameters:
-            float agrid
-            float time_step
-            float lb_density
-            unsigned int dim_x
-            unsigned int dim_y
-            unsigned int dim_z
-            unsigned int number_of_nodes
-            float viscosity
-            float bulk_viscosity
-            float gamma_odd
-            float gamma_even
-            float friction
-            float T
-            float prefactor
-            float lb_force_density[3]
-            unsigned int number_of_species
-            int reaction_species[3]
-            float rho_reactant_reservoir
-            float rho_product0_reservoir
-            float rho_product1_reservoir
-            float reaction_ct_rate
-            float reaction_fraction_0
-            float reaction_fraction_1
-            float mass_reactant
-            float mass_product0
-            float mass_product1
-            int stencil
-            int number_of_boundary_nodes
-            float fluctuation_amplitude
-            bool fluctuations
-            bool advection
-            bool fluidcoupling_ideal_contribution
-            float * charge_potential
-            ekfloat * j
-            float * lb_force_density_previous
-            ekfloat * rho[MAX_NUMBER_OF_SPECIES]
-            int species_index[MAX_NUMBER_OF_SPECIES]
-            float density[MAX_NUMBER_OF_SPECIES]
-            float D[MAX_NUMBER_OF_SPECIES]
-            float d[MAX_NUMBER_OF_SPECIES]
-            float valency[MAX_NUMBER_OF_SPECIES]
-            float ext_force_density[3][MAX_NUMBER_OF_SPECIES]
-            char * node_is_catalyst
-            bool es_coupling
-            float * charge_potential_buffer
-            float * electric_field
+        IF EK_DEBUG:
+            ctypedef struct EK_parameters:
+                float agrid
+                float time_step
+                float lb_density
+                unsigned int dim_x
+                unsigned int dim_y
+                unsigned int dim_z
+                unsigned int number_of_nodes
+                float viscosity
+                float bulk_viscosity
+                float gamma_odd
+                float gamma_even
+                float friction
+                float T
+                float prefactor
+                float lb_force_density[3]
+                unsigned int number_of_species
+                int reaction_species[3]
+                float rho_reactant_reservoir
+                float rho_product0_reservoir
+                float rho_product1_reservoir
+                float reaction_ct_rate
+                float reaction_fraction_0
+                float reaction_fraction_1
+                float mass_reactant
+                float mass_product0
+                float mass_product1
+                int stencil
+                int number_of_boundary_nodes
+                float fluctuation_amplitude
+                bool fluctuations
+                bool advection
+                bool fluidcoupling_ideal_contribution
+                float * charge_potential
+                ekfloat * j
+                float * lb_force_density_previous
+                ekfloat * j_fluc
+                ekfloat * rho[MAX_NUMBER_OF_SPECIES]
+                int species_index[MAX_NUMBER_OF_SPECIES]
+                float density[MAX_NUMBER_OF_SPECIES]
+                float D[MAX_NUMBER_OF_SPECIES]
+                float d[MAX_NUMBER_OF_SPECIES]
+                float valency[MAX_NUMBER_OF_SPECIES]
+                float ext_force_density[3][MAX_NUMBER_OF_SPECIES]
+                char * node_is_catalyst
+                bool es_coupling
+                float * charge_potential_buffer
+                float * electric_field
+        ELSE:
+            ctypedef struct EK_parameters:
+                float agrid
+                float time_step
+                float lb_density
+                unsigned int dim_x
+                unsigned int dim_y
+                unsigned int dim_z
+                unsigned int number_of_nodes
+                float viscosity
+                float bulk_viscosity
+                float gamma_odd
+                float gamma_even
+                float friction
+                float T
+                float prefactor
+                float lb_force_density[3]
+                unsigned int number_of_species
+                int reaction_species[3]
+                float rho_reactant_reservoir
+                float rho_product0_reservoir
+                float rho_product1_reservoir
+                float reaction_ct_rate
+                float reaction_fraction_0
+                float reaction_fraction_1
+                float mass_reactant
+                float mass_product0
+                float mass_product1
+                int stencil
+                int number_of_boundary_nodes
+                float fluctuation_amplitude
+                bool fluctuations
+                bool advection
+                bool fluidcoupling_ideal_contribution
+                float * charge_potential
+                ekfloat * j
+                float * lb_force_density_previous
+                ekfloat * rho[MAX_NUMBER_OF_SPECIES]
+                int species_index[MAX_NUMBER_OF_SPECIES]
+                float density[MAX_NUMBER_OF_SPECIES]
+                float D[MAX_NUMBER_OF_SPECIES]
+                float d[MAX_NUMBER_OF_SPECIES]
+                float valency[MAX_NUMBER_OF_SPECIES]
+                float ext_force_density[3][MAX_NUMBER_OF_SPECIES]
+                char * node_is_catalyst
+                bool es_coupling
+                float * charge_potential_buffer
+                float * electric_field
 
         cdef extern EK_parameters ek_parameters
 
diff --git a/src/python/espressomd/electrokinetics.pyx b/src/python/espressomd/electrokinetics.pyx
index d24e2cf9b13..73edb490835 100644
--- a/src/python/espressomd/electrokinetics.pyx
+++ b/src/python/espressomd/electrokinetics.pyx
@@ -26,7 +26,7 @@ from . import utils
 import tempfile
 import shutil
 from .utils import is_valid_type
-from .utils cimport Vector3i, Vector6d, handle_errors
+from .utils cimport Vector3i, Vector6d
 import numpy as np
 
 IF ELECTROKINETICS:
@@ -366,7 +366,7 @@ IF ELECTROKINETICS:
             self.node[1] = key[1]
             self.node[2] = key[2]
             if not lb_lbnode_is_index_valid(self.node):
-                raise ValueError("LB node index out of bounds")
+                raise IndexError("LB node index out of bounds")
 
         property potential:
             def __get__(self):
@@ -544,7 +544,7 @@ IF ELECTROKINETICS:
             self.node[2] = key[2]
             self.id = id
             if not lb_lbnode_is_index_valid(self.node):
-                raise ValueError("LB node index out of bounds")
+                raise IndexError("LB node index out of bounds")
 
         property density:
             def __set__(self, value):
@@ -572,4 +572,4 @@ IF ELECTROKINETICS:
                         self.id, self.node[0], self.node[1], self.node[2], flux) != 0:
                     raise Exception("Species has not been added to EK.")
 
-                return np.array(flux[0], flux[1], flux[2])
+                return np.array([flux[0], flux[1], flux[2]])
diff --git a/src/python/espressomd/electrostatic_extensions.pxd b/src/python/espressomd/electrostatic_extensions.pxd
index cc35493a54f..c119a540e78 100644
--- a/src/python/espressomd/electrostatic_extensions.pxd
+++ b/src/python/espressomd/electrostatic_extensions.pxd
@@ -20,31 +20,13 @@
 include "myconfig.pxi"
 from .electrostatics cimport *
 from libcpp.vector cimport vector
-from libcpp cimport bool
 from .utils cimport Vector3d
 
 IF ELECTROSTATICS and P3M:
 
-    cdef extern from "electrostatics_magnetostatics/elc.hpp":
-        ctypedef struct ELC_struct:
-            double maxPWerror
-            double gap_size
-            double far_cut
-            bool neutralize
-            double delta_mid_top,
-            double delta_mid_bot,
-            bool const_pot,
-            double pot_diff
-
-        int ELC_set_params(double maxPWerror, double min_dist, double far_cut,
-                           bool neutralize, double delta_mid_top, double delta_mid_bot, bool const_pot, double pot_diff)
-
-        # links intern C-struct with python object
-        ELC_struct elc_params
-
     cdef extern from "electrostatics_magnetostatics/icc.hpp":
-        ctypedef struct iccp3m_struct:
-            int n_ic
+        ctypedef struct icc_struct:
+            int n_icc
             int num_iteration
             double eout
             vector[double] areas
@@ -58,7 +40,14 @@ IF ELECTROSTATICS and P3M:
             int first_id
 
         # links intern C-struct with python object
-        iccp3m_struct iccp3m_cfg
+        cdef extern icc_struct icc_cfg
+
+        void icc_set_params(int n_icc, double convergence, double relaxation,
+                            Vector3d & ext_field, int max_iterations,
+                            int first_id, double eps_out,
+                            vector[double] & areas,
+                            vector[double] & e_in,
+                            vector[double] & sigma,
+                            vector[Vector3d] & normals) except +
 
-        void iccp3m_alloc_lists()
-        int mpi_iccp3m_init()
+        void icc_deactivate()
diff --git a/src/python/espressomd/electrostatic_extensions.pyx b/src/python/espressomd/electrostatic_extensions.pyx
index 78523d23900..6b289e402a1 100644
--- a/src/python/espressomd/electrostatic_extensions.pyx
+++ b/src/python/espressomd/electrostatic_extensions.pyx
@@ -22,7 +22,9 @@ include "myconfig.pxi"
 from . cimport actors
 from . import actors
 import numpy as np
-from .utils cimport handle_errors, check_type_or_throw_except, check_range_or_except
+from .utils import handle_errors, array_locked
+from .utils cimport check_type_or_throw_except, check_range_or_except, Vector3d, make_Vector3d, make_array_locked, make_array_locked_vector
+from libcpp.vector cimport vector
 
 IF ELECTROSTATICS and P3M:
     from espressomd.electrostatics import check_neutrality
@@ -30,126 +32,6 @@ IF ELECTROSTATICS and P3M:
     cdef class ElectrostaticExtensions(actors.Actor):
         pass
 
-    cdef class ELC(ElectrostaticExtensions):
-        """
-        Electrostatics solver for systems with two periodic dimensions.
-        See :ref:`Electrostatic Layer Correction (ELC)` for more details.
-
-        Parameters
-        ----------
-        gap_size : :obj:`float`, required
-            The gap size gives the height :math:`h` of the empty region between
-            the system box and the neighboring artificial images. |es| does not
-            make sure that the gap is actually empty, this is the user's
-            responsibility. The method will run even if the condition is not
-            fulfilled, however, the error bound will not be reached. Therefore
-            you should really make sure that the gap region is empty (e.g.
-            with wall constraints).
-        maxPWerror : :obj:`float`, required
-            The maximal pairwise error sets the least upper bound (LUB) error
-            of the force between any two charges without prefactors (see the
-            papers). The algorithm tries to find parameters to meet this LUB
-            requirements or will throw an error if there are none.
-        delta_mid_top : :obj:`float`, optional
-            Dielectric contrast :math:`\\Delta_t` between the upper boundary
-            and the simulation box.
-        delta_mid_bottom : :obj:`float`, optional
-            Dielectric contrast :math:`\\Delta_b` between the lower boundary
-            and the simulation box.
-        const_pot : :obj:`bool`, optional
-            Activate a constant electric potential between the top and bottom
-            of the simulation box.
-        pot_diff : :obj:`float`, optional
-            If ``const_pot`` is enabled, this parameter controls the applied
-            voltage between the boundaries of the simulation box in the
-            *z*-direction (at :math:`z = 0` and :math:`z = L_z - h`).
-        neutralize : :obj:`bool`, optional
-            By default, *ELC* just as P3M adds a homogeneous neutralizing
-            background to the system in case of a net charge. However, unlike
-            in three dimensions, this background adds a parabolic potential
-            across the slab :cite:`ballenegger09a`. Therefore, under normal
-            circumstances, you will probably want to disable the neutralization
-            for non-neutral systems. This corresponds then to a formal
-            regularization of the forces and energies :cite:`ballenegger09a`.
-            Also, if you add neutralizing walls explicitly as constraints, you
-            have to disable the neutralization. When using a dielectric
-            contrast or full metallic walls (``delta_mid_top != 0`` or
-            ``delta_mid_bot != 0`` or ``const_pot=True``), ``neutralize`` is
-            overwritten and switched off internally. Note that the special
-            case of non-neutral systems with a *non-metallic* dielectric jump
-            (e.g. ``delta_mid_top`` or ``delta_mid_bot`` in ``]-1,1[``) is not
-            covered by the algorithm and will throw an error.
-        far_cut : :obj:`float`, optional
-            Cutoff radius, use with care, intended for testing purposes. When
-            setting the cutoff directly, the maximal pairwise error is ignored.
-        """
-
-        def validate_params(self):
-            default_params = self.default_params()
-            check_type_or_throw_except(
-                self._params["maxPWerror"], 1, float, "")
-            check_range_or_except(
-                self._params, "maxPWerror", 0, False, "inf", True)
-            check_type_or_throw_except(self._params["gap_size"], 1, float, "")
-            check_range_or_except(
-                self._params, "gap_size", 0, False, "inf", True)
-            check_type_or_throw_except(self._params["far_cut"], 1, float, "")
-            check_type_or_throw_except(
-                self._params["neutralize"], 1, type(True), "")
-
-        def valid_keys(self):
-            return ["maxPWerror", "gap_size", "far_cut", "neutralize",
-                    "delta_mid_top", "delta_mid_bot", "const_pot", "pot_diff",
-                    "check_neutrality"]
-
-        def required_keys(self):
-            return ["maxPWerror", "gap_size"]
-
-        def default_params(self):
-            return {"maxPWerror": -1,
-                    "gap_size": -1,
-                    "far_cut": -1,
-                    "delta_mid_top": 0,
-                    "delta_mid_bot": 0,
-                    "const_pot": False,
-                    "pot_diff": 0.0,
-                    "neutralize": True,
-                    "check_neutrality": True}
-
-        def _get_params_from_es_core(self):
-            params = {}
-            params.update(elc_params)
-            return params
-
-        def _set_params_in_es_core(self):
-            if coulomb.method == COULOMB_P3M_GPU:
-                raise Exception(
-                    "ELC tuning failed, ELC is not set up to work with the GPU P3M")
-
-            if self._params["const_pot"]:
-                self._params["delta_mid_top"] = -1
-                self._params["delta_mid_bot"] = -1
-
-            if ELC_set_params(
-                self._params["maxPWerror"],
-                self._params["gap_size"],
-                self._params["far_cut"],
-                self._params["neutralize"],
-                self._params["delta_mid_top"],
-                self._params["delta_mid_bot"],
-                self._params["const_pot"],
-                    self._params["pot_diff"]):
-                handle_errors(
-                    "ELC tuning failed, ELC is not set up to work with the GPU P3M")
-
-        def _activate_method(self):
-            check_neutrality(self._params)
-            self._set_params_in_es_core()
-
-        def _deactivate_method(self):
-            raise Exception(
-                "Unable to remove ELC as the state of the underlying electrostatics method will remain unclear.")
-
     cdef class ICC(ElectrostaticExtensions):
         """
         Interface to the induced charge calculation scheme for dielectric
@@ -180,47 +62,35 @@ IF ELECTROSTATICS and P3M:
         sigmas : (``n_icc``, ) array_like :obj:`float`, optional
             Additional surface charge density in the absence of any charge
             induction.
-        epsilons : (``n_icc``, ) array_like :obj:`float`, optional
+        epsilons : (``n_icc``, ) array_like :obj:`float`
             Dielectric constant associated to the areas.
 
         """
 
         def validate_params(self):
-            default_params = self.default_params()
-
             check_type_or_throw_except(self._params["n_icc"], 1, int, "")
-            check_range_or_except(
-                self._params, "n_icc", 1, True, "inf", True)
+
+            check_type_or_throw_except(
+                self._params["first_id"], 1, int, "")
 
             check_type_or_throw_except(
                 self._params["convergence"], 1, float, "")
-            check_range_or_except(
-                self._params, "convergence", 0, False, "inf", True)
 
             check_type_or_throw_except(
                 self._params["relaxation"], 1, float, "")
-            check_range_or_except(
-                self._params, "relaxation", 0, False, "inf", True)
 
             check_type_or_throw_except(
                 self._params["ext_field"], 3, float, "")
 
             check_type_or_throw_except(
                 self._params["max_iterations"], 1, int, "")
-            check_range_or_except(
-                self._params, "max_iterations", 0, False, "inf", True)
-
-            check_type_or_throw_except(
-                self._params["first_id"], 1, int, "")
-            check_range_or_except(
-                self._params, "first_id", 0, True, "inf", True)
 
             check_type_or_throw_except(
                 self._params["eps_out"], 1, float, "")
 
             n_icc = self._params["n_icc"]
+            assert n_icc >= 0, "ICC: invalid number of particles"
 
-            # Required list input
             self._params["normals"] = np.array(self._params["normals"])
             if self._params["normals"].size != n_icc * 3:
                 raise ValueError(
@@ -231,18 +101,14 @@ IF ELECTROSTATICS and P3M:
             check_type_or_throw_except(
                 self._params["areas"], n_icc, float, "Error in area list.")
 
-            # Not Required
             if "sigmas" in self._params.keys():
                 check_type_or_throw_except(
                     self._params["sigmas"], n_icc, float, "Error in sigma list.")
             else:
                 self._params["sigmas"] = np.zeros(n_icc)
 
-            if "epsilons" in self._params.keys():
-                check_type_or_throw_except(
-                    self._params["epsilons"], n_icc, float, "Error in epsilon list.")
-            else:
-                self._params["epsilons"] = np.zeros(n_icc)
+            check_type_or_throw_except(
+                self._params["epsilons"], n_icc, float, "Error in epsilon list.")
 
         def valid_keys(self):
             return ["n_icc", "convergence", "relaxation", "ext_field",
@@ -250,92 +116,68 @@ IF ELECTROSTATICS and P3M:
                     "areas", "sigmas", "epsilons", "check_neutrality"]
 
         def required_keys(self):
-            return ["n_icc", "normals", "areas"]
+            return ["n_icc", "normals", "areas", "epsilons"]
 
         def default_params(self):
-            return {"n_icc": 0,
-                    "convergence": 1e-3,
+            return {"convergence": 1e-3,
                     "relaxation": 0.7,
                     "ext_field": [0, 0, 0],
                     "max_iterations": 100,
                     "first_id": 0,
-                    "esp_out": 1,
-                    "normals": [],
-                    "areas": [],
-                    "sigmas": [],
-                    "epsilons": [],
+                    "eps_out": 1,
                     "check_neutrality": True}
 
         def _get_params_from_es_core(self):
             params = {}
-            params["n_icc"] = iccp3m_cfg.n_ic
-
-            # Fill Lists
-            normals = []
-            areas = []
-            sigmas = []
-            epsilons = []
-            for i in range(iccp3m_cfg.n_ic):
-                normals.append([iccp3m_cfg.normals[i][0], iccp3m_cfg.normals[
-                               i][1], iccp3m_cfg.normals[i][2]])
-                areas.append(iccp3m_cfg.areas[i])
-                epsilons.append(iccp3m_cfg.ein[i])
-                sigmas.append(iccp3m_cfg.sigma[i])
-
-            params["normals"] = normals
-            params["areas"] = areas
-            params["epsilons"] = epsilons
-            params["sigmas"] = sigmas
-
-            params["ext_field"] = [iccp3m_cfg.ext_field[0],
-                                   iccp3m_cfg.ext_field[1], iccp3m_cfg.ext_field[2]]
-            params["first_id"] = iccp3m_cfg.first_id
-            params["max_iterations"] = iccp3m_cfg.num_iteration
-            params["convergence"] = iccp3m_cfg.convergence
-            params["relaxation"] = iccp3m_cfg.relax
-            params["eps_out"] = iccp3m_cfg.eout
+            params["n_icc"] = icc_cfg.n_icc
+            params["first_id"] = icc_cfg.first_id
+            params["max_iterations"] = icc_cfg.num_iteration
+            params["convergence"] = icc_cfg.convergence
+            params["relaxation"] = icc_cfg.relax
+            params["eps_out"] = icc_cfg.eout
+            params["normals"] = make_array_locked_vector(icc_cfg.normals)
+            params["areas"] = array_locked(icc_cfg.areas)
+            params["epsilons"] = array_locked(icc_cfg.ein)
+            params["sigmas"] = array_locked(icc_cfg.sigma)
+            params["ext_field"] = make_array_locked(icc_cfg.ext_field)
 
             return params
 
         def _set_params_in_es_core(self):
-            # First set number of icc particles
-            iccp3m_cfg.n_ic = self._params["n_icc"]
-            # Allocate ICC lists
-            iccp3m_alloc_lists()
-
-            # Fill Lists
-            for i in range(iccp3m_cfg.n_ic):
-                iccp3m_cfg.normals[i][0] = self._params["normals"][i][0]
-                iccp3m_cfg.normals[i][1] = self._params["normals"][i][1]
-                iccp3m_cfg.normals[i][2] = self._params["normals"][i][2]
-
-                iccp3m_cfg.areas[i] = self._params["areas"][i]
-                iccp3m_cfg.ein[i] = self._params["epsilons"][i]
-                iccp3m_cfg.sigma[i] = self._params["sigmas"][i]
-
-            iccp3m_cfg.ext_field[0] = self._params["ext_field"][0]
-            iccp3m_cfg.ext_field[1] = self._params["ext_field"][1]
-            iccp3m_cfg.ext_field[2] = self._params["ext_field"][2]
-            iccp3m_cfg.first_id = self._params["first_id"]
-            iccp3m_cfg.num_iteration = self._params["max_iterations"]
-            iccp3m_cfg.convergence = self._params["convergence"]
-            iccp3m_cfg.relax = self._params["relaxation"]
-            iccp3m_cfg.eout = self._params["eps_out"]
-
-            # Broadcasts vars
-            mpi_iccp3m_init()
+            cdef Vector3d ext_field = make_Vector3d(self._params["ext_field"])
+            cdef vector[double] areas, e_in, sigma
+            cdef vector[Vector3d] normals
+            areas.resize(self._params["n_icc"])
+            e_in.resize(self._params["n_icc"])
+            sigma.resize(self._params["n_icc"])
+            normals.resize(self._params["n_icc"])
+
+            for i in range(self._params["n_icc"]):
+                areas[i] = self._params["areas"][i]
+                e_in[i] = self._params["epsilons"][i]
+                sigma[i] = self._params["sigmas"][i]
+
+                for j in range(3):
+                    normals[i][j] = self._params["normals"][i][j]
+
+            icc_set_params(self._params["n_icc"],
+                           self._params["convergence"],
+                           self._params["relaxation"],
+                           ext_field,
+                           self._params["max_iterations"],
+                           self._params["first_id"],
+                           self._params["eps_out"],
+                           areas,
+                           e_in,
+                           sigma,
+                           normals)
 
         def _activate_method(self):
             check_neutrality(self._params)
             self._set_params_in_es_core()
 
         def _deactivate_method(self):
-            iccp3m_cfg.n_ic = 0
-            # Allocate ICC lists
-            iccp3m_alloc_lists()
-
-            # Broadcasts vars
-            mpi_iccp3m_init()
+            icc_deactivate()
 
         def last_iterations(self):
             """
@@ -348,4 +190,4 @@ IF ELECTROSTATICS and P3M:
                 Number of iterations
 
             """
-            return iccp3m_cfg.citeration
+            return icc_cfg.citeration
diff --git a/src/python/espressomd/electrostatics.pxd b/src/python/espressomd/electrostatics.pxd
index b6c1452a41c..7f9d5e90651 100644
--- a/src/python/espressomd/electrostatics.pxd
+++ b/src/python/espressomd/electrostatics.pxd
@@ -18,8 +18,7 @@
 #
 
 include "myconfig.pxi"
-from .utils import is_valid_type, to_str, handle_errors
-from .utils cimport handle_errors
+from .utils import is_valid_type, to_str
 from libcpp cimport bool
 
 cdef extern from "SystemInterface.hpp":
@@ -68,10 +67,10 @@ IF ELECTROSTATICS:
         from p3m_common cimport P3MParameters
 
         cdef extern from "electrostatics_magnetostatics/p3m.hpp":
-            int p3m_set_params(double r_cut, int * mesh, int cao, double alpha, double accuracy)
-            void p3m_set_tune_params(double r_cut, int mesh[3], int cao, double alpha, double accuracy)
-            int p3m_set_mesh_offset(double x, double y, double z)
-            int p3m_set_eps(double eps)
+            void p3m_set_params(double r_cut, int * mesh, int cao, double alpha, double accuracy) except +
+            void p3m_set_tune_params(double r_cut, int mesh[3], int cao, double accuracy)
+            void p3m_set_mesh_offset(double x, double y, double z) except +
+            void p3m_set_eps(double eps)
             int p3m_adaptive_tune(bool verbose)
 
             ctypedef struct p3m_data_struct:
@@ -82,75 +81,25 @@ IF ELECTROSTATICS:
 
         IF CUDA:
             cdef extern from "electrostatics_magnetostatics/p3m_gpu.hpp":
-                void p3m_gpu_init(int cao, int * mesh, double alpha)
-
-            cdef inline python_p3m_gpu_init(params):
-                cdef int cao
-                cdef int mesh[3]
-                cdef double alpha
-                cao = params["cao"]
-                # Mesh can be specified as single int, but here, an array is
-                # needed
-                if not hasattr(params["mesh"], "__getitem__"):
-                    for i in range(3):
-                        mesh[i] = params["mesh"]
-                else:
-                    mesh = params["mesh"]
-                alpha = params["alpha"]
-                p3m_gpu_init(cao, mesh, alpha)
-                handle_errors("python_p3m_gpu_init")
-
-        cdef inline python_p3m_set_mesh_offset(mesh_off):
-            cdef double mesh_offset[3]
-            mesh_offset[0] = mesh_off[0]
-            mesh_offset[1] = mesh_off[1]
-            mesh_offset[2] = mesh_off[2]
-            return p3m_set_mesh_offset(
-                mesh_offset[0], mesh_offset[1], mesh_offset[2])
-
-        cdef inline python_p3m_adaptive_tune(bool verbose):
-            cdef int response = p3m_adaptive_tune(verbose)
-            if response:
-                handle_errors("python_p3m_adaptive_tune")
-
-        cdef inline python_p3m_set_params(p_r_cut, p_mesh, p_cao, p_alpha, p_accuracy):
-            cdef int mesh[3]
-            cdef double r_cut
-            cdef int cao
-            cdef double alpha
-            cdef double accuracy
-            r_cut = p_r_cut
-            cao = p_cao
-            alpha = p_alpha
-            accuracy = p_accuracy
-            if is_valid_type(p_mesh, int):
-                mesh[0] = p_mesh
-                mesh[1] = p_mesh
-                mesh[2] = p_mesh
-            else:
-                mesh = p_mesh
-
-            return p3m_set_params(r_cut, mesh, cao, alpha, accuracy)
-
-        cdef inline python_p3m_set_tune_params(p_r_cut, p_mesh, p_cao, p_alpha, p_accuracy):
-            cdef int mesh[3]
-            cdef double r_cut
-            cdef int cao
-            cdef double alpha
-            cdef double accuracy
-            r_cut = p_r_cut
-            cao = p_cao
-            alpha = p_alpha
-            accuracy = p_accuracy
-
-            if is_valid_type(p_mesh, int):
-                mesh[0] = p_mesh
-                mesh[1] = p_mesh
-                mesh[2] = p_mesh
-            else:
-                mesh = p_mesh
-
-            p3m_set_tune_params(r_cut, mesh, cao, alpha, accuracy)
+                void p3m_gpu_init(int cao, int * mesh, double alpha) except +
+
+        cdef extern from "electrostatics_magnetostatics/elc.hpp":
+            ctypedef struct ELC_struct:
+                double maxPWerror
+                double gap_size
+                double far_cut
+                bool neutralize
+                double delta_mid_top
+                double delta_mid_bot
+                bool const_pot
+                double pot_diff
+
+            int ELC_set_params(double maxPWerror, double min_dist, double far_cut,
+                               bool neutralize, double delta_mid_top,
+                               double delta_mid_bot, bool const_pot, double pot_diff)
+
+            # links the internal C struct with the Python object
+            ELC_struct elc_params
 
     cdef extern from "electrostatics_magnetostatics/debye_hueckel.hpp":
         ctypedef struct Debye_hueckel_params:
@@ -186,15 +135,6 @@ IF ELECTROSTATICS:
         int MMM1D_init()
         int mmm1d_tune(bool verbose)
 
-    cdef inline pyMMM1D_tune(bool verbose):
-        cdef int resp
-        resp = MMM1D_init()
-        if resp:
-            handle_errors("pyMMM1D_tune")
-        resp = mmm1d_tune(verbose)
-        if resp:
-            handle_errors("pyMMM1D_tune")
-
 IF ELECTROSTATICS and MMM1D_GPU:
 
     cdef extern from "actor/Mmm1dgpuForce.hpp":
diff --git a/src/python/espressomd/electrostatics.pyx b/src/python/espressomd/electrostatics.pyx
index 0e9b1491be0..f1af5f2dcbe 100644
--- a/src/python/espressomd/electrostatics.pyx
+++ b/src/python/espressomd/electrostatics.pyx
@@ -24,11 +24,12 @@ import numpy as np
 IF SCAFACOS == 1:
     from .scafacos import ScafacosConnector
     from . cimport scafacos
-from .utils cimport handle_errors
-from .utils import is_valid_type, check_type_or_throw_except, to_str
+from .utils import is_valid_type, check_type_or_throw_except, to_str, handle_errors
+from .utils cimport check_range_or_except
 from . cimport checks
 from .analyze cimport partCfg, PartCfg
 from .particle_data cimport particle
+import sys
 
 
 IF ELECTROSTATICS == 1:
@@ -192,92 +193,21 @@ IF ELECTROSTATICS:
 
 
 IF P3M == 1:
-    cdef class P3M(ElectrostaticInteraction):
-        """
-        P3M electrostatics solver.
-
-        Particle--Particle--Particle--Mesh (P3M) is a Fourier-based Ewald
-        summation method to calculate potentials in N-body simulation.
-        See :ref:`Coulomb P3M` for more details.
-
-        Parameters
-        ----------
-        prefactor : :obj:`float`
-            Electrostatics prefactor (see :eq:`coulomb_prefactor`).
-        accuracy : :obj:`float`
-            P3M tunes its parameters to provide this target accuracy.
-        alpha : :obj:`float`, optional
-            The Ewald parameter.
-        cao : :obj:`float`, optional
-            The charge-assignment order, an integer between 0 and 7.
-        epsilon : :obj:`float` or :obj:`str`, optional
-            A positive number for the dielectric constant of the
-            surrounding medium. Use ``'metallic'`` to set the dielectric
-            constant of the surrounding medium to infinity (default).
-        mesh : :obj:`int` or (3,) array_like of :obj:`int`, optional
-            The number of mesh points in x, y and z direction. Use a single
-            value for cubic boxes.
-        r_cut : :obj:`float`, optional
-            The real space cutoff.
-        tune : :obj:`bool`, optional
-            Used to activate/deactivate the tuning method on activation.
-            Defaults to ``True``.
-        check_neutrality : :obj:`bool`, optional
-            Raise a warning if the system is not electrically neutral when
-            set to ``True`` (default).
-
-        """
-
-        def __init__(self, *args, **kwargs):
-            super().__init__(*args, **kwargs)
+    cdef class _P3MBase(ElectrostaticInteraction):
 
-        def validate_params(self):
-            default_params = self.default_params()
-            if not (self._params["prefactor"] > 0.0):
-                raise ValueError("prefactor should be a positive float")
-
-            if not (self._params["r_cut"] >= 0
-                    or self._params["r_cut"] == default_params["r_cut"]):
-                raise ValueError("P3M r_cut has to be >=0")
-
-            if is_valid_type(self._params["mesh"], int):
-                if self._params["mesh"] % 2 != 0 and self._params["mesh"] != -1:
-                    raise ValueError(
-                        "P3M requires an even number of mesh points in all directions")
+        cdef _check_and_copy_mesh_size(self, int mesh[3], pmesh):
+            if is_valid_type(pmesh, int):
+                pmesh = 3 * [pmesh]
             else:
-                check_type_or_throw_except(self._params["mesh"], 3, int,
-                                           "P3M mesh has to be an integer or integer list of length 3")
-                if (self._params["mesh"][0] % 2 != 0 and self._params["mesh"][0] != -1) or \
-                   (self._params["mesh"][1] % 2 != 0 and self._params["mesh"][1] != -1) or \
-                   (self._params["mesh"][2] % 2 != 0 and self._params["mesh"][2] != -1):
-                    raise ValueError(
-                        "P3M requires an even number of mesh points in all directions")
-
-            if not (self._params["cao"] >= -1 and self._params["cao"] <= 7):
-                raise ValueError(
-                    "P3M cao has to be an integer between -1 and 7")
-
-            if self._params["tune"] and not (self._params["accuracy"] >= 0):
-                raise ValueError("P3M accuracy has to be positive")
-
-            if self._params["epsilon"] == "metallic":
-                self._params["epsilon"] = 0.0
-
-            check_type_or_throw_except(
-                self._params["epsilon"], 1, float,
-                "epsilon should be a double or 'metallic'")
-
-            if self._params["mesh_off"] != default_params["mesh_off"]:
-                check_type_or_throw_except(self._params["mesh_off"], 3, float,
-                                           "mesh_off should be a (3,) array_like of values between 0.0 and 1.0")
-
-            if not (self._params["alpha"] == default_params["alpha"]
-                    or self._params["alpha"] > 0):
-                raise ValueError("alpha should be positive")
+                check_type_or_throw_except(
+                    pmesh, 3, int, "mesh size must be 3 ints")
+            for i in range(3):
+                mesh[i] = pmesh[i]
 
         def valid_keys(self):
             return ["mesh", "cao", "accuracy", "epsilon", "alpha", "r_cut",
-                    "prefactor", "tune", "check_neutrality", "verbose"]
+                    "prefactor", "tune", "check_neutrality", "verbose",
+                    "mesh_off"]
 
         def required_keys(self):
             return ["prefactor", "accuracy"]
@@ -301,19 +231,18 @@ IF P3M == 1:
             params["tune"] = self._params["tune"]
             return params
 
-        def _set_params_in_es_core(self):
-            # Sets lb, bcast, resets vars to zero if lb=0
+        def _tune(self):
+            cdef int mesh[3]
+            self._check_and_copy_mesh_size(mesh, self._params["mesh"])
+
             set_prefactor(self._params["prefactor"])
-            # Sets cdef vars and calls p3m_set_params() in core
-            python_p3m_set_params(self._params["r_cut"],
-                                  self._params["mesh"], self._params["cao"],
-                                  self._params["alpha"], self._params["accuracy"])
-            # p3m_set_params()  -> set r_cuts, mesh, cao, validates sanity, bcasts
-            # Careful: bcast calls on_coulomb_change(), which calls p3m_init(),
-            #         which resets r_cut if lb is zero. OK.
-            # Sets eps, bcast
             p3m_set_eps(self._params["epsilon"])
-            python_p3m_set_mesh_offset(self._params["mesh_off"])
+            p3m_set_tune_params(self._params["r_cut"], mesh,
+                                self._params["cao"], self._params["accuracy"])
+            tuning_error = p3m_adaptive_tune(self._params["verbose"])
+            if tuning_error:
+                handle_errors("P3M: tuning failed")
+            self._params.update(self._get_params_from_es_core())
 
         def tune(self, **tune_params_subset):
             # update the three necessary parameters if not provided by the user
@@ -324,25 +253,97 @@ IF P3M == 1:
 
             super().tune(**tune_params_subset)
 
-        def _tune(self):
+        def _set_params_in_es_core(self):
+            cdef int mesh[3]
+            self._check_and_copy_mesh_size(mesh, self._params["mesh"])
+
             set_prefactor(self._params["prefactor"])
+            # Sets p3m parameters
+            # p3m_set_params() -> set parameters and bcasts
+            # Careful: calls on_coulomb_change(), which calls p3m_init(),
+            #          which resets r_cut if prefactor=0
+            p3m_set_params(self._params["r_cut"], mesh, self._params["cao"],
+                           self._params["alpha"], self._params["accuracy"])
+            # Sets eps, bcast
             p3m_set_eps(self._params["epsilon"])
-            python_p3m_set_tune_params(self._params["r_cut"],
-                                       self._params["mesh"],
-                                       self._params["cao"],
-                                       -1.0,
-                                       self._params["accuracy"])
-            python_p3m_adaptive_tune(self._params["verbose"])
-            self._params.update(self._get_params_from_es_core())
+            p3m_set_mesh_offset(self._params["mesh_off"][0],
+                                self._params["mesh_off"][1],
+                                self._params["mesh_off"][2])
+
+        def validate_params(self):
+            default_params = self.default_params()
+            if not (self._params["prefactor"] > 0.0):
+                raise ValueError("prefactor should be a positive float")
+
+            if is_valid_type(self._params["mesh"], int):
+                if self._params["mesh"] % 2 != 0 and self._params["mesh"] != -1:
+                    raise ValueError(
+                        "P3M requires an even number of mesh points in all directions")
+            else:
+                check_type_or_throw_except(self._params["mesh"], 3, int,
+                                           "P3M mesh has to be an integer or integer list of length 3")
+                if (self._params["mesh"][0] % 2 != 0 and self._params["mesh"][0] != -1) or \
+                   (self._params["mesh"][1] % 2 != 0 and self._params["mesh"][1] != -1) or \
+                   (self._params["mesh"][2] % 2 != 0 and self._params["mesh"][2] != -1):
+                    raise ValueError(
+                        "P3M requires an even number of mesh points in all directions")
+
+            if self._params["epsilon"] == "metallic":
+                self._params["epsilon"] = 0.0
+
+            check_type_or_throw_except(
+                self._params["epsilon"], 1, float,
+                "epsilon should be a double or 'metallic'")
+
+            if self._params["mesh_off"] != default_params["mesh_off"]:
+                check_type_or_throw_except(self._params["mesh_off"], 3, float,
+                                           "mesh_off should be a (3,) array_like of values between 0.0 and 1.0")
+
+    cdef class P3M(_P3MBase):
+        """
+        P3M electrostatics solver.
+
+        Particle--Particle--Particle--Mesh (P3M) is a Fourier-based Ewald
+        summation method to calculate potentials in N-body simulation.
+        See :ref:`Coulomb P3M` for more details.
+
+        Parameters
+        ----------
+        prefactor : :obj:`float`
+            Electrostatics prefactor (see :eq:`coulomb_prefactor`).
+        accuracy : :obj:`float`
+            P3M tunes its parameters to provide this target accuracy.
+        alpha : :obj:`float`, optional
+            The Ewald parameter.
+        cao : :obj:`int`, optional
+            The charge-assignment order, an integer between 0 and 7.
+        epsilon : :obj:`float` or :obj:`str`, optional
+            A positive number for the dielectric constant of the
+            surrounding medium. Use ``'metallic'`` to set the dielectric
+            constant of the surrounding medium to infinity (default).
+        mesh : :obj:`int` or (3,) array_like of :obj:`int`, optional
+            The number of mesh points in x, y and z direction. Use a single
+            value for cubic boxes.
+        r_cut : :obj:`float`, optional
+            The real space cutoff.
+        tune : :obj:`bool`, optional
+            Used to activate/deactivate the tuning method on activation.
+            Defaults to ``True``.
+        check_neutrality : :obj:`bool`, optional
+            Raise a warning if the system is not electrically neutral when
+            set to ``True`` (default).
+
+        """
 
         def _activate_method(self):
             check_neutrality(self._params)
             if self._params["tune"]:
                 self._tune()
             self._set_params_in_es_core()
+            handle_errors("P3M: initialization failed")
 
     IF CUDA:
-        cdef class P3MGPU(ElectrostaticInteraction):
+        cdef class P3MGPU(_P3MBase):
             """
             P3M electrostatics solver with GPU support.
 
@@ -378,114 +379,165 @@ IF P3M == 1:
 
             """
 
-            def __init__(self, *args, **kwargs):
-                super().__init__(*args, **kwargs)
+            def _activate_method(self):
+                """
+                Initialize the GPU solver, optionally tune the P3M
+                parameters, then send the final parameters to the core.
+                """
+                cdef int mesh[3]
+                self._check_and_copy_mesh_size(mesh, self._params["mesh"])
 
-            def validate_params(self):
-                default_params = self.default_params()
+                check_neutrality(self._params)
+                p3m_gpu_init(self._params["cao"], mesh, self._params["alpha"])
+                handle_errors("P3M: GPU initialization failed")
+                coulomb.method = COULOMB_P3M_GPU
+                if self._params["tune"]:
+                    self._tune()
+                    # _tune() overwrites self._params with the values read
+                    # back from the core, so the local copy must be refreshed
+                    self._check_and_copy_mesh_size(mesh, self._params["mesh"])
+                p3m_gpu_init(self._params["cao"], mesh, self._params["alpha"])
+                handle_errors("P3M: GPU initialization failed")
+                self._set_params_in_es_core()
 
-                if not (self._params["r_cut"] >= 0
-                        or self._params["r_cut"] == default_params["r_cut"]):
-                    raise ValueError("P3M r_cut has to be >=0")
+            def _set_params_in_es_core(self):
+                super()._set_params_in_es_core()
+                handle_errors("P3M: initialization failed")
 
-                if is_valid_type(self._params["mesh"], int):
-                    if self._params["mesh"] % 2 != 0 and self._params["mesh"] != -1:
-                        raise ValueError(
-                            "P3M requires an even number of mesh points in all directions")
-                else:
-                    check_type_or_throw_except(self._params["mesh"], 3, int,
-                                               "P3M mesh has to be an integer or integer list of length 3")
-                    if (self._params["mesh"][0] % 2 != 0 and self._params["mesh"][0] != -1) or \
-                       (self._params["mesh"][1] % 2 != 0 and self._params["mesh"][1] != -1) or \
-                       (self._params["mesh"][2] % 2 != 0 and self._params["mesh"][2] != -1):
-                        raise ValueError(
-                            "P3M requires an even number of mesh points in all directions")
-
-                if not (self._params["cao"] >= -1
-                        and self._params["cao"] <= 7):
-                    raise ValueError(
-                        "P3M cao has to be an integer between -1 and 7")
+    cdef class ELC(ElectrostaticInteraction):
+        """
+        Electrostatics solver for systems with two periodic dimensions.
+        See :ref:`Electrostatic Layer Correction (ELC)` for more details.
 
-                if not (self._params["accuracy"] >= 0):
-                    raise ValueError("P3M accuracy has to be positive")
+        Parameters
+        ----------
+        p3m_actor : :obj:`P3M`, required
+            Base P3M actor.
+        gap_size : :obj:`float`, required
+            The gap size gives the height :math:`h` of the empty region between
+            the system box and the neighboring artificial images. |es| checks
+            that the gap is empty and will throw an error if it isn't. Therefore
+            you should really make sure that the gap region is empty (e.g.
+            with wall constraints).
+        maxPWerror : :obj:`float`, required
+            The maximal pairwise error sets the least upper bound (LUB) error
+            of the force between any two charges without prefactors (see the
+            papers). The algorithm tries to find parameters to meet this LUB
+            requirement or will throw an error if there are none.
+        delta_mid_top : :obj:`float`, optional
+            Dielectric contrast :math:`\\Delta_t` between the upper boundary
+            and the simulation box.
+        delta_mid_bot : :obj:`float`, optional
+            Dielectric contrast :math:`\\Delta_b` between the lower boundary
+            and the simulation box.
+        const_pot : :obj:`bool`, optional
+            Activate a constant electric potential between the top and bottom
+            of the simulation box.
+        pot_diff : :obj:`float`, optional
+            If ``const_pot`` is enabled, this parameter controls the applied
+            voltage between the boundaries of the simulation box in the
+            *z*-direction (at :math:`z = 0` and :math:`z = L_z - h`).
+        neutralize : :obj:`bool`, optional
+            By default, *ELC* just as P3M adds a homogeneous neutralizing
+            background to the system in case of a net charge. However, unlike
+            in three dimensions, this background adds a parabolic potential
+            across the slab :cite:`ballenegger09a`. Therefore, under normal
+            circumstances, you will probably want to disable the neutralization
+            for non-neutral systems. This corresponds then to a formal
+            regularization of the forces and energies :cite:`ballenegger09a`.
+            Also, if you add neutralizing walls explicitly as constraints, you
+            have to disable the neutralization. When using a dielectric
+            contrast or full metallic walls (``delta_mid_top != 0`` or
+            ``delta_mid_bot != 0`` or ``const_pot=True``), ``neutralize`` is
+            overwritten and switched off internally. Note that the special
+            case of non-neutral systems with a *non-metallic* dielectric jump
+            (e.g. ``delta_mid_top`` or ``delta_mid_bot`` in ``]-1,1[``) is not
+            covered by the algorithm and will throw an error.
+        far_cut : :obj:`float`, optional
+            Cutoff radius, use with care, intended for testing purposes. When
+            setting the cutoff directly, the maximal pairwise error is ignored.
+        """
 
-                if self._params["epsilon"] == "metallic":
-                    self._params["epsilon"] = 0.0
+        def validate_params(self):
+            # P3M
+            if CUDA:
+                if isinstance(self._params["p3m_actor"], P3MGPU):
+                    raise ValueError(
+                        "ELC is not set up to work with the GPU P3M")
+            check_type_or_throw_except(
+                self._params["p3m_actor"], 1, getattr(
+                    sys.modules[__name__], "P3M"),
+                "p3m_actor has to be a P3M solver")
+            self._params["p3m_actor"]._params["epsilon"] = 0.0
+            self._params["p3m_actor"].validate_params()
+            # ELC
+            check_type_or_throw_except(
+                self._params["maxPWerror"], 1, float,
+                "maxPWerror has to be a float")
+            check_range_or_except(
+                self._params, "maxPWerror", 0, False, "inf", True)
+            check_type_or_throw_except(self._params["gap_size"], 1, float,
+                                       "gap_size has to be a float")
+            check_range_or_except(
+                self._params, "gap_size", 0, False, "inf", True)
+            check_type_or_throw_except(self._params["far_cut"], 1, float,
+                                       "far_cut has to be a float")
+            check_type_or_throw_except(
+                self._params["neutralize"], 1, type(True),
+                "neutralize has to be a bool")
 
-                check_type_or_throw_except(
-                    self._params["epsilon"], 1, float,
-                    "epsilon should be a double or 'metallic'")
+        def valid_keys(self):
+            return ["p3m_actor", "maxPWerror", "gap_size", "far_cut",
+                    "neutralize", "delta_mid_top", "delta_mid_bot",
+                    "const_pot", "pot_diff", "check_neutrality"]
 
-                if self._params["mesh_off"] != default_params["mesh_off"]:
-                    check_type_or_throw_except(self._params["mesh_off"], 3, float,
-                                               "mesh_off should be a (3,) array_like of values between 0.0 and 1.0")
+        def required_keys(self):
+            return ["p3m_actor", "maxPWerror", "gap_size"]
 
-            def valid_keys(self):
-                return ["mesh", "cao", "accuracy", "epsilon", "alpha", "r_cut",
-                        "prefactor", "tune", "check_neutrality", "verbose"]
+        def default_params(self):
+            return {"maxPWerror": -1,
+                    "gap_size": -1,
+                    "far_cut": -1,
+                    "delta_mid_top": 0,
+                    "delta_mid_bot": 0,
+                    "const_pot": False,
+                    "pot_diff": 0.0,
+                    "neutralize": True,
+                    "check_neutrality": True}
 
-            def required_keys(self):
-                return ["prefactor", "accuracy"]
+        def _get_params_from_es_core(self):
+            params = {}
+            params.update(elc_params)
+            params["p3m_actor"] = self._params["p3m_actor"]
+            return params
 
-            def default_params(self):
-                return {"cao": 0,
-                        "r_cut": -1,
-                        "alpha": 0,
-                        "accuracy": 0,
-                        "mesh": [0, 0, 0],
-                        "epsilon": 0.0,
-                        "mesh_off": [-1, -1, -1],
-                        "tune": True,
-                        "check_neutrality": True,
-                        "verbose": True}
-
-            def _get_params_from_es_core(self):
-                params = {}
-                params.update(p3m.params)
-                params["prefactor"] = coulomb.prefactor
-                params["tune"] = self._params["tune"]
-                return params
-
-            def tune(self, **tune_params_subset):
-                # update the three necessary parameters if not provided by the
-                # user
-                default_params = self.default_params()
-                for key in ["r_cut", "mesh", "cao"]:
-                    if key not in tune_params_subset:
-                        tune_params_subset[key] = default_params[key]
-
-                super().tune(**tune_params_subset)
-
-            def _tune(self):
-                set_prefactor(self._params["prefactor"])
-                p3m_set_eps(self._params["epsilon"])
-                python_p3m_set_tune_params(self._params["r_cut"],
-                                           self._params["mesh"],
-                                           self._params["cao"],
-                                           -1.0,
-                                           self._params["accuracy"])
-                python_p3m_adaptive_tune(self._params["verbose"])
-                self._params.update(self._get_params_from_es_core())
+        def _set_params_in_es_core(self):
+            self._params["p3m_actor"]._set_params_in_es_core()
+            if coulomb.method == COULOMB_P3M_GPU:
+                raise Exception("ELC is not set up to work with the GPU P3M")
+
+            if self._params["const_pot"]:
+                self._params["delta_mid_top"] = -1
+                self._params["delta_mid_bot"] = -1
+
+            if ELC_set_params(
+                self._params["maxPWerror"],
+                self._params["gap_size"],
+                self._params["far_cut"],
+                self._params["neutralize"],
+                self._params["delta_mid_top"],
+                self._params["delta_mid_bot"],
+                self._params["const_pot"],
+                    self._params["pot_diff"]):
+                handle_errors("ELC tuning failed")
 
-            def _activate_method(self):
-                check_neutrality(self._params)
-                python_p3m_gpu_init(self._params)
-                coulomb.method = COULOMB_P3M_GPU
-                if self._params["tune"]:
-                    self._tune()
-                python_p3m_gpu_init(self._params)
-                self._set_params_in_es_core()
+        def tune(self, **tune_params_subset):
+            self._params["p3m_actor"].tune(**tune_params_subset)
 
-            def _set_params_in_es_core(self):
-                set_prefactor(self._params["prefactor"])
-                python_p3m_set_params(self._params["r_cut"],
-                                      self._params["mesh"],
-                                      self._params["cao"],
-                                      self._params["alpha"],
-                                      self._params["accuracy"])
-                p3m_set_eps(self._params["epsilon"])
-                python_p3m_set_mesh_offset(self._params["mesh_off"])
-                handle_errors("p3m gpu init")
+        def _activate_method(self):
+            self._params["p3m_actor"]._activate_method()
+            check_neutrality(self._params)
+            self._set_params_in_es_core()
 
 IF ELECTROSTATICS:
     cdef class MMM1D(ElectrostaticInteraction):
@@ -550,8 +595,12 @@ IF ELECTROSTATICS:
                 self._params["far_switch_radius"], self._params["maxPWerror"])
 
         def _tune(self):
-            cdef int resp
-            pyMMM1D_tune(self._params["verbose"])
+            resp = MMM1D_init()
+            if resp:
+                handle_errors("MMM1D: initialization failed")
+            resp = mmm1d_tune(self._params["verbose"])
+            if resp:
+                handle_errors("MMM1D: tuning failed")
             self._params.update(self._get_params_from_es_core())
 
         def _activate_method(self):
diff --git a/src/python/espressomd/globals.pyx b/src/python/espressomd/globals.pyx
index 20ca445af16..3321d27e9cb 100644
--- a/src/python/espressomd/globals.pyx
+++ b/src/python/espressomd/globals.pyx
@@ -24,8 +24,8 @@ from .globals cimport sim_time
 from .globals cimport timing_samples
 from .globals cimport forcecap_set
 from .globals cimport forcecap_get
-from .utils import array_locked
-from .utils cimport Vector3d, make_array_locked, handle_errors
+from .utils import array_locked, handle_errors
+from .utils cimport Vector3d, make_array_locked
 
 cdef class Globals:
     property box_l:
diff --git a/src/python/espressomd/integrate.pxd b/src/python/espressomd/integrate.pxd
index 3869f0dfcc5..4b290011588 100644
--- a/src/python/espressomd/integrate.pxd
+++ b/src/python/espressomd/integrate.pxd
@@ -42,18 +42,12 @@ IF NPT:
                                               cbool xdir_rescale, cbool ydir_rescale,
                                               cbool zdir_rescale, cbool cubic_box) except +
 
-cdef extern from "stokesian_dynamics/sd_interface.hpp":
-    IF STOKESIAN_DYNAMICS:
+IF STOKESIAN_DYNAMICS:
+    cdef extern from "stokesian_dynamics/sd_interface.hpp":
         void set_sd_viscosity(double eta) except +
-        double get_sd_viscosity()
-
         void set_sd_radius_dict(const unordered_map[int, double] & radius_dict) except +
-        unordered_map[int, double] get_sd_radius_dict()
-
         void set_sd_flags(int flg)
-        int get_sd_flags()
 
-IF STOKESIAN_DYNAMICS:
     cpdef enum flags:
         NONE = 0,
         SELF_MOBILITY = 1 << 0,
diff --git a/src/python/espressomd/integrate.pyx b/src/python/espressomd/integrate.pyx
index 7f25708d82e..e0abd70ab91 100644
--- a/src/python/espressomd/integrate.pyx
+++ b/src/python/espressomd/integrate.pyx
@@ -18,8 +18,8 @@
 #
 from cpython.exc cimport PyErr_CheckSignals, PyErr_SetInterrupt
 include "myconfig.pxi"
-from .utils cimport handle_errors, check_type_or_throw_except
-from .utils import to_char_pointer
+from .utils cimport check_type_or_throw_except
+from .utils import to_char_pointer, handle_errors
 from . cimport integrate
 
 cdef class IntegratorHandle:
diff --git a/src/python/espressomd/magnetostatic_extensions.pyx b/src/python/espressomd/magnetostatic_extensions.pyx
index 872504df0b9..e42eb695a05 100644
--- a/src/python/espressomd/magnetostatic_extensions.pyx
+++ b/src/python/espressomd/magnetostatic_extensions.pyx
@@ -20,7 +20,8 @@
 from . cimport utils
 include "myconfig.pxi"
 from .actors import Actor
-from .utils cimport handle_errors, check_range_or_except, check_type_or_throw_except
+from .utils import handle_errors
+from .utils cimport check_range_or_except, check_type_or_throw_except
 
 IF DIPOLES and DP3M:
     class MagnetostaticExtension(Actor):
@@ -41,8 +42,11 @@ IF DIPOLES and DP3M:
         Parameters
         ----------
         gap_size : :obj:`float`
-            Size of the empty gap. Note that DLC relies on the user to make
-            sure that this condition is fulfilled.
+            The gap size gives the height :math:`h` of the empty region between
+            the system box and the neighboring artificial images. |es| checks
+            that the gap is empty and will throw an error if it isn't. Therefore
+            you should really make sure that the gap region is empty (e.g.
+            with wall constraints).
         maxPWerror : :obj:`float`
             Maximal pairwise error of the potential and force.
         far_cut : :obj:`float`, optional
@@ -56,13 +60,18 @@ IF DIPOLES and DP3M:
             """
             default_params = self.default_params()
             check_type_or_throw_except(
-                self._params["maxPWerror"], 1, float, "")
+                self._params["maxPWerror"], 1, float,
+                "maxPWerror has to be a float")
             check_range_or_except(
                 self._params, "maxPWerror", 0, False, "inf", True)
-            check_type_or_throw_except(self._params["gap_size"], 1, float, "")
+            check_type_or_throw_except(
+                self._params["gap_size"], 1, float,
+                "gap_size has to be a float")
             check_range_or_except(
                 self._params, "gap_size", 0, False, "inf", True)
-            check_type_or_throw_except(self._params["far_cut"], 1, float, "")
+            check_type_or_throw_except(
+                self._params["far_cut"], 1, float,
+                "far_cut has to be a float")
 
         def valid_keys(self):
             return ["maxPWerror", "gap_size", "far_cut"]
diff --git a/src/python/espressomd/magnetostatics.pxd b/src/python/espressomd/magnetostatics.pxd
index 166f0cf3307..c59d9979bbb 100644
--- a/src/python/espressomd/magnetostatics.pxd
+++ b/src/python/espressomd/magnetostatics.pxd
@@ -16,7 +16,6 @@
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 
 from libcpp cimport bool
-from .utils cimport handle_errors
 
 include "myconfig.pxi"
 
@@ -63,19 +62,14 @@ IF DP3M == 1:
     from p3m_common cimport P3MParameters
 
     cdef extern from "electrostatics_magnetostatics/p3m-dipolar.hpp":
-        int dp3m_set_params(double r_cut, int mesh, int cao, double alpha, double accuracy)
-        void dp3m_set_tune_params(double r_cut, int mesh, int cao, double alpha, double accuracy)
-        int dp3m_set_mesh_offset(double x, double y, double z)
-        int dp3m_set_eps(double eps)
+        void dp3m_set_params(double r_cut, int mesh, int cao, double alpha, double accuracy) except +
+        void dp3m_set_tune_params(double r_cut, int mesh, int cao, double accuracy)
+        void dp3m_set_mesh_offset(double x, double y, double z) except +
+        void dp3m_set_eps(double eps)
         int dp3m_adaptive_tune(bool verbose)
-        int dp3m_deactivate()
+        void dp3m_deactivate()
 
         ctypedef struct dp3m_data_struct:
             P3MParameters params
 
         cdef extern dp3m_data_struct dp3m
-
-    cdef inline python_dp3m_adaptive_tune(bool verbose):
-        cdef int response = dp3m_adaptive_tune(verbose)
-        if response:
-            handle_errors("python_dp3m_adaptive_tune")
diff --git a/src/python/espressomd/magnetostatics.pyx b/src/python/espressomd/magnetostatics.pyx
index 086748acc2f..5da552948aa 100644
--- a/src/python/espressomd/magnetostatics.pyx
+++ b/src/python/espressomd/magnetostatics.pyx
@@ -23,7 +23,7 @@ IF SCAFACOS == 1:
     from .scafacos import ScafacosConnector
     from . cimport scafacos
 
-from .utils cimport handle_errors
+from .utils import handle_errors
 from .utils import is_valid_type, check_type_or_throw_except, to_str
 
 IF DIPOLES == 1:
@@ -96,10 +96,6 @@ IF DP3M == 1:
             super().validate_params()
             default_params = self.default_params()
 
-            if not (self._params["r_cut"] >= 0
-                    or self._params["r_cut"] == default_params["r_cut"]):
-                raise ValueError("P3M r_cut has to be >=0")
-
             if is_valid_type(self._params["mesh"], int):
                 pass
             else:
@@ -110,13 +106,6 @@ IF DP3M == 1:
                     raise ValueError(
                         "DipolarP3M requires a cubic box")
 
-            if not (self._params["cao"] >= -1 and self._params["cao"] <= 7):
-                raise ValueError(
-                    "P3M cao has to be an integer between -1 and 7")
-
-            if not (self._params["accuracy"] > 0):
-                raise ValueError("P3M accuracy has to be positive")
-
             if self._params["epsilon"] == "metallic":
                 self._params["epsilon"] = 0.0
 
@@ -131,7 +120,7 @@ IF DP3M == 1:
         def valid_keys(self):
             return ["prefactor", "alpha_L", "r_cut_iL", "mesh", "mesh_off",
                     "cao", "accuracy", "epsilon", "cao_cut", "a", "ai",
-                    "alpha", "r_cut", "cao3", "additional_mesh", "tune", "verbose"]
+                    "alpha", "r_cut", "cao3", "tune", "verbose"]
 
         def required_keys(self):
             return ["accuracy", ]
@@ -154,20 +143,32 @@ IF DP3M == 1:
             return params
 
         def _set_params_in_es_core(self):
+            if hasattr(self._params["mesh"], "__getitem__"):
+                mesh = self._params["mesh"][0]
+            else:
+                mesh = self._params["mesh"]
+
             self.set_magnetostatics_prefactor()
             dp3m_set_eps(self._params["epsilon"])
-            self.python_dp3m_set_mesh_offset(self._params["mesh_off"])
-            self.python_dp3m_set_params(
-                self._params["r_cut"], self._params["mesh"],
-                self._params["cao"], self._params["alpha"], self._params["accuracy"])
+            dp3m_set_mesh_offset(self._params["mesh_off"][0],
+                                 self._params["mesh_off"][1],
+                                 self._params["mesh_off"][2])
+            dp3m_set_params(self._params["r_cut"], mesh, self._params["cao"],
+                            self._params["alpha"], self._params["accuracy"])
 
         def _tune(self):
+            if hasattr(self._params["mesh"], "__getitem__"):
+                mesh = self._params["mesh"][0]
+            else:
+                mesh = self._params["mesh"]
+
             self.set_magnetostatics_prefactor()
             dp3m_set_eps(self._params["epsilon"])
-            self.python_dp3m_set_tune_params(
-                self._params["r_cut"], self._params["mesh"],
-                self._params["cao"], -1., self._params["accuracy"])
-            python_dp3m_adaptive_tune(self._params["verbose"])
+            dp3m_set_tune_params(self._params["r_cut"], mesh,
+                                 self._params["cao"], self._params["accuracy"])
+            tuning_error = dp3m_adaptive_tune(self._params["verbose"])
+            if tuning_error:
+                handle_errors("DipolarP3M: tuning failed")
             self._params.update(self._get_params_from_es_core())
 
         def _activate_method(self):
@@ -181,48 +182,6 @@ IF DP3M == 1:
             dp3m_deactivate()
             super()._deactivate_method()
 
-        def python_dp3m_set_mesh_offset(self, mesh_off):
-            cdef double mesh_offset[3]
-            mesh_offset[0] = mesh_off[0]
-            mesh_offset[1] = mesh_off[1]
-            mesh_offset[2] = mesh_off[2]
-            return dp3m_set_mesh_offset(
-                mesh_offset[0], mesh_offset[1], mesh_offset[2])
-
-        def python_dp3m_set_params(self, p_r_cut, p_mesh, p_cao, p_alpha,
-                                   p_accuracy):
-            cdef int mesh
-            cdef double r_cut
-            cdef int cao
-            cdef double alpha
-            cdef double accuracy
-            r_cut = p_r_cut
-            cao = p_cao
-            alpha = p_alpha
-            accuracy = p_accuracy
-            if hasattr(p_mesh, "__getitem__"):
-                mesh = p_mesh[0]
-            else:
-                mesh = p_mesh
-            dp3m_set_params(r_cut, mesh, cao, alpha, accuracy)
-
-        def python_dp3m_set_tune_params(self, p_r_cut, p_mesh, p_cao, p_alpha,
-                                        p_accuracy):
-            cdef int mesh
-            cdef double r_cut
-            cdef int cao
-            cdef double alpha
-            cdef double accuracy
-            r_cut = p_r_cut
-            cao = p_cao
-            alpha = p_alpha
-            accuracy = p_accuracy
-            if hasattr(p_mesh, "__getitem__"):
-                mesh = p_mesh[0]
-            else:
-                mesh = p_mesh
-            dp3m_set_tune_params(r_cut, mesh, cao, alpha, accuracy)
-
 IF DIPOLES == 1:
     cdef class DipolarDirectSumCpu(MagnetostaticInteraction):
         """
diff --git a/src/python/espressomd/math.py b/src/python/espressomd/math.py
new file mode 100644
index 00000000000..5b7ff893eb4
--- /dev/null
+++ b/src/python/espressomd/math.py
@@ -0,0 +1,36 @@
+# Copyright (C) 2010-2019 The ESPResSo project
+#
+# This file is part of ESPResSo.
+#
+# ESPResSo is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# ESPResSo is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+from .script_interface import ScriptInterfaceHelper, script_interface_register
+
+
+@script_interface_register
+class CylindricalTransformationParameters(ScriptInterfaceHelper):
+    """
+    Class to hold and validate the parameters needed for a cylindrical transformation.
+    The three parameters are available as attributes but are read-only.
+
+    Parameters
+    ----------
+    center : (3,) array_like of :obj:`float`, default = [0, 0, 0]
+        Position of the origin of the cylindrical coordinate system.
+    axis : (3,) array_like of :obj:`float`, default = [0, 0, 1]
+        Orientation vector of the ``z``-axis of the cylindrical coordinate system.
+    orientation : (3,) array_like of :obj:`float`, default = [1, 0, 0]
+        The axis on which ``phi = 0``.
+    """
+    _so_name = "CylindricalTransformationParameters"
diff --git a/src/python/espressomd/observables.py b/src/python/espressomd/observables.py
index 2169bee15e9..430e68755c4 100644
--- a/src/python/espressomd/observables.py
+++ b/src/python/espressomd/observables.py
@@ -17,6 +17,7 @@
 import itertools
 import numpy as np
 from .script_interface import ScriptInterfaceHelper, script_interface_register
+from .math import CylindricalTransformationParameters
 
 
 @script_interface_register
@@ -69,6 +70,18 @@ def bin_centers(self):
         return np.array(list(itertools.product(*edges))).reshape(shape)
 
 
+class CylindricalProfileObservable(ProfileObservable):
+    """
+    Base class for observables that work with cylindrical coordinates.
+    """
+
+    def __init__(self, transform_params=None, **kwargs):
+        # Build the default per call, not once at import time (shared object).
+        if transform_params is None:
+            transform_params = CylindricalTransformationParameters()
+        super().__init__(transform_params=transform_params, **kwargs)
+
+
 @script_interface_register
 class ComPosition(Observable):
 
@@ -636,7 +649,7 @@ class DPDStress(Observable):
 
 
 @script_interface_register
-class CylindricalDensityProfile(ProfileObservable):
+class CylindricalDensityProfile(CylindricalProfileObservable):
 
     """Calculates the particle density in cylindrical coordinates.
 
@@ -644,26 +657,24 @@ class CylindricalDensityProfile(ProfileObservable):
     ----------
     ids : array_like of :obj:`int`
         The ids of (existing) particles to take into account.
-    center : (3,) array_like of :obj:`float`
-        Position of the center of the cylindrical coordinate system for the histogram.
-    axis : (3,) array_like of :obj:`float`
-        Orientation vector of the ``z``-axis of the cylindrical coordinate system for the histogram.
-    n_r_bins : :obj:`int`
+    transform_params : :class:`espressomd.math.CylindricalTransformationParameters`, optional
+        Parameters of the cylindrical transformation. Defaults to a default-constructed :class:`espressomd.math.CylindricalTransformationParameters`.
+    n_r_bins : :obj:`int`, default = 1
         Number of bins in radial direction.
-    n_phi_bins : :obj:`int`
+    n_phi_bins : :obj:`int`, default = 1
         Number of bins for the azimuthal direction.
-    n_z_bins : :obj:`int`
+    n_z_bins : :obj:`int`, default = 1
         Number of bins in ``z`` direction.
-    min_r : :obj:`float`
+    min_r : :obj:`float`, default = 0
         Minimum ``r`` to consider.
-    min_phi : :obj:`float`
-        Minimum ``phi`` to consider.
+    min_phi : :obj:`float`, default = -pi
+        Minimum ``phi`` to consider. Must be in [-pi,pi).
     min_z : :obj:`float`
         Minimum ``z`` to consider.
     max_r : :obj:`float`
         Maximum ``r`` to consider.
-    max_phi : :obj:`float`
-        Maximum ``phi`` to consider.
+    max_phi : :obj:`float`, default = pi
+        Maximum ``phi`` to consider. Must be in (-pi,pi].
     max_z : :obj:`float`
         Maximum ``z`` to consider.
 
@@ -676,7 +687,7 @@ class CylindricalDensityProfile(ProfileObservable):
 
 
 @script_interface_register
-class CylindricalFluxDensityProfile(ProfileObservable):
+class CylindricalFluxDensityProfile(CylindricalProfileObservable):
 
     """Calculates the particle flux density in cylindrical coordinates.
 
@@ -684,26 +695,24 @@ class CylindricalFluxDensityProfile(ProfileObservable):
     ----------
     ids : array_like of :obj:`int`
         The ids of (existing) particles to take into account.
-    center : (3,) array_like of :obj:`float`
-        Position of the center of the cylindrical coordinate system for the histogram.
-    axis : (3,) array_like of :obj:`float`
-        Orientation vector of the ``z``-axis of the cylindrical coordinate system for the histogram.
-    n_r_bins : :obj:`int`
+    transform_params : :class:`espressomd.math.CylindricalTransformationParameters`, optional
+        Parameters of the cylindrical transformation. Defaults to a default-constructed :class:`espressomd.math.CylindricalTransformationParameters`.
+    n_r_bins : :obj:`int`, default = 1
         Number of bins in radial direction.
-    n_phi_bins : :obj:`int`
+    n_phi_bins : :obj:`int`, default = 1
         Number of bins for the azimuthal direction.
-    n_z_bins : :obj:`int`
+    n_z_bins : :obj:`int`, default = 1
         Number of bins in ``z`` direction.
-    min_r : :obj:`float`
+    min_r : :obj:`float`, default = 0
         Minimum ``r`` to consider.
-    min_phi : :obj:`float`
-        Minimum ``phi`` to consider.
+    min_phi : :obj:`float`, default = -pi
+        Minimum ``phi`` to consider. Must be in [-pi,pi).
     min_z : :obj:`float`
         Minimum ``z`` to consider.
     max_r : :obj:`float`
         Maximum ``r`` to consider.
-    max_phi : :obj:`float`
-        Maximum ``phi`` to consider.
+    max_phi : :obj:`float`, default = pi
+        Maximum ``phi`` to consider. Must be in (-pi,pi].
     max_z : :obj:`float`
         Maximum ``z`` to consider.
 
@@ -718,7 +727,8 @@ class CylindricalFluxDensityProfile(ProfileObservable):
 
 
 @script_interface_register
-class CylindricalLBFluxDensityProfileAtParticlePositions(ProfileObservable):
+class CylindricalLBFluxDensityProfileAtParticlePositions(
+        CylindricalProfileObservable):
 
     """Calculates the LB fluid flux density at the particle positions in
     cylindrical coordinates.
@@ -727,26 +737,24 @@ class CylindricalLBFluxDensityProfileAtParticlePositions(ProfileObservable):
     ----------
     ids : array_like of :obj:`int`
         The ids of (existing) particles to take into account.
-    center : (3,) array_like of :obj:`float`
-        Position of the center of the cylindrical coordinate system for the histogram.
-    axis : (3,) array_like of :obj:`float`
-        Orientation vector of the ``z``-axis of the cylindrical coordinate system for the histogram.
-    n_r_bins : :obj:`int`
+    transform_params : :class:`espressomd.math.CylindricalTransformationParameters`, optional
+        Parameters of the cylindrical transformation. Defaults to a default-constructed :class:`espressomd.math.CylindricalTransformationParameters`.
+    n_r_bins : :obj:`int`, default = 1
         Number of bins in radial direction.
-    n_phi_bins : :obj:`int`
+    n_phi_bins : :obj:`int`, default = 1
         Number of bins for the azimuthal direction.
-    n_z_bins : :obj:`int`
+    n_z_bins : :obj:`int`, default = 1
         Number of bins in ``z`` direction.
-    min_r : :obj:`float`
+    min_r : :obj:`float`, default = 0
         Minimum ``r`` to consider.
-    min_phi : :obj:`float`
-        Minimum ``phi`` to consider.
+    min_phi : :obj:`float`, default = -pi
+        Minimum ``phi`` to consider. Must be in [-pi,pi).
     min_z : :obj:`float`
         Minimum ``z`` to consider.
     max_r : :obj:`float`
         Maximum ``r`` to consider.
-    max_phi : :obj:`float`
-        Maximum ``phi`` to consider.
+    max_phi : :obj:`float`, default = pi
+        Maximum ``phi`` to consider. Must be in (-pi,pi].
     max_z : :obj:`float`
         Maximum ``z`` to consider.
 
@@ -761,7 +769,8 @@ class CylindricalLBFluxDensityProfileAtParticlePositions(ProfileObservable):
 
 
 @script_interface_register
-class CylindricalLBVelocityProfileAtParticlePositions(ProfileObservable):
+class CylindricalLBVelocityProfileAtParticlePositions(
+        CylindricalProfileObservable):
 
     """Calculates the LB fluid velocity at the particle positions in
     cylindrical coordinates.
@@ -770,26 +779,24 @@ class CylindricalLBVelocityProfileAtParticlePositions(ProfileObservable):
     ----------
     ids : array_like of :obj:`int`
         The ids of (existing) particles to take into account.
-    center : (3,) array_like of :obj:`float`
-        Position of the center of the cylindrical coordinate system for the histogram.
-    axis : (3,) array_like of :obj:`float`
-        Orientation vector of the ``z``-axis of the cylindrical coordinate system for the histogram.
-    n_r_bins : :obj:`int`
+    transform_params : :class:`espressomd.math.CylindricalTransformationParameters`, optional
+        Parameters of the cylindrical transformation. Defaults to a default-constructed :class:`espressomd.math.CylindricalTransformationParameters`.
+    n_r_bins : :obj:`int`, default = 1
         Number of bins in radial direction.
-    n_phi_bins : :obj:`int`
+    n_phi_bins : :obj:`int`, default = 1
         Number of bins for the azimuthal direction.
-    n_z_bins : :obj:`int`
+    n_z_bins : :obj:`int`, default = 1
         Number of bins in ``z`` direction.
-    min_r : :obj:`float`
+    min_r : :obj:`float`, default = 0
         Minimum ``r`` to consider.
-    min_phi : :obj:`float`
-        Minimum ``phi`` to consider.
+    min_phi : :obj:`float`, default = -pi
+        Minimum ``phi`` to consider. Must be in [-pi,pi).
     min_z : :obj:`float`
         Minimum ``z`` to consider.
     max_r : :obj:`float`
         Maximum ``r`` to consider.
-    max_phi : :obj:`float`
-        Maximum ``phi`` to consider.
+    max_phi : :obj:`float`, default = pi
+        Maximum ``phi`` to consider. Must be in (-pi,pi].
     max_z : :obj:`float`
         Maximum ``z`` to consider.
 
@@ -804,7 +811,7 @@ class CylindricalLBVelocityProfileAtParticlePositions(ProfileObservable):
 
 
 @script_interface_register
-class CylindricalVelocityProfile(ProfileObservable):
+class CylindricalVelocityProfile(CylindricalProfileObservable):
 
     """Calculates the particle velocity profile in cylindrical coordinates.
 
@@ -812,26 +819,24 @@ class CylindricalVelocityProfile(ProfileObservable):
     ----------
     ids : array_like of :obj:`int`
         The ids of (existing) particles to take into account.
-    center : (3,) array_like of :obj:`float`
-        Position of the center of the cylindrical coordinate system for the histogram.
-    axis : (3,) array_like of :obj:`float`
-        Orientation vector of the ``z``-axis of the cylindrical coordinate system for the histogram.
-    n_r_bins : :obj:`int`
+    transform_params : :class:`espressomd.math.CylindricalTransformationParameters`, optional
+        Parameters of the cylindrical transformation. Defaults to a default-constructed :class:`espressomd.math.CylindricalTransformationParameters`.
+    n_r_bins : :obj:`int`, default = 1
         Number of bins in radial direction.
-    n_phi_bins : :obj:`int`
+    n_phi_bins : :obj:`int`, default = 1
         Number of bins for the azimuthal direction.
-    n_z_bins : :obj:`int`
+    n_z_bins : :obj:`int`, default = 1
         Number of bins in ``z`` direction.
-    min_r : :obj:`float`
+    min_r : :obj:`float`, default = 0
         Minimum ``r`` to consider.
-    min_phi : :obj:`float`
-        Minimum ``phi`` to consider.
+    min_phi : :obj:`float`, default = -pi
+        Minimum ``phi`` to consider. Must be in [-pi,pi).
     min_z : :obj:`float`
         Minimum ``z`` to consider.
     max_r : :obj:`float`
         Maximum ``r`` to consider.
-    max_phi : :obj:`float`
-        Maximum ``phi`` to consider.
+    max_phi : :obj:`float`, default = pi
+        Maximum ``phi`` to consider. Must be in (-pi,pi].
     max_z : :obj:`float`
         Maximum ``z`` to consider.
 
@@ -846,7 +851,7 @@ class CylindricalVelocityProfile(ProfileObservable):
 
 
 @script_interface_register
-class CylindricalLBVelocityProfile(ProfileObservable):
+class CylindricalLBVelocityProfile(CylindricalProfileObservable):
 
     """Calculates the LB fluid velocity profile in cylindrical coordinates.
 
@@ -856,26 +861,24 @@ class CylindricalLBVelocityProfile(ProfileObservable):
 
     Parameters
     ----------
-    center : (3,) array_like of :obj:`float`
-        Position of the center of the cylindrical coordinate system for the histogram.
-    axis : (3,) array_like of :obj:`float`
-        Orientation vector of the ``z``-axis of the cylindrical coordinate system for the histogram.
-    n_r_bins : :obj:`int`
+    transform_params : :class:`espressomd.math.CylindricalTransformationParameters`, optional
+        Parameters of the cylindrical transformation. Defaults to a default-constructed :class:`espressomd.math.CylindricalTransformationParameters`.
+    n_r_bins : :obj:`int`, default = 1
         Number of bins in radial direction.
-    n_phi_bins : :obj:`int`
+    n_phi_bins : :obj:`int`, default = 1
         Number of bins for the azimuthal direction.
-    n_z_bins : :obj:`int`
+    n_z_bins : :obj:`int`, default = 1
         Number of bins in ``z`` direction.
-    min_r : :obj:`float`
+    min_r : :obj:`float`, default = 0
         Minimum ``r`` to consider.
-    min_phi : :obj:`float`
-        Minimum ``phi`` to consider.
+    min_phi : :obj:`float`, default = -pi
+        Minimum ``phi`` to consider. Must be in [-pi,pi).
     min_z : :obj:`float`
         Minimum ``z`` to consider.
     max_r : :obj:`float`
         Maximum ``r`` to consider.
-    max_phi : :obj:`float`
-        Maximum ``phi`` to consider.
+    max_phi : :obj:`float`, default = pi
+        Maximum ``phi`` to consider. Must be in (-pi,pi].
     max_z : :obj:`float`
         Maximum ``z`` to consider.
     sampling_density : :obj:`float`
@@ -920,7 +923,7 @@ class RDF(Observable):
     _so_name = "Observables::RDF"
 
     def __init__(self, **kwargs):
-        if "oid" not in kwargs and "ids2" not in kwargs:
+        if "ids2" not in kwargs:
             kwargs["ids2"] = []
         super().__init__(**kwargs)
 
diff --git a/src/python/espressomd/p3m_common.pxd b/src/python/espressomd/p3m_common.pxd
index 3f7b685038c..aaf7f92b4d9 100644
--- a/src/python/espressomd/p3m_common.pxd
+++ b/src/python/espressomd/p3m_common.pxd
@@ -30,4 +30,3 @@ IF P3M == 1 or DP3M == 1:
             double a[3]
             double alpha
             double r_cut
-            double additional_mesh[3]
diff --git a/src/python/espressomd/particle_data.pxd b/src/python/espressomd/particle_data.pxd
index dc50d78194b..2b3d54e4c83 100644
--- a/src/python/espressomd/particle_data.pxd
+++ b/src/python/espressomd/particle_data.pxd
@@ -172,7 +172,6 @@ cdef extern from "particle_data.hpp":
 
     IF EXCLUSIONS:
         int change_exclusion(int part, int part2, int _delete)
-        void remove_all_exclusions()
 
     IF ENGINE:
         void set_particle_swimming(int part, particle_parameters_swimming swim)
diff --git a/src/python/espressomd/particle_data.pyx b/src/python/espressomd/particle_data.pyx
index 52a9e75ed62..f1b0612fcdb 100644
--- a/src/python/espressomd/particle_data.pyx
+++ b/src/python/espressomd/particle_data.pyx
@@ -1145,8 +1145,8 @@ cdef class ParticleHandle:
             constant terminal velocity in either of these methods is completely
             determined by the friction coefficient. You may only set one of the
             possibilities ``v_swim`` *or* ``f_swim`` as you cannot relax to constant force
-            *and* constant velocity at the same time. The setting both ``v_swim`` and
-            ``f_swim`` to 0.0 thus disables swimming. This option applies to all
+            *and* constant velocity at the same time. Setting both ``v_swim`` and
+            ``f_swim`` to 0.0 disables swimming. This option applies to all
             non-lattice-Boltzmann thermostats. Note that there is no real difference
             between ``v_swim`` and ``f_swim`` since the latter may always be chosen such that
             the same terminal velocity is achieved for a given friction coefficient.
@@ -1161,7 +1161,7 @@ cdef class ParticleHandle:
             v_swim : :obj:`float`
                 Achieve a constant velocity by imposing a constant terminal
                 velocity ``v_swim``. This excludes the option ``f_swim``.
-            mode : :obj:`str`, \{'pusher', 'puller'\}
+            mode : :obj:`str`, \{'pusher', 'puller', 'N/A'\}
                 The LB flow field can be generated by a pushing or a
                 pulling mechanism, leading to change in the sign of the
                 dipolar flow field with respect to the direction of motion.
@@ -1182,10 +1182,10 @@ cdef class ParticleHandle:
             >>> system = espressomd.System()
             >>>
             >>> # Usage with Langevin
-            >>> system.part.add(id=0, pos=[1,0,0],swimming={'f_swim':0.03})
+            >>> system.part.add(id=0, pos=[1, 0, 0], swimming={'f_swim': 0.03})
             >>>
             >>> # Usage with LB
-            >>> system.part.add(id=1, pos=[2,0,0], swimming={'f_swim': 0.01,
+            >>> system.part.add(id=1, pos=[2, 0, 0], swimming={'f_swim': 0.01,
             ...     'mode': 'pusher', 'dipole_length': 2.0})
 
             """
@@ -1228,7 +1228,7 @@ cdef class ParticleHandle:
                             swim.push_pull = 0
                         else:
                             raise Exception(
-                                "'mode' has to be either 'pusher' or 'puller'.")
+                                "'mode' has to be either 'pusher', 'puller' or 'N/A'.")
 
                     if 'dipole_length' in _params:
                         check_type_or_throw_except(
@@ -1578,7 +1578,7 @@ cdef class _ParticleSliceImpl:
         id_list = id_list[slice_]
 
         # Generate a mask which will remove ids of non-existing particles
-        mask = np.empty(len(id_list), dtype=np.bool)
+        mask = np.empty(len(id_list), dtype=type(True))
         mask[:] = True
         for i, id in enumerate(id_list):
             if not particle_exists(id):
@@ -1922,7 +1922,7 @@ Set quat and scalar dipole moment (dipm) instead.")
         if is_valid_type(idx, int):
             return particle_exists(idx)
         if isinstance(idx, (slice, tuple, list, np.ndarray)):
-            tf_array = np.zeros(len(idx), dtype=np.bool)
+            tf_array = np.zeros(len(idx), dtype=type(True))
             for i in range(len(idx)):
                 tf_array[i] = particle_exists(idx[i])
             return tf_array
diff --git a/src/python/espressomd/scafacos.pyx b/src/python/espressomd/scafacos.pyx
index 62726cf3fa2..93cf4263b7f 100644
--- a/src/python/espressomd/scafacos.pyx
+++ b/src/python/espressomd/scafacos.pyx
@@ -21,8 +21,7 @@ from .actors cimport Actor
 from libcpp.string cimport string  # import std::string
 from . cimport electrostatics
 from . cimport magnetostatics
-from .utils import to_char_pointer, to_str
-from .utils cimport handle_errors
+from .utils import to_char_pointer, to_str, handle_errors
 
 
 include "myconfig.pxi"
diff --git a/src/python/espressomd/script_interface.pyx b/src/python/espressomd/script_interface.pyx
index a2e6dfcfe23..5af72c5859a 100644
--- a/src/python/espressomd/script_interface.pyx
+++ b/src/python/espressomd/script_interface.pyx
@@ -15,8 +15,8 @@
 # You should have received a copy of the GNU General Public License
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 import numpy as np
-from .utils import to_char_pointer, to_str
-from .utils cimport Vector3d, make_array_locked, handle_errors
+from .utils import to_char_pointer, to_str, handle_errors
+from .utils cimport Vector3d, make_array_locked
 
 from libcpp.memory cimport make_shared
 
@@ -44,13 +44,12 @@ cdef class PScriptInterface:
 
     Parameters
     ----------
+    sip : :class:`PObjectRef`
+        Object id of an existing core object (method 1).
     name : :obj:`str`
-        Name of the core class to instantiate (method 1).
+        Name of the core class to instantiate (method 2).
     \*\*kwargs
-        Parameters for the core class constructor (method 1).
-    sip : :class:`PObjectRef`
-        Object id of an existing core object (method 2).
-
+        Parameters for the core class constructor (method 2).
     policy : :obj:`str`, \{'GLOBAL', 'LOCAL'\}
         Creation policy.
 
@@ -59,8 +58,6 @@ cdef class PScriptInterface:
 
     sip: :class:`PObjectRef`
         Pointer to a ScriptInterface object in the core.
-    policy_: :obj:`str`
-        Creation policy.
 
     """
 
@@ -184,6 +181,7 @@ cdef Variant python_object_to_variant(value):
     """Convert Python objects to C++ Variant objects."""
 
     cdef vector[Variant] vec
+    cdef unordered_map[int, Variant] vmap
     cdef PObjectRef oref
 
     if value is None:
@@ -195,6 +193,13 @@ cdef Variant python_object_to_variant(value):
     if isinstance(value, PScriptInterface):
         oref = value.get_sip()
         return make_variant(oref.sip)
+    elif isinstance(value, dict):
+        for k, v in value.items():
+            if not isinstance(k, int):
+                raise TypeError(
+                    f"No conversion from type dict_item([({type(k).__name__}, {type(v).__name__})]) to Variant[std::unordered_map<int, Variant>]")
+            vmap[k] = python_object_to_variant(v)
+        return make_variant[unordered_map[int, Variant]](vmap)
     elif hasattr(value, '__iter__') and not(type(value) == str):
         for e in value:
             vec.push_back(python_object_to_variant(e))
@@ -208,12 +213,14 @@ cdef Variant python_object_to_variant(value):
     elif np.issubdtype(np.dtype(type(value)), np.floating):
         return make_variant[double](value)
     else:
-        raise TypeError("Unknown type for conversion to Variant")
+        raise TypeError(
+            f"No conversion from type {type(value).__name__} to Variant")
 
 cdef variant_to_python_object(const Variant & value) except +:
     """Convert C++ Variant objects to Python objects."""
 
     cdef vector[Variant] vec
+    cdef unordered_map[int, Variant] vmap
     cdef shared_ptr[ObjectHandle] ptr
     if is_none(value):
         return None
@@ -263,6 +270,14 @@ cdef variant_to_python_object(const Variant & value) except +:
             res.append(variant_to_python_object(i))
 
         return res
+    if is_type[unordered_map[int, Variant]](value):
+        vmap = get_value[unordered_map[int, Variant]](value)
+        res = {}
+
+        for kv in vmap:
+            res[kv.first] = variant_to_python_object(kv.second)
+
+        return res
 
     raise TypeError("Unknown type")
 
diff --git a/src/python/espressomd/system.pyx b/src/python/espressomd/system.pyx
index 96b0802f305..692722f1838 100644
--- a/src/python/espressomd/system.pyx
+++ b/src/python/espressomd/system.pyx
@@ -43,11 +43,11 @@ if LB_BOUNDARIES or LB_BOUNDARIES_GPU:
     from .ekboundaries import EKBoundaries
 from .comfixed import ComFixed
 from .globals import Globals
-from .globals cimport FIELD_SIMTIME, FIELD_MAX_OIF_OBJECTS
-from .globals cimport integ_switch, max_oif_objects, sim_time
+from .globals cimport FIELD_MAX_OIF_OBJECTS
+from .globals cimport integ_switch, max_oif_objects
 from .globals cimport maximal_cutoff_bonded, maximal_cutoff_nonbonded, mpi_bcast_parameter
-from .utils cimport handle_errors, check_type_or_throw_except
-from .utils import is_valid_type
+from .utils cimport check_type_or_throw_except
+from .utils import is_valid_type, handle_errors
 IF VIRTUAL_SITES:
     from .virtual_sites import ActiveVirtualSitesHandle, VirtualSitesOff
 
@@ -240,13 +240,10 @@ cdef class System:
         def __set__(self, double _time):
             if _time < 0:
                 raise ValueError("Simulation time must be >= 0")
-            global sim_time
-            sim_time = _time
-            mpi_bcast_parameter(FIELD_SIMTIME)
+            self.globals.time = _time
 
         def __get__(self):
-            global sim_time
-            return sim_time
+            return self.globals.time
 
     property time_step:
         """
diff --git a/src/python/espressomd/thermostat.pyx b/src/python/espressomd/thermostat.pyx
index 5db6938114b..3c7f4678e16 100644
--- a/src/python/espressomd/thermostat.pyx
+++ b/src/python/espressomd/thermostat.pyx
@@ -632,7 +632,7 @@ cdef class Thermostat:
             lb_lbcoupling_set_rng_state(0)
 
         global thermo_switch
-        thermo_switch = (thermo_switch or THERMO_LB)
+        thermo_switch = (thermo_switch | THERMO_LB)
         mpi_bcast_parameter(FIELD_THERMO_SWITCH)
 
         global thermo_virtual
diff --git a/src/python/espressomd/utils.pxd b/src/python/espressomd/utils.pxd
index c4ddde6bca9..a0b5ab06911 100644
--- a/src/python/espressomd/utils.pxd
+++ b/src/python/espressomd/utils.pxd
@@ -102,6 +102,7 @@ cdef extern from "utils/quaternion.hpp" namespace "Utils":
         T & operator[](int i)
 
 cdef make_array_locked(Vector3d)
+cdef make_array_locked_vector(vector[Vector3d] v)
 cdef Vector3d make_Vector3d(a)
 
 cdef extern from "utils/Factory.hpp" namespace "Utils":
diff --git a/src/python/espressomd/utils.pyx b/src/python/espressomd/utils.pyx
index 5246b29b1ee..83bc3c704eb 100644
--- a/src/python/espressomd/utils.pyx
+++ b/src/python/espressomd/utils.pyx
@@ -218,6 +218,13 @@ Use numpy.copy(<ESPResSo array property>) to get a writable copy."
 cdef make_array_locked(Vector3d v):
     return array_locked([v[0], v[1], v[2]])
 
+cdef make_array_locked_vector(vector[Vector3d] v):
+    # Copy each Vector3d of ``v`` into one row of an (N, 3) numpy array.
+    rows = np.empty((v.size(), 3))
+    for idx in range(v.size()):
+        rows[idx] = [v[idx][0], v[idx][1], v[idx][2]]
+    return array_locked(rows)
+
 
 cdef Vector3d make_Vector3d(a):
     cdef Vector3d v
@@ -274,7 +281,7 @@ def is_valid_type(value, t):
     if value is None:
         return False
     if t == int:
-        return isinstance(value, (int, np.integer, np.long))
+        return isinstance(value, (int, np.integer))
     elif t == float:
         if hasattr(np, 'float128'):
             return isinstance(
diff --git a/src/python/espressomd/visualization_opengl.py b/src/python/espressomd/visualization_opengl.py
index 7a1ec1658ef..412cd37b50f 100644
--- a/src/python/espressomd/visualization_opengl.py
+++ b/src/python/espressomd/visualization_opengl.py
@@ -1931,6 +1931,7 @@ def __init__(self, shape, particle_type, color, material,
         self.axis = np.array(self.shape.get_parameter('axis'))
         self.length = self.shape.get_parameter('length')
         self.radius = self.shape.get_parameter('radius')
+        self.open = self.shape.get_parameter('open')
         self.cap_center_1 = self.center - self.axis / \
             np.linalg.norm(self.axis) * 0.5 * self.length
         self.cap_center_2 = self.center + self.axis / \
@@ -1939,7 +1940,7 @@ def __init__(self, shape, particle_type, color, material,
     def draw(self):
         draw_cylinder(self.cap_center_1, self.cap_center_2,
                       self.radius, self.color, self.material,
-                      self.quality, draw_caps=True)
+                      self.quality, draw_caps=not self.open)
 
 
 class Ellipsoid(Shape):
diff --git a/src/python/object_in_fluid/oif_classes.py b/src/python/object_in_fluid/oif_classes.py
index 883639e7087..c6c1d8e54dc 100644
--- a/src/python/object_in_fluid/oif_classes.py
+++ b/src/python/object_in_fluid/oif_classes.py
@@ -1092,7 +1092,7 @@ def set_mesh_points(self, file_name=None):
             i = 0
             for line in nodes_coord:  # extracts coordinates from the string line
                 line = line.split()
-                new_position = np.array(line).astype(np.float) + center
+                new_position = np.array(line).astype(float) + center
                 self.mesh.points[i].set_pos(new_position)
                 i += 1
 
@@ -1376,7 +1376,6 @@ def elastic_forces(
                 self.append_point_data_to_vtk(
                     file_name=vtk_file, data_name="total_f_metric",
                     data=elastic_forces_norms_list, first_append=first)
-                first = False
 
         # output raw data
         if raw_data_file is not None:
diff --git a/src/python/pypresso.cmakein b/src/python/pypresso.cmakein
index 3ff41cce483..4b1ef554801 100755
--- a/src/python/pypresso.cmakein
+++ b/src/python/pypresso.cmakein
@@ -16,6 +16,7 @@ export PYTHONPATH
 
 if [ "@CMAKE_CXX_COMPILER_ID@" != "GNU" ] && [ "@WITH_ASAN@" = "ON" ]; then
   asan_lib=$("@CMAKE_CXX_COMPILER@" /dev/null -### -o /dev/null -fsanitize=address 2>&1 | grep -o '[" ][^" ]*libclang_rt.asan[^" ]*[^s][" ]' | sed 's/[" ]//g' | sed 's/\.a$/.so/g')
+  export DYLD_INSERT_LIBRARIES="$asan_lib"
   for lib in $asan_lib; do
       test -f $lib && LD_PRELOAD="$lib $LD_PRELOAD"
   done
diff --git a/src/script_interface/CylindricalTransformationParameters.hpp b/src/script_interface/CylindricalTransformationParameters.hpp
new file mode 100644
index 00000000000..89ed4b840d0
--- /dev/null
+++ b/src/script_interface/CylindricalTransformationParameters.hpp
@@ -0,0 +1,81 @@
+/*
+ * Copyright (C) 2010-2019 The ESPResSo project
+ * Copyright (C) 2002,2003,2004,2005,2006,2007,2008,2009,2010
+ *   Max-Planck-Institute for Polymer Research, Theory Group
+ *
+ * This file is part of ESPResSo.
+ *
+ * ESPResSo is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * ESPResSo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef SCRIPT_INTERFACE_CYL_TRANSFORM_PARAMS_HPP
+#define SCRIPT_INTERFACE_CYL_TRANSFORM_PARAMS_HPP
+
+#include "script_interface/ScriptInterface.hpp"
+
+#include "utils/math/cylindrical_transformation_parameters.hpp"
+
+#include <memory>
+
+namespace ScriptInterface {
+
+/**
+ * @brief Script interface wrapper around
+ *        @ref Utils::CylindricalTransformationParameters.
+ *
+ * The wrapped core object is created in @ref do_construct and is exposed
+ * through the read-only parameters @c center, @c axis and @c orientation.
+ */
+class CylindricalTransformationParameters
+    : public AutoParameters<CylindricalTransformationParameters> {
+public:
+  CylindricalTransformationParameters() {
+    // The getters dereference the core object, which is only set by
+    // do_construct().
+    add_parameters({{"center", AutoParameter::read_only,
+                     [this]() { return m_transform_params->center(); }},
+                    {"axis", AutoParameter::read_only,
+                     [this]() { return m_transform_params->axis(); }},
+                    {"orientation", AutoParameter::read_only,
+                     [this]() { return m_transform_params->orientation(); }}});
+  }
+  /** @brief Handle to the wrapped core object; null before construction. */
+  std::shared_ptr<::Utils::CylindricalTransformationParameters>
+  cyl_transform_params() {
+    return m_transform_params;
+  }
+  /**
+   * @brief Create the core object from @p params.
+   *
+   * Missing entries fall back to @c center = (0,0,0), @c axis = (0,0,1)
+   * and @c orientation = (1,0,0).
+   */
+  void do_construct(VariantMap const &params) override {
+    m_transform_params =
+        std::make_shared<Utils::CylindricalTransformationParameters>(
+            get_value_or<Utils::Vector3d>(params, "center",
+                                          Utils::Vector3d{{0, 0, 0}}),
+            get_value_or<Utils::Vector3d>(params, "axis",
+                                          Utils::Vector3d{{0, 0, 1}}),
+            get_value_or<Utils::Vector3d>(params, "orientation",
+                                          Utils::Vector3d{{1, 0, 0}}));
+  }
+
+private:
+  /// Core object; set by do_construct().
+  std::shared_ptr<Utils::CylindricalTransformationParameters>
+      m_transform_params;
+};
+} // namespace ScriptInterface
+#endif
diff --git a/src/script_interface/Variant.hpp b/src/script_interface/Variant.hpp
index 07168303b7d..a1c34d3e3e4 100644
--- a/src/script_interface/Variant.hpp
+++ b/src/script_interface/Variant.hpp
@@ -25,9 +25,21 @@
 
 #include <boost/variant.hpp>
 
+/* This <boost/serialization/library_version_type.hpp> include guards against
+ * an issue in boost::serialization from boost 1.74.0 that leads to compiler
+ * error "'library_version_type' is not a member of 'boost::serialization'"
+ * when including <boost/serialization/unordered_map.hpp>. More details
+ * in ticket https://github.com/boostorg/serialization/issues/219
+ */
+#include <boost/serialization/version.hpp>
+#if BOOST_VERSION / 100000 == 1 && BOOST_VERSION / 100 % 1000 == 74
+#include <boost/serialization/library_version_type.hpp>
+#endif
+
 #include <boost/range/algorithm/transform.hpp>
 #include <boost/serialization/serialization.hpp>
 #include <boost/serialization/string.hpp>
+#include <boost/serialization/unordered_map.hpp>
 #include <boost/serialization/variant.hpp>
 #include <boost/serialization/vector.hpp>
 
@@ -48,11 +60,18 @@ constexpr const None none{};
 
 /**
  * @brief Possible types for parameters.
+ *
+ * The visitors and packing functions need to be adapted accordingly when
+ * extending this variant with new types. For the exact details, see commit
+ * <a href="https://github.com/espressomd/espresso/commit/b48ab62">b48ab62</a>.
+ * The number of types is limited by macro @c BOOST_MPL_LIMIT_LIST_SIZE
+ * (defaults to 20).
  */
 using Variant = boost::make_recursive_variant<
     None, bool, int, size_t, double, std::string, std::vector<int>,
     std::vector<double>, ObjectRef, std::vector<boost::recursive_variant_>,
-    Utils::Vector2d, Utils::Vector3d, Utils::Vector4d>::type;
+    Utils::Vector2d, Utils::Vector3d, Utils::Vector4d,
+    std::unordered_map<int, boost::recursive_variant_>>::type;
 
 using VariantMap = std::unordered_map<std::string, Variant>;
 
diff --git a/src/script_interface/constraints/couplings.hpp b/src/script_interface/constraints/couplings.hpp
index 640a5e934dc..31eeac0bcc3 100644
--- a/src/script_interface/constraints/couplings.hpp
+++ b/src/script_interface/constraints/couplings.hpp
@@ -19,6 +19,15 @@
 #ifndef SCRIPT_INTERFACE_CONSTRAINTS_DETAIL_COUPLINGS_HPP
 #define SCRIPT_INTERFACE_CONSTRAINTS_DETAIL_COUPLINGS_HPP
 
+/**
+ * @file
+ * @brief ScriptInterface implementations for the
+ *        various couplings provided.
+ *
+ * These are separated from the Constraints because
+ * they can be reused together with the couplings themselves.
+ */
+
 #include "core/field_coupling/couplings/Charge.hpp"
 #include "core/field_coupling/couplings/Direct.hpp"
 #include "core/field_coupling/couplings/Mass.hpp"
@@ -27,22 +36,13 @@
 
 #include "script_interface/ScriptInterface.hpp"
 
-#include <utils/serialization/pack.hpp>
-#include <utils/serialization/unordered_map.hpp>
+#include <unordered_map>
 
 namespace ScriptInterface {
 namespace Constraints {
 namespace detail {
 using namespace ::FieldCoupling::Coupling;
 
-/**
- * @brief ScriptInterface implementations for the
- *        various couplings provided.
- *
- * These are separated from the Constraints because
- * they can be reused together with the couplings themselves.
- */
-
 /**
  * Default version for parameterless couplings.
  */
@@ -59,7 +59,7 @@ template <> struct coupling_parameters_impl<Viscous> {
   static std::vector<AutoParameter> params(const This &this_) {
     return {{
         "gamma",
-        [this_](const Variant &v) { this_().gamma() = get_value<double>(v); },
+        AutoParameter::read_only,
         [this_]() { return this_().gamma(); },
     }};
   }
@@ -70,18 +70,11 @@ template <> struct coupling_parameters_impl<Scaled> {
   static std::vector<AutoParameter> params(const This &this_) {
     return {{
                 "default_scale",
-                [this_](const Variant &v) {
-                  this_().default_scale() = get_value<double>(v);
-                },
+                AutoParameter::read_only,
                 [this_]() { return this_().default_scale(); },
             },
-            {"particle_scales",
-             [this_](const Variant &v) {
-               this_().particle_scales() =
-                   Utils::unpack<std::unordered_map<int, double>>(
-                       boost::get<std::string>(v));
-             },
-             [this_]() { return Utils::pack(this_().particle_scales()); }}};
+            {"particle_scales", AutoParameter::read_only,
+             [this_]() { return make_map(this_().particle_scales()); }}};
   }
 };
 
@@ -96,12 +89,10 @@ template <> inline Viscous make_coupling<Viscous>(const VariantMap &params) {
 }
 
 template <> inline Scaled make_coupling<Scaled>(const VariantMap &params) {
-  auto scales = params.count("particle_scale")
-                    ? Utils::unpack<std::unordered_map<int, double>>(
-                          get_value<std::string>(params, "particle_scale"))
-                    : std::unordered_map<int, double>{};
-
-  return Scaled{scales, get_value<double>(params, "default_scale")};
+  auto const particle_scales = get_value_or<std::unordered_map<int, Variant>>(
+      params, "particle_scales", {});
+  return Scaled{get_map<int, double>(particle_scales),
+                get_value<double>(params, "default_scale")};
 }
 } // namespace detail
 } // namespace Constraints
diff --git a/src/script_interface/get_value.hpp b/src/script_interface/get_value.hpp
index 955bf06ef3f..d0da51aeb4d 100644
--- a/src/script_interface/get_value.hpp
+++ b/src/script_interface/get_value.hpp
@@ -93,7 +93,7 @@ struct vector_conversion_visitor : boost::static_visitor<Utils::Vector<T, N>> {
     return v;
   }
 
-  /* We try do unpack variant vectors and check if they
+  /* We try to unpack variant vectors and check if they
    * are convertible element by element. */
   auto operator()(std::vector<Variant> const &vv) const {
     if (N != vv.size()) {
@@ -160,6 +160,26 @@ template <> struct get_value_helper<std::vector<double>, void> {
   }
 };
 
+template <typename K, typename T>
+struct GetMapOrEmpty : boost::static_visitor<std::unordered_map<K, T>> {
+  /* Catch all case -> wrong type. */
+  template <typename U> std::unordered_map<K, T> operator()(U const &) const {
+    throw boost::bad_get{};
+  }
+
+  /* Standard case, correct type */
+  std::unordered_map<K, T> operator()(std::unordered_map<K, T> const &v) const {
+    return v;
+  }
+};
+
+/* std::unordered_map cases */
+template <> struct get_value_helper<std::unordered_map<int, Variant>, void> {
+  std::unordered_map<int, Variant> operator()(Variant const &v) const {
+    return boost::apply_visitor(GetMapOrEmpty<int, Variant>{}, v);
+  }
+};
+
 /* This allows direct retrieval of a shared_ptr to the object from
    an ObjectId variant. If the type is a derived type, the type is
    also checked.
@@ -213,6 +233,33 @@ template <typename T> T get_value(Variant const &v) {
   }
 }
 
+template <typename K, typename V>
+std::unordered_map<K, V> get_map(std::unordered_map<K, Variant> const &v) {
+  // Convert each Variant to V; bad_get is rethrown as a descriptive Exception.
+  std::unordered_map<K, V> converted;
+  for (auto const &kv : v) {
+    try {
+      converted.insert({kv.first, detail::get_value_helper<V>{}(kv.second)});
+    } catch (const boost::bad_get &) {
+      throw Exception("Provided map value of type " +
+                      detail::type_label(kv.second) +
+                      " is not convertible to " + Utils::demangle<V>() +
+                      " (raised during the creation of a " +
+                      Utils::demangle<std::unordered_map<K, V>>() + ")");
+    }
+  }
+  return converted;
+}
+
+/** @brief Wrap every mapped value of @p v in a Variant, keeping the keys. */
+template <typename K, typename V>
+std::unordered_map<K, Variant> make_map(std::unordered_map<K, V> const &v) {
+  std::unordered_map<K, Variant> wrapped;
+  for (auto const &kv : v)
+    wrapped.insert({kv.first, Variant(kv.second)});
+  return wrapped;
+}
+
 /**
  * @brief Get a value from a VariantMap by name, or throw
  *        if it does not exist or is not convertible to
diff --git a/src/script_interface/initialize.cpp b/src/script_interface/initialize.cpp
index 88a7048bc4b..f971b29ae12 100644
--- a/src/script_interface/initialize.cpp
+++ b/src/script_interface/initialize.cpp
@@ -29,6 +29,7 @@
 #include "h5md/initialize.hpp"
 #endif
 #include "ComFixed.hpp"
+#include "CylindricalTransformationParameters.hpp"
 #include "accumulators/initialize.hpp"
 #include "collision_detection/initialize.hpp"
 #include "lbboundaries/initialize.hpp"
@@ -53,6 +54,8 @@ void initialize(Utils::Factory<ObjectHandle> *f) {
   CollisionDetection::initialize(f);
 
   f->register_new<ComFixed>("ComFixed");
+  f->register_new<CylindricalTransformationParameters>(
+      "CylindricalTransformationParameters");
 }
 
 } /* namespace ScriptInterface */
diff --git a/src/script_interface/observables/CylindricalLBProfileObservable.hpp b/src/script_interface/observables/CylindricalLBProfileObservable.hpp
index f0f12e3e639..b0eafa3942a 100644
--- a/src/script_interface/observables/CylindricalLBProfileObservable.hpp
+++ b/src/script_interface/observables/CylindricalLBProfileObservable.hpp
@@ -28,6 +28,8 @@
 #include "core/observables/CylindricalLBProfileObservable.hpp"
 #include "script_interface/get_value.hpp"
 
+#include "script_interface/CylindricalTransformationParameters.hpp"
+
 #include <boost/range/algorithm.hpp>
 
 #include <cstddef>
@@ -53,18 +55,7 @@ class CylindricalLBProfileObservable
   using Base::Base;
   CylindricalLBProfileObservable() {
     this->add_parameters({
-        {"center",
-         [this](const Variant &v) {
-           cylindrical_profile_observable()->center =
-               get_value<::Utils::Vector3d>(v);
-         },
-         [this]() { return cylindrical_profile_observable()->center; }},
-        {"axis",
-         [this](const Variant &v) {
-           cylindrical_profile_observable()->axis =
-               get_value<Utils::Vector3d>(v);
-         },
-         [this]() { return cylindrical_profile_observable()->axis; }},
+        {"transform_params", m_transform_params},
         {"n_r_bins",
          [this](const Variant &v) {
            cylindrical_profile_observable()->n_bins[0] =
@@ -149,13 +140,21 @@ class CylindricalLBProfileObservable
   }
 
   void do_construct(VariantMap const &params) override {
-    m_observable =
-        make_shared_from_args<CoreCylLBObs, Utils::Vector3d, Utils::Vector3d,
-                              int, int, int, double, double, double, double,
-                              double, double, double>(
-            params, "center", "axis", "n_r_bins", "n_phi_bins", "n_z_bins",
-            "min_r", "max_r", "min_phi", "max_phi", "min_z", "max_z",
-            "sampling_density");
+    set_from_args(m_transform_params, params, "transform_params");
+
+    if (m_transform_params)
+      m_observable = std::make_shared<CoreCylLBObs>(
+          m_transform_params->cyl_transform_params(),
+          get_value_or<int>(params, "n_r_bins", 1),
+          get_value_or<int>(params, "n_phi_bins", 1),
+          get_value_or<int>(params, "n_z_bins", 1),
+          get_value_or<double>(params, "min_r", 0.),
+          get_value<double>(params, "max_r"),
+          get_value_or<double>(params, "min_phi", -Utils::pi()),
+          get_value_or<double>(params, "max_phi", Utils::pi()),
+          get_value<double>(params, "min_z"),
+          get_value<double>(params, "max_z"),
+          get_value<double>(params, "sampling_density"));
   }
 
   Variant do_call_method(std::string const &method,
@@ -180,6 +179,7 @@ class CylindricalLBProfileObservable
 
 private:
   std::shared_ptr<CoreCylLBObs> m_observable;
+  std::shared_ptr<CylindricalTransformationParameters> m_transform_params;
 };
 
 } /* namespace Observables */
diff --git a/src/script_interface/observables/CylindricalPidProfileObservable.hpp b/src/script_interface/observables/CylindricalPidProfileObservable.hpp
index 9d22325af17..d541880b367 100644
--- a/src/script_interface/observables/CylindricalPidProfileObservable.hpp
+++ b/src/script_interface/observables/CylindricalPidProfileObservable.hpp
@@ -27,11 +27,14 @@
 #include "Observable.hpp"
 #include "core/observables/CylindricalPidProfileObservable.hpp"
 
-#include <boost/range/algorithm.hpp>
+#include <script_interface/CylindricalTransformationParameters.hpp>
+#include <utils/constants.hpp>
 
+#include <boost/range/algorithm.hpp>
 #include <cstddef>
 #include <iterator>
 #include <memory>
+
 #include <type_traits>
 #include <vector>
 
@@ -58,18 +61,7 @@ class CylindricalPidProfileObservable
                get_value<std::vector<int>>(v);
          },
          [this]() { return cylindrical_pid_profile_observable()->ids(); }},
-        {"center",
-         [this](const Variant &v) {
-           cylindrical_pid_profile_observable()->center =
-               get_value<::Utils::Vector3d>(v);
-         },
-         [this]() { return cylindrical_pid_profile_observable()->center; }},
-        {"axis",
-         [this](const Variant &v) {
-           cylindrical_pid_profile_observable()->axis =
-               get_value<Utils::Vector3d>(v);
-         },
-         [this]() { return cylindrical_pid_profile_observable()->axis; }},
+        {"transform_params", m_transform_params},
         {"n_r_bins",
          [this](const Variant &v) {
            cylindrical_pid_profile_observable()->n_bins[0] =
@@ -149,13 +141,21 @@ class CylindricalPidProfileObservable
   };
 
   void do_construct(VariantMap const &params) override {
-    m_observable =
-        make_shared_from_args<CoreObs, std::vector<int>, Utils::Vector3d,
-                              Utils::Vector3d, int, int, int, double, double,
-                              double, double, double, double>(
-            params, "ids", "center", "axis", "n_r_bins", "n_phi_bins",
-            "n_z_bins", "min_r", "max_r", "min_phi", "max_phi", "min_z",
-            "max_z");
+    set_from_args(m_transform_params, params, "transform_params");
+
+    if (m_transform_params)
+      m_observable = std::make_shared<CoreObs>(
+          get_value<std::vector<int>>(params, "ids"),
+          m_transform_params->cyl_transform_params(),
+          get_value_or<int>(params, "n_r_bins", 1),
+          get_value_or<int>(params, "n_phi_bins", 1),
+          get_value_or<int>(params, "n_z_bins", 1),
+          get_value_or<double>(params, "min_r", 0.),
+          get_value<double>(params, "max_r"),
+          get_value_or<double>(params, "min_phi", -Utils::pi()),
+          get_value_or<double>(params, "max_phi", Utils::pi()),
+          get_value<double>(params, "min_z"),
+          get_value<double>(params, "max_z"));
   }
 
   Variant do_call_method(std::string const &method,
@@ -180,6 +180,7 @@ class CylindricalPidProfileObservable
 
 private:
   std::shared_ptr<CoreObs> m_observable;
+  std::shared_ptr<CylindricalTransformationParameters> m_transform_params;
 };
 
 } /* namespace Observables */
diff --git a/src/script_interface/packed_variant.hpp b/src/script_interface/packed_variant.hpp
index 677e098c4c3..f07ee280a69 100644
--- a/src/script_interface/packed_variant.hpp
+++ b/src/script_interface/packed_variant.hpp
@@ -54,7 +54,8 @@ inline ObjectId object_id(const ObjectHandle *p) {
 using PackedVariant = boost::make_recursive_variant<
     None, bool, int, double, std::string, std::vector<int>, std::vector<double>,
     ObjectId, std::vector<boost::recursive_variant_>, Utils::Vector2d,
-    Utils::Vector3d, Utils::Vector4d>::type;
+    Utils::Vector3d, Utils::Vector4d,
+    std::unordered_map<int, boost::recursive_variant_>>::type;
 
 using PackedMap = std::vector<std::pair<std::string, PackedVariant>>;
 
@@ -84,6 +85,17 @@ struct PackVisitor : boost::static_visitor<PackedVariant> {
     return ret;
   }
 
+  /* For the map, we recurse into each element. */
+  auto operator()(const std::unordered_map<int, Variant> &map) const {
+    std::unordered_map<int, PackedVariant> ret{};
+
+    for (auto const &it : map) {
+      ret.insert({it.first, boost::apply_visitor(*this, it.second)});
+    }
+
+    return ret;
+  }
+
   /* For object references we store the object reference, and
    * replace it by just an id. */
   PackedVariant operator()(const ObjectRef &so_ptr) const {
@@ -121,6 +133,17 @@ struct UnpackVisitor : boost::static_visitor<Variant> {
     return ret;
   }
 
+  /* For the map, we recurse into each element. */
+  auto operator()(const std::unordered_map<int, PackedVariant> &map) const {
+    std::unordered_map<int, Variant> ret{};
+
+    for (auto const &it : map) {
+      ret.insert({it.first, boost::apply_visitor(*this, it.second)});
+    }
+
+    return ret;
+  }
+
   /* Regular value are just verbatim copied into the result. */
   template <class T> Variant operator()(T &&val) const {
     return std::forward<T>(val);
diff --git a/src/script_interface/shapes/Union.hpp b/src/script_interface/shapes/Union.hpp
index eaa1aff3949..8d689379849 100644
--- a/src/script_interface/shapes/Union.hpp
+++ b/src/script_interface/shapes/Union.hpp
@@ -57,8 +57,8 @@ class Union : public Shape {
     } else if (name == "clear") {
       for (auto &s : m_shapes) {
         m_core_shape->remove(s->shape());
-        m_shapes.clear();
       }
+      m_shapes.clear();
     } else if (name == "size") {
       return static_cast<int>(m_shapes.size());
     } else if (name == "empty") {
diff --git a/src/shapes/include/shapes/HollowConicalFrustum.hpp b/src/shapes/include/shapes/HollowConicalFrustum.hpp
index ac846e0e545..1deb0d98f2e 100644
--- a/src/shapes/include/shapes/HollowConicalFrustum.hpp
+++ b/src/shapes/include/shapes/HollowConicalFrustum.hpp
@@ -22,6 +22,9 @@
 
 #include "Shape.hpp"
 #include <utils/Vector.hpp>
+#include <utils/math/orthonormal_vec.hpp>
+
+#include <list>
 
 namespace Shapes {
 
@@ -48,15 +51,20 @@ class HollowConicalFrustum : public Shape {
   HollowConicalFrustum()
       : m_r1(0.0), m_r2(0.0), m_length(0.0), m_thickness(0.0),
         m_direction(1), m_center{Utils::Vector3d{}}, m_axis{Utils::Vector3d{
-                                                         0, 0, 1}} {}
-
-  void set_r1(double radius) { m_r1 = radius; }
-  void set_r2(double radius) { m_r2 = radius; }
-  void set_length(double length) { m_length = length; }
-  void set_thickness(double thickness) { m_thickness = thickness; }
-  void set_direction(int dir) { m_direction = dir; }
-  void set_axis(Utils::Vector3d const &axis) { m_axis = axis; }
+                                                         0., 0., 1.}},
+        m_orientation{Utils::Vector3d{1., 0., 0.}} {}
 
+  void set_r1(double const radius) { m_r1 = radius; }
+  void set_r2(double const radius) { m_r2 = radius; }
+  void set_length(double const length) { m_length = length; }
+  void set_thickness(double const thickness) { m_thickness = thickness; }
+  void set_direction(int const dir) { m_direction = dir; }
+  void set_axis(Utils::Vector3d const &axis) {
+    m_axis = axis;
+    // Even though the HCF is cylinder-symmetric, it needs a well defined phi=0
+    // orientation for the coordinate transformation.
+    m_orientation = Utils::calc_orthonormal_vector(axis);
+  }
   void set_center(Utils::Vector3d const &center) { m_center = center; }
 
   /// Get radius 1 perpendicular to axis.
@@ -92,6 +100,7 @@ class HollowConicalFrustum : public Shape {
   int m_direction;
   Utils::Vector3d m_center;
   Utils::Vector3d m_axis;
+  Utils::Vector3d m_orientation;
 };
 } // namespace Shapes
 
diff --git a/src/shapes/src/HollowConicalFrustum.cpp b/src/shapes/src/HollowConicalFrustum.cpp
index 3db5fb59fc0..ea384252cc4 100644
--- a/src/shapes/src/HollowConicalFrustum.cpp
+++ b/src/shapes/src/HollowConicalFrustum.cpp
@@ -33,8 +33,9 @@ void HollowConicalFrustum::calculate_dist(const Utils::Vector3d &pos,
                                           Utils::Vector3d &vec) const {
   // transform given position to cylindrical coordinates in the reference frame
   // of the cone
-  auto const pos_cyl =
-      Utils::transform_coordinate_cartesian_to_cylinder(pos - m_center, m_axis);
+  auto const v = pos - m_center;
+  auto const pos_cyl = Utils::transform_coordinate_cartesian_to_cylinder(
+      v, m_axis, m_orientation);
   // clang-format off
   /*
    * the following implementation is based on:
@@ -61,7 +62,7 @@ void HollowConicalFrustum::calculate_dist(const Utils::Vector3d &pos,
   // Transform back to cartesian coordinates.
   auto const pos_intersection =
       Utils::transform_coordinate_cylinder_to_cartesian(
-          {r_intersection, pos_cyl[1], z_intersection}, m_axis) +
+          {r_intersection, pos_cyl[1], z_intersection}, m_axis, m_orientation) +
       m_center;
 
   auto const u = (pos - pos_intersection).normalize();
diff --git a/src/shapes/unit_tests/CMakeLists.txt b/src/shapes/unit_tests/CMakeLists.txt
index 5a92272c020..abe3d2805fd 100644
--- a/src/shapes/unit_tests/CMakeLists.txt
+++ b/src/shapes/unit_tests/CMakeLists.txt
@@ -6,3 +6,5 @@ unit_test(NAME Union_test SRC Union_test.cpp DEPENDS EspressoShapes
           EspressoUtils)
 unit_test(NAME Ellipsoid_test SRC Ellipsoid_test.cpp DEPENDS EspressoShapes
           EspressoUtils)
+unit_test(NAME NoWhere_test SRC NoWhere_test.cpp DEPENDS EspressoShapes
+          EspressoUtils)
diff --git a/src/shapes/unit_tests/NoWhere_test.cpp b/src/shapes/unit_tests/NoWhere_test.cpp
new file mode 100644
index 00000000000..057b0a1f2b6
--- /dev/null
+++ b/src/shapes/unit_tests/NoWhere_test.cpp
@@ -0,0 +1,64 @@
+/*
+ * Copyright (C) 2010-2021 The ESPResSo project
+ * Copyright (C) 2002,2003,2004,2005,2006,2007,2008,2009,2010
+ *   Max-Planck-Institute for Polymer Research, Theory Group
+ *
+ * This file is part of ESPResSo.
+ *
+ * ESPResSo is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * ESPResSo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#define BOOST_TEST_MODULE NoWhere test
+#define BOOST_TEST_DYN_LINK
+#include <boost/test/unit_test.hpp>
+
+#include <shapes/NoWhere.hpp>
+#include <shapes/Shape.hpp>
+
+#include <utils/Vector.hpp>
+
+#include <limits>
+
+/** Check that @p s reports an infinite distance at every probed position. */
+bool dist_is_always_inf(const Shapes::Shape &s) {
+  constexpr auto infinity = std::numeric_limits<double>::infinity();
+  // Arbitrary probe points.
+  Utils::Vector3d const positions[2] = {
+      {0.0, 1.0, 2.0},
+      {-10.0, 0.1, 5.0},
+  };
+
+  for (auto const &pos : positions) {
+    Utils::Vector3d dist{};
+    // Zero-initialised so that a shape which leaves the output untouched
+    // fails the check deterministically instead of reading garbage.
+    double d = 0.0;
+    s.calculate_dist(pos, d, dist);
+    if (d != infinity) {
+      return false;
+    }
+    for (auto xyz : dist) {
+      if (xyz != infinity) {
+        return false;
+      }
+    }
+  }
+  return true;
+}
+
+BOOST_AUTO_TEST_CASE(dist_function) {
+  Shapes::NoWhere nw;
+
+  BOOST_CHECK(dist_is_always_inf(nw));
+}
diff --git a/src/utils/include/utils/math/coordinate_transformation.hpp b/src/utils/include/utils/math/coordinate_transformation.hpp
index db31193b062..07f84265f8b 100644
--- a/src/utils/include/utils/math/coordinate_transformation.hpp
+++ b/src/utils/include/utils/math/coordinate_transformation.hpp
@@ -19,64 +19,141 @@
 #ifndef UTILS_COORDINATE_TRANSFORMATION_HPP
 #define UTILS_COORDINATE_TRANSFORMATION_HPP
 
+/**
+ * @file
+ * Convert coordinates between the Cartesian and the cylindrical systems.
+ * The transformation functions are provided with three overloads:
+ * - one function for the trivial Cartesian <-> cylindrical transformation
+ * - one function to transform from/to a cylindrical system with custom axis
+ *   (extra @p axis argument, keep in mind the angle phi is under-defined)
+ * - one function to transform from/to an oriented cylindrical system with
+ *   custom axis (extra @p orientation argument, the angle phi is well-defined)
+ */
+
 #include "utils/Vector.hpp"
 #include "utils/constants.hpp"
 #include "utils/math/vec_rotate.hpp"
+#include "utils/matrix.hpp"
 #include "utils/quaternion.hpp"
 
+#include <cassert>
+#include <cmath>
+
 namespace Utils {
 
-/** \brief Transform the given 3D position to cylinder coordinates with
- * longitudinal axis aligned with axis parameter.
+/**
+ * @brief Change of basis for the vector @p v.
+ * By default @p v is mapped to its components in the normalized basis
+ * (@p b1, @p b2, @p b3); with @p reverse the opposite mapping is applied.
  */
-inline Vector3d
-transform_coordinate_cartesian_to_cylinder(const Vector3d &pos,
-                                           const Vector3d &axis) {
-  static auto const z_axis = Vector3d{{0, 0, 1}};
-  double theta;
-  Vector3d rotation_axis;
-  auto r = [](auto const &pos) {
-    return std::sqrt(pos[0] * pos[0] + pos[1] * pos[1]);
-  };
-  auto phi = [](auto const &pos) { return std::atan2(pos[1], pos[0]); };
-  if (axis != z_axis) {
-    std::tie(theta, rotation_axis) = rotation_params(axis, z_axis);
-    auto const rotated_pos = vec_rotate(rotation_axis, theta, pos);
-    return {r(rotated_pos), phi(rotated_pos), rotated_pos[2]};
+inline Vector3d basis_change(Vector3d const &b1, Vector3d const &b2,
+                             Vector3d const &b3, Vector3d const &v,
+                             bool reverse = false) {
+  auto const n1 = b1.normalized();
+  auto const n2 = b2.normalized();
+  auto const n3 = b3.normalized();
+  auto const basis = Matrix<double, 3, 3>{{n1[0], n2[0], n3[0]},
+                                          {n1[1], n2[1], n3[1]},
+                                          {n1[2], n2[2], n3[2]}};
+  if (reverse) {
+    return basis * v;
   }
-  return {r(pos), phi(pos), pos[2]};
+  return basis.inversed() * v;
 }
 
 /**
- * @brief Coordinate transformation from cylinder to cartesian coordinates.
+ * @brief Convert a position from Cartesian to cylindrical coordinates.
+ * Both coordinate systems share the same origin and the same z-axis; the
+ * direction @f$ \phi = 0 @f$ is the x-axis of the Cartesian system.
+ * @param pos    %Vector to transform
+ * @return The cylindrical coordinates @f$ (r, \phi, z) @f$
  */
 inline Vector3d
-transform_coordinate_cylinder_to_cartesian(Vector3d const &pos,
-                                           Vector3d const &axis) {
-  Vector3d const transformed{
-      {pos[0] * std::cos(pos[1]), pos[0] * std::sin(pos[1]), pos[2]}};
-  static auto const z_axis = Vector3d{{0, 0, 1}};
-  if (axis == z_axis)
-    return transformed;
-  double theta;
-  Vector3d rotation_axis;
-  std::tie(theta, rotation_axis) = rotation_params(z_axis, axis);
-  auto const rotated_pos = vec_rotate(rotation_axis, theta, transformed);
-  return rotated_pos;
+transform_coordinate_cartesian_to_cylinder(Vector3d const &pos) {
+  auto const &x = pos[0];
+  auto const &y = pos[1];
+  return {std::sqrt(x * x + y * y), std::atan2(y, x), pos[2]};
+}
+
+/**
+ * @brief Convert a position from Cartesian coordinates to a cylindrical
+ * coordinate system with custom axis. Both systems share the same origin.
+ *
+ * The cylindrical z-axis points along @p axis. Since the axis alone does
+ * not fix the angle @f$ \phi @f$, the direction @p orientation is used
+ * as the reference for which @f$ \phi = 0 @f$. It must be orthogonal to
+ * @p axis; this is only verified by an assertion.
+ *
+ * @param pos    %Vector to transform
+ * @param axis   Longitudinal axis of the cylindrical coordinates
+ * @param orientation   Reference direction (in untransformed coordinates)
+ *                      for which @f$ \phi = 0 @f$
+ */
+inline Vector3d transform_coordinate_cartesian_to_cylinder(
+    Vector3d const &pos, Vector3d const &axis, Vector3d const &orientation) {
+  // the reference direction must be orthogonal to the cylinder axis
+  assert(std::abs(axis * orientation) <
+         5 * std::numeric_limits<double>::epsilon());
+  // second basis vector, orthogonal to both axis and orientation
+  auto const e_y = vector_product(axis, orientation);
+  return transform_coordinate_cartesian_to_cylinder(
+      basis_change(orientation, e_y, axis, pos));
 }
 
-/** \brief Transform the given 3D vector to cylinder coordinates with
- * symmetry axis aligned with axis parameter.
+/**
+ * @brief Convert a position from cylindrical to Cartesian coordinates.
+ * Both coordinate systems share the same origin and the same z-axis; the
+ * direction @f$ \phi = 0 @f$ is the x-axis of the Cartesian system.
+ * @param pos    %Vector to transform, as @f$ (r, \phi, z) @f$
+ * @return The Cartesian coordinates of @p pos
+ */
+inline Vector3d
+transform_coordinate_cylinder_to_cartesian(Vector3d const &pos) {
+  auto const radius = pos[0];
+  auto const x = radius * std::cos(pos[1]);
+  auto const y = radius * std::sin(pos[1]);
+  return {x, y, pos[2]};
+}
+
+/**
+ * @brief Convert a position from a cylindrical coordinate system with
+ * custom axis to Cartesian coordinates. Both systems share the same origin.
+ *
+ * The cylindrical z-axis points along @p axis. Since the axis alone does
+ * not fix the angle @f$ \phi @f$, the direction @p orientation is used
+ * as the reference for which @f$ \phi = 0 @f$. It must be orthogonal to
+ * @p axis; this is only verified by an assertion.
+ *
+ * @param pos    %Vector to transform, as @f$ (r, \phi, z) @f$
+ * @param axis   Longitudinal axis of the cylindrical coordinates
+ * @param orientation   Reference direction (in Cartesian coordinates)
+ *                      for which @f$ \phi = 0 @f$
+ * @return The Cartesian coordinates of @p pos
+ */
+inline Vector3d transform_coordinate_cylinder_to_cartesian(
+    Vector3d const &pos, Vector3d const &axis, Vector3d const &orientation) {
+  // the reference direction must be orthogonal to the cylinder axis
+  assert(std::abs(axis * orientation) <
+         5 * std::numeric_limits<double>::epsilon());
+  auto const local = transform_coordinate_cylinder_to_cartesian(pos);
+  auto const e_y = vector_product(axis, orientation);
+  return basis_change(orientation, e_y, axis, local, true);
+}
+
+/**
+ * @brief Vector transformation from Cartesian to cylindrical coordinates.
+ * @param vec    %Vector to transform
+ * @param axis   Longitudinal axis of the cylindrical coordinates
+ * @param pos    Origin of the vector
  */
 inline Vector3d transform_vector_cartesian_to_cylinder(Vector3d const &vec,
                                                        Vector3d const &axis,
                                                        Vector3d const &pos) {
   static auto const z_axis = Vector3d{{0, 0, 1}};
-  double theta;
-  Vector3d rotation_axis;
-  std::tie(theta, rotation_axis) = rotation_params(axis, z_axis);
-  auto const rotated_pos = vec_rotate(rotation_axis, theta, pos);
-  auto const rotated_vec = vec_rotate(rotation_axis, theta, vec);
+  auto const angle = angle_between(axis, z_axis);
+  auto const rotation_axis = Utils::vector_product(axis, z_axis).normalize();
+  auto const rotated_pos = vec_rotate(rotation_axis, angle, pos);
+  auto const rotated_vec = vec_rotate(rotation_axis, angle, vec);
   auto const r = std::sqrt(rotated_pos[0] * rotated_pos[0] +
                            rotated_pos[1] * rotated_pos[1]);
   // v_r = (x * v_x + y * v_y) / sqrt(x^2 + y^2)
diff --git a/src/utils/include/utils/math/cylindrical_transformation_parameters.hpp b/src/utils/include/utils/math/cylindrical_transformation_parameters.hpp
new file mode 100644
index 00000000000..20f18d78a77
--- /dev/null
+++ b/src/utils/include/utils/math/cylindrical_transformation_parameters.hpp
@@ -0,0 +1,80 @@
+/*
+ * Copyright (C) 2010-2019 The ESPResSo project
+ *
+ * This file is part of ESPResSo.
+ *
+ * ESPResSo is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * ESPResSo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+#ifndef ESPRESSO_CYLINDER_TRANSFORMATION_PARAMETERS_HPP
+#define ESPRESSO_CYLINDER_TRANSFORMATION_PARAMETERS_HPP
+
+#include <limits>
+#include <stdexcept>
+#include <string>
+#include <utils/Vector.hpp>
+#include <utils/math/abs.hpp>
+namespace Utils {
+
+/**
+ * @brief Validated, read-only parameters of a cylindrical coordinate
+ * transformation: the origin (@c center), the longitudinal "z" axis
+ * (@c axis) and the direction along which phi = 0 (@c orientation).
+ * Constructing from explicit values throws @c std::runtime_error unless
+ * axis and orientation are normalized and orthogonal to each other.
+ */
+class CylindricalTransformationParameters {
+public:
+  CylindricalTransformationParameters() = default;
+  CylindricalTransformationParameters(Utils::Vector3d const &center,
+                                      Utils::Vector3d const &axis,
+                                      Utils::Vector3d const &orientation)
+      : m_center(center), m_axis(axis), m_orientation(orientation) {
+    validate();
+  }
+
+  Utils::Vector3d center() const { return m_center; }
+  Utils::Vector3d axis() const { return m_axis; }
+  Utils::Vector3d orientation() const { return m_orientation; }
+
+private:
+  void validate() const {
+    auto constexpr tol = 10 * std::numeric_limits<double>::epsilon();
+    auto const scalar_product = m_orientation * m_axis;
+    if (Utils::abs(scalar_product) > tol) {
+      throw std::runtime_error(
+          "CylindricalTransformationParameters: Axis and orientation must be "
+          "orthogonal. Scalar product is " + std::to_string(scalar_product));
+    }
+    auto const axis_norm = m_axis.norm();
+    if (Utils::abs(axis_norm - 1) > tol) {
+      throw std::runtime_error("CylindricalTransformationParameters: Axis must "
+                               "be normalized. Norm is " +
+                               std::to_string(axis_norm));
+    }
+    auto const orientation_norm = m_orientation.norm();
+    if (Utils::abs(orientation_norm - 1) > tol) {
+      throw std::runtime_error("CylindricalTransformationParameters: "
+                               "orientation must be normalized. Norm is " +
+                               std::to_string(orientation_norm));
+    }
+  }
+
+  Utils::Vector3d const m_center{};
+  Utils::Vector3d const m_axis{0, 0, 1};
+  Utils::Vector3d const m_orientation{1, 0, 0};
+};
+
+} // namespace Utils
+
+#endif // ESPRESSO_CYLINDER_TRANSFORMATION_PARAMETERS_HPP
diff --git a/src/utils/include/utils/math/orthonormal_vec.hpp b/src/utils/include/utils/math/orthonormal_vec.hpp
new file mode 100644
index 00000000000..57f2637fab4
--- /dev/null
+++ b/src/utils/include/utils/math/orthonormal_vec.hpp
@@ -0,0 +1,54 @@
+/*
+ * Copyright (C) 2010-2019 The ESPResSo project
+ *
+ * This file is part of ESPResSo.
+ *
+ * ESPResSo is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * ESPResSo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+#ifndef ESPRESSO_ORTHONORMAL_VEC_HPP
+#define ESPRESSO_ORTHONORMAL_VEC_HPP
+
+#include "utils/Vector.hpp"
+#include "utils/constants.hpp"
+
+namespace Utils {
+/**
+ * @brief Return a unit vector that is orthogonal to @p vec.
+ * @param vec  Non-zero vector with at least two components.
+ */
+template <typename T, std::size_t N>
+Vector<T, N> calc_orthonormal_vector(Vector<T, N> const &vec) {
+  /* Gram-Schmidt orthogonalization of the trial vectors e_x, e_y. One of them
+   * always has a residual of norm ~ 1/sqrt(2) or more; keep the longest one
+   * so that rounding at the threshold cannot leave the result unset. */
+  auto best = Vector<T, N>::broadcast(0);
+  T best_norm = 0;
+  for (std::size_t i = 0; i < 2; ++i) {
+    auto trial = Vector<T, N>::broadcast(0);
+    trial[i] = 1;
+    Vector<T, N> const residual = trial - (trial * vec) / vec.norm2() * vec;
+    auto const norm = residual.norm();
+    if (norm > best_norm) {
+      best = residual / norm;
+      best_norm = norm;
+    }
+    if (norm >= 1. / Utils::sqrt_2())
+      break;
+  }
+  return best;
+}
+
+} // namespace Utils
+
+#endif // ESPRESSO_ORTHONORMAL_VEC_HPP
\ No newline at end of file
diff --git a/src/utils/include/utils/math/vec_rotate.hpp b/src/utils/include/utils/math/vec_rotate.hpp
index 32c1cc23d8d..029852ddd56 100644
--- a/src/utils/include/utils/math/vec_rotate.hpp
+++ b/src/utils/include/utils/math/vec_rotate.hpp
@@ -47,21 +47,10 @@ inline Vector3d vec_rotate(const Vector3d &axis, double angle,
 }
 
 /**
- * @brief Determine rotation angle and axis for rotating vec onto target_vec.
- * @param vec Vector to be rotated
- * @param target_vec Target vector
- * @return rotation angle and rotation axis
+ * @brief Determine the angle between two vectors.
  */
-inline std::tuple<double, Vector3d>
-rotation_params(Vector3d const &vec, Vector3d const &target_vec) {
-  if (vec.normalized() != target_vec.normalized()) {
-    auto const theta =
-        std::acos(vec * target_vec / (vec.norm() * target_vec.norm()));
-    auto const rotation_axis =
-        Utils::vector_product(vec, target_vec).normalize();
-    return std::make_tuple(theta, rotation_axis);
-  }
-  return std::make_tuple(0.0, Vector3d{});
+inline double angle_between(Vector3d const &v1, Vector3d const &v2) {
+  return std::atan2(vector_product(v1, v2).norm(), v1 * v2); // no acos NaN
+}
 
 } // namespace Utils
diff --git a/src/utils/include/utils/memory.hpp b/src/utils/include/utils/memory.hpp
deleted file mode 100644
index 200c456a18b..00000000000
--- a/src/utils/include/utils/memory.hpp
+++ /dev/null
@@ -1,75 +0,0 @@
-/*
- * Copyright (C) 2010-2019 The ESPResSo project
- *
- * This file is part of ESPResSo.
- *
- * ESPResSo is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * ESPResSo is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-#ifndef CORE_UTILS_MEMORY_HPP
-#define CORE_UTILS_MEMORY_HPP
-
-#include <cstddef>
-#include <cstdlib>
-#include <new>
-#include <stdexcept>
-
-namespace Utils {
-
-/*************************************************************/
-/** \name Dynamic memory allocation.                         */
-/*************************************************************/
-/**@{*/
-
-/* to enable us to make sure that freed pointers are invalidated, we normally
-   try to use realloc.
-   Unfortunately allocating zero bytes (which should be avoided) actually
-   allocates 16 bytes, and
-   reallocating to 0 also. To avoid this, we use our own malloc and realloc
-   procedures. */
-
-/** used instead of realloc.
-    Makes sure that resizing to zero FREEs pointer */
-template <typename T> inline T *realloc(T *old, size_t size) {
-  if (size == 0) {
-    ::free(static_cast<void *>(old));
-    return nullptr;
-  }
-
-  auto *p = static_cast<T *>(::realloc(static_cast<void *>(old), size));
-
-  if (p == nullptr) {
-    throw std::bad_alloc{};
-  }
-  return p;
-}
-
-/** used instead of malloc.
-    Makes sure that a zero size allocation returns a nullptr pointer */
-inline void *malloc(size_t size) {
-  if (size == 0) {
-    return nullptr;
-  }
-
-  void *p = ::malloc(size);
-
-  if (p == nullptr) {
-    throw std::bad_alloc{};
-  }
-  return p;
-}
-
-/**@}*/
-} // namespace Utils
-
-#endif
diff --git a/src/utils/include/utils/mpi/all_gatherv.hpp b/src/utils/include/utils/mpi/all_gatherv.hpp
deleted file mode 100644
index 2d746fc779d..00000000000
--- a/src/utils/include/utils/mpi/all_gatherv.hpp
+++ /dev/null
@@ -1,100 +0,0 @@
-/*
- * Copyright (C) 2010-2019 The ESPResSo project
- *
- * This file is part of ESPResSo.
- *
- * ESPResSo is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * ESPResSo is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-#ifndef UTILS_MPI_ALL_GATHERV_HPP
-#define UTILS_MPI_ALL_GATHERV_HPP
-
-#include <boost/mpi/communicator.hpp>
-#include <boost/mpi/datatype.hpp>
-#include <boost/mpi/exception.hpp>
-#include <boost/mpi/nonblocking.hpp>
-#include <vector>
-
-namespace Utils {
-namespace Mpi {
-
-namespace detail {
-template <typename T>
-void all_gatherv_impl(const boost::mpi::communicator &comm, const T *in_values,
-                      int in_size, T *out_values, const int *sizes,
-                      const int *displs, boost::mpl::true_) {
-  MPI_Datatype type = boost::mpi::get_mpi_datatype<T>();
-
-  /* in-place ? */
-  if (in_values == out_values) {
-    BOOST_MPI_CHECK_RESULT(MPI_Allgatherv,
-                           (MPI_IN_PLACE, 0, type, out_values,
-                            const_cast<int *>(sizes), const_cast<int *>(displs),
-                            type, comm));
-  } else {
-    BOOST_MPI_CHECK_RESULT(MPI_Allgatherv,
-                           (const_cast<T *>(in_values), in_size, type,
-                            out_values, const_cast<int *>(sizes),
-                            const_cast<int *>(displs), type, comm));
-  }
-}
-
-template <typename T>
-void all_gatherv_impl(const boost::mpi::communicator &comm, const T *in_values,
-                      int in_size, T *out_values, const int *sizes,
-                      const int *displs, boost::mpl::false_) {
-  auto const n_nodes = comm.size();
-  auto const rank = comm.rank();
-
-  /* not in-place */
-  if (in_values != out_values) {
-    std::copy_n(in_values, in_size, out_values + displs[rank]);
-  }
-
-  std::vector<boost::mpi::request> req;
-  for (int i = 0; i < n_nodes; i++) {
-    if (i != rank) {
-      req.emplace_back(comm.isend(i, 42, out_values + displs[rank], in_size));
-      req.emplace_back(comm.irecv(i, 42, out_values + displs[i], sizes[i]));
-    }
-  }
-
-  boost::mpi::wait_all(req.begin(), req.end());
-}
-} // namespace detail
-
-template <typename T>
-void all_gatherv(const boost::mpi::communicator &comm, const T *in_values,
-                 int in_size, T *out_values, const int *sizes,
-                 const int *displs) {
-  detail::all_gatherv_impl(comm, in_values, in_size, out_values, sizes, displs,
-                           boost::mpi::is_mpi_datatype<T>());
-}
-
-template <typename T>
-void all_gatherv(const boost::mpi::communicator &comm, const T *in_values,
-                 int in_size, T *out_values, const int *sizes) {
-  std::vector<int> displ(comm.size());
-
-  int offset = 0;
-  for (unsigned i = 0; i < displ.size(); i++) {
-    displ[i] = offset;
-    offset += sizes[i];
-  }
-
-  detail::all_gatherv_impl(comm, in_values, in_size, out_values, sizes,
-                           displ.data(), boost::mpi::is_mpi_datatype<T>());
-}
-} // namespace Mpi
-} // namespace Utils
-#endif
diff --git a/src/utils/include/utils/mpi/gather_buffer.hpp b/src/utils/include/utils/mpi/gather_buffer.hpp
index ab827769dc2..61e501856f7 100644
--- a/src/utils/include/utils/mpi/gather_buffer.hpp
+++ b/src/utils/include/utils/mpi/gather_buffer.hpp
@@ -33,65 +33,6 @@
 
 namespace Utils {
 namespace Mpi {
-namespace detail {
-template <typename T>
-void relocate_data(T *buffer, std::vector<int> const &sizes,
-                   std::vector<int> const &displ, int root) {
-  if (sizes[root] && displ[root]) {
-    for (int i = sizes[root] - 1; i >= 0; --i) {
-      buffer[i + displ[root]] = buffer[i];
-    }
-  }
-}
-} // namespace detail
-
-/**
- * @brief Gather buffer with different size on each node.
- *
- * Gathers buffers with different lengths from all nodes to root.
- * The buffer is assumed to be large enough to hold the data from
- * all the nodes and is owned by the caller. On the @p root node,
- * the first @p n_elem elements of @p buffer are moved, if need
- * be. On the other nodes, @p buffer is not touched.
- *
- * This encapsulates a common combination of <tt>MPI_Gather()</tt>
- * and <tt>MPI_{Send,Recv}()</tt>.
- *
- * @param buffer On the master the target buffer that has to be
- *        large enough to hold all elements and has the local
- *        part in the beginning. On the slaves the local buffer.
- * @param n_elem The number of elements in the local buffer.
- * @param comm The MPI communicator.
- * @param root The rank where the data should be gathered.
- * @return On rank root, the total number of elements in the buffer,
- *         on the other ranks 0.
- */
-template <typename T>
-int gather_buffer(T *buffer, int n_elem, boost::mpi::communicator comm,
-                  int root = 0) {
-  if (comm.rank() == root) {
-    static std::vector<int> sizes;
-    static std::vector<int> displ;
-
-    auto const total_size =
-        detail::size_and_offset<T>(sizes, displ, n_elem, comm, root);
-
-    /* Move the original data to its new location */
-    detail::relocate_data(buffer, sizes, displ, root);
-
-    /* Gather data */
-    gatherv(comm, buffer, 0, buffer, sizes.data(), displ.data(), root);
-
-    return total_size;
-  }
-  /* Send local size */
-  detail::size_and_offset(n_elem, comm, root);
-  /* Send data */
-  gatherv(comm, buffer, n_elem, static_cast<T *>(nullptr), nullptr, nullptr,
-          root);
-
-  return 0;
-}
 
 /**
  * @brief Gather buffer with different size on each node.
@@ -125,7 +66,11 @@ void gather_buffer(std::vector<T, Allocator> &buffer,
     buffer.resize(tot_size);
 
     /* Move the original data to its new location */
-    detail::relocate_data(buffer.data(), sizes, displ, root);
+    if (sizes[root] && displ[root]) {
+      for (int i = sizes[root] - 1; i >= 0; --i) {
+        buffer[i + displ[root]] = buffer[i];
+      }
+    }
 
     /* Gather data */
     gatherv(comm, buffer.data(), buffer.size(), buffer.data(), sizes.data(),
diff --git a/src/utils/tests/CMakeLists.txt b/src/utils/tests/CMakeLists.txt
index 3b7c549d0f3..a230d4b84ba 100644
--- a/src/utils/tests/CMakeLists.txt
+++ b/src/utils/tests/CMakeLists.txt
@@ -74,9 +74,9 @@ unit_test(NAME all_compare_test SRC all_compare_test.cpp DEPENDS EspressoUtils
           Boost::mpi MPI::MPI_CXX NUM_PROC 3)
 unit_test(NAME gatherv_test SRC gatherv_test.cpp DEPENDS EspressoUtils
           Boost::mpi MPI::MPI_CXX NUM_PROC 3)
-unit_test(NAME all_gatherv_test SRC all_gatherv_test.cpp DEPENDS EspressoUtils
-          Boost::mpi MPI::MPI_CXX)
 unit_test(NAME sendrecv_test SRC sendrecv_test.cpp DEPENDS EspressoUtils
           Boost::mpi MPI::MPI_CXX EspressoUtils NUM_PROC 3)
 unit_test(NAME matrix_test SRC matrix_test.cpp DEPENDS EspressoUtils
           Boost::serialization NUM_PROC 1)
+unit_test(NAME orthonormal_vec_test SRC orthonormal_vec_test.cpp DEPENDS
+          EspressoUtils Boost::serialization NUM_PROC 1)
diff --git a/src/utils/tests/Factory_test.cpp b/src/utils/tests/Factory_test.cpp
index e14355ecd5b..ad461867dae 100644
--- a/src/utils/tests/Factory_test.cpp
+++ b/src/utils/tests/Factory_test.cpp
@@ -45,7 +45,7 @@ struct OtherDerivedTestClass : public TestClass {
 };
 
 /* Check registration of construction functions */
-BOOST_AUTO_TEST_CASE(regiser_class) {
+BOOST_AUTO_TEST_CASE(register_class) {
   Utils::Factory<TestClass> factory;
 
   factory.register_new<OtherDerivedTestClass>("other_derived_class");
@@ -66,7 +66,7 @@ BOOST_AUTO_TEST_CASE(make) {
   BOOST_CHECK(dynamic_cast<DerivedTestClass *>(o.get()) != nullptr);
 }
 
-BOOST_AUTO_TEST_CASE(stable_name_) {
+BOOST_AUTO_TEST_CASE(type_name) {
   const std::string derived_class_name = "derived_test_class";
 
   Utils::Factory<TestClass> factory;
diff --git a/src/utils/tests/all_gatherv_test.cpp b/src/utils/tests/all_gatherv_test.cpp
deleted file mode 100644
index 211f8b76c08..00000000000
--- a/src/utils/tests/all_gatherv_test.cpp
+++ /dev/null
@@ -1,101 +0,0 @@
-/*
- * Copyright (C) 2017-2019 The ESPResSo project
- *
- * This file is part of ESPResSo.
- *
- * ESPResSo is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * ESPResSo is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#define BOOST_TEST_NO_MAIN
-#define BOOST_TEST_MODULE all_gather test
-#define BOOST_TEST_DYN_LINK
-#include <boost/mpi.hpp>
-#include <boost/test/unit_test.hpp>
-
-#include "utils/mpi/all_gatherv.hpp"
-using Utils::Mpi::all_gatherv;
-
-#include <string>
-#include <vector>
-
-namespace mpi = boost::mpi;
-
-BOOST_AUTO_TEST_CASE(mpi_type) {
-  mpi::communicator world;
-  auto const rank = world.rank();
-  auto const size = world.size();
-
-  /* out-of-place */
-  {
-    std::vector<int> out(size, -1);
-    std::vector<int> sizes(size, 1);
-
-    all_gatherv(world, &rank, 1, out.data(), sizes.data());
-
-    for (int i = 0; i < size; i++) {
-      BOOST_CHECK_EQUAL(i, out.at(i));
-    }
-  }
-
-  /* in-place */
-  {
-    std::vector<int> out(size, -1);
-    out[rank] = rank;
-    std::vector<int> sizes(size, 1);
-
-    all_gatherv(world, out.data(), 1, out.data(), sizes.data());
-
-    for (int i = 0; i < size; i++) {
-      BOOST_CHECK_EQUAL(i, out.at(i));
-    }
-  }
-}
-
-BOOST_AUTO_TEST_CASE(non_mpi_type) {
-  mpi::communicator world;
-  auto const rank = world.rank();
-  auto const size = world.size();
-  auto const in = std::to_string(rank);
-
-  /* out-of-place */
-  {
-    std::vector<std::string> out(size);
-    std::vector<int> sizes(size, 1);
-
-    all_gatherv(world, &in, 1, out.data(), sizes.data());
-
-    for (int i = 0; i < size; i++) {
-      BOOST_CHECK_EQUAL(std::to_string(i), out.at(i));
-    }
-  }
-
-  /* in-place */
-  {
-    std::vector<std::string> out(size);
-    out[rank] = in;
-    std::vector<int> sizes(size, 1);
-
-    all_gatherv(world, out.data(), 1, out.data(), sizes.data());
-
-    for (int i = 0; i < size; i++) {
-      BOOST_CHECK_EQUAL(std::to_string(i), out.at(i));
-    }
-  }
-}
-
-int main(int argc, char **argv) {
-  mpi::environment mpi_env(argc, argv);
-
-  return boost::unit_test::unit_test_main(init_unit_test, argc, argv);
-}
diff --git a/src/utils/tests/coordinate_transformation.cpp b/src/utils/tests/coordinate_transformation.cpp
index b5c5d19e2ee..31371967657 100644
--- a/src/utils/tests/coordinate_transformation.cpp
+++ b/src/utils/tests/coordinate_transformation.cpp
@@ -25,81 +25,215 @@
 #include <utils/math/vec_rotate.hpp>
 
 #include <cmath>
+#include <random>
 
 using Utils::Vector3d;
 
 BOOST_AUTO_TEST_CASE(cartesian_to_cylinder_test) {
-  Vector3d const cart_coord{{1.0, 3.3, 2.0}};
-  auto const transformed_x = transform_coordinate_cartesian_to_cylinder(
-      cart_coord, Vector3d{{1, 0, 0}});
-  auto const transformed_y = transform_coordinate_cartesian_to_cylinder(
-      cart_coord, Vector3d{{0, 1, 0}});
-  auto const transformed_z = transform_coordinate_cartesian_to_cylinder(
-      cart_coord, Vector3d{{0, 0, 1}});
-  // For x as the symmetry axis we rotate the cartesian coordinates around the
-  // y-axis by -pi/2.
-  auto const expected_x = transform_coordinate_cartesian_to_cylinder(
-      vec_rotate(Vector3d{{0.0, 1.0, 0.0}}, -Utils::pi() / 2.0, cart_coord),
-      Vector3d{{0, 0, 1}});
-  // For y as the symmetry axis we rotate the cartesian coordinates around the
-  // x-axis by pi/2.
-  auto const expected_y = transform_coordinate_cartesian_to_cylinder(
-      vec_rotate(Vector3d{{1.0, 0.0, 0.0}}, Utils::pi() / 2.0, cart_coord),
-      Vector3d{{0, 0, 1}});
-  auto const expected_z = Vector3d{
-      {std::sqrt(cart_coord[0] * cart_coord[0] + cart_coord[1] * cart_coord[1]),
-       std::atan2(cart_coord[1], cart_coord[0]), cart_coord[2]}};
+  constexpr auto eps = 1e-14;
+  auto const pos = Vector3d{{1.0, 3.3, 2.0}};
+  auto const cyl = transform_coordinate_cartesian_to_cylinder(pos);
+  BOOST_CHECK_SMALL(cyl[0] - std::sqrt(pos[0] * pos[0] + pos[1] * pos[1]), eps);
+  BOOST_CHECK_SMALL(cyl[1] - std::atan2(pos[1], pos[0]), eps);
+  BOOST_CHECK_SMALL(cyl[2] - pos[2], eps);
+}
+
+BOOST_AUTO_TEST_CASE(basis_transform_test) {
+  constexpr auto eps = 1e-14;
+  Vector3d const b_x{{1, 0, 0}};
+  Vector3d const b_y{{0, 1, 0}};
+  Vector3d const b_z{{0, 0, 1}};
+  // identity transform
+  Vector3d const v{{1, 2, 3}};
+  Vector3d const v_identity_transform = Utils::basis_change(b_x, b_y, b_z, v);
+  // identity transform (swap both the vector and coordinate system)
+  Vector3d const v_swap_coord_transform =
+      Utils::basis_change(b_z, b_y, b_x, {{v[2], v[1], v[0]}});
+  // non-trivial transform
+  Vector3d const v1 = Vector3d{{2, 2, 2}}.normalized();
+  Vector3d const v2 = Vector3d{{3, 3, -6}}.normalized();
+  Vector3d const v3 = Utils::vector_product(v1, v2).normalized();
+  Vector3d const v4 = basis_change(v1, v2, v3, 0.1 * v1 + 0.2 * v2 - 0.3 * v3);
+  Vector3d const v4_expected = Vector3d{{0.1, 0.2, -0.3}};
+  for (int i = 0; i < 3; ++i) {
+    BOOST_CHECK_SMALL(v_identity_transform[i] - v[i], eps);
+    BOOST_CHECK_SMALL(v_swap_coord_transform[i] - v[i], eps);
+    BOOST_CHECK_SMALL(v4[i] - v4_expected[i], eps);
+  }
+}
 
+BOOST_AUTO_TEST_CASE(
+    transform_coordinate_cartesian_to_cylinder_base_change_test) {
+  constexpr auto eps = 1e-14;
+  Vector3d const v1{{1, 3, 4}};
+  Vector3d const v2{{-3.0, 7, 2}};
+  Vector3d const axis = Utils::vector_product(v1, v2).normalized();
+  Vector3d const v3 =
+      Utils::transform_coordinate_cartesian_to_cylinder(v1, axis, v1);
+  Vector3d const v3_ref{{v1.norm(), 0, 0}};
+  auto const angle_v1_v2 = Utils::angle_between(v1, v2);
+  auto const v4 = Utils::transform_coordinate_cartesian_to_cylinder(
+      v2 + 2 * axis, axis, v1);
+  Vector3d v4_ref{{v2.norm(), angle_v1_v2, 2}};
+  auto const v5 = Utils::transform_coordinate_cartesian_to_cylinder(
+      v1 + 2 * axis, axis, v2);
+  Vector3d v5_ref{{v1.norm(), -angle_v1_v2, 2}};
   for (int i = 0; i < 3; ++i) {
-    BOOST_CHECK(transformed_x[i] == expected_x[i]);
-    BOOST_CHECK(transformed_y[i] == expected_y[i]);
-    BOOST_CHECK(transformed_z[i] == expected_z[i]);
+    BOOST_CHECK_SMALL(v3[i] - v3_ref[i], eps);
+    BOOST_CHECK_SMALL(v4[i] - v4_ref[i], eps);
+    BOOST_CHECK_SMALL(v5[i] - v5_ref[i], eps);
+  }
+}
+
+BOOST_AUTO_TEST_CASE(cartesian_to_cylinder_with_axis_and_orientation_test) {
+  constexpr auto eps = 1e-14;
+  // tilted orthogonal basis
+  auto const y = (Vector3d{{0, 1, -1}}).normalize();
+  auto const z = (Vector3d{{1, 1, 1}}).normalize();
+  auto const x = Utils::vector_product(y, z);
+
+  // check transformation with orientation (phi is random for r=0)
+  {
+    auto const x_cyl = transform_coordinate_cartesian_to_cylinder(x, z, y);
+    auto const y_cyl = transform_coordinate_cartesian_to_cylinder(y, z, y);
+    auto const z_cyl = transform_coordinate_cartesian_to_cylinder(z, z, y);
+    auto const x_ref = Vector3d{{1.0, -Utils::pi() / 2.0, 0.0}};
+    auto const y_ref = Vector3d{{1.0, 0.0, 0.0}};
+    auto const z_ref = Vector3d{{0.0, z_cyl[1], 1.0}};
+    for (int i = 0; i < 3; ++i) {
+      BOOST_CHECK_SMALL(x_cyl[i] - x_ref[i], eps);
+      BOOST_CHECK_SMALL(y_cyl[i] - y_ref[i], eps);
+      BOOST_CHECK_SMALL(z_cyl[i] - z_ref[i], eps);
+    }
+  }
+  // check transformation with orientation for another angle
+  {
+    auto const u = vec_rotate(z, Utils::pi() / 3.0, x);
+    auto const v = vec_rotate(z, Utils::pi() / 3.0, y);
+    auto const u_cyl = transform_coordinate_cartesian_to_cylinder(u, z, y);
+    auto const v_cyl = transform_coordinate_cartesian_to_cylinder(v, z, y);
+    auto const u_ref = Vector3d{{1.0, Utils::pi() * (1. / 3. - 1. / 2.), 0.0}};
+    auto const v_ref = Vector3d{{1.0, Utils::pi() / 3.0, 0.0}};
+    for (int i = 0; i < 3; ++i) {
+      BOOST_CHECK_SMALL(u_cyl[i] - u_ref[i], eps);
+      BOOST_CHECK_SMALL(v_cyl[i] - v_ref[i], eps);
+    }
+  }
+  // check transformation of random vectors
+  {
+    std::subtract_with_carry_engine<unsigned, 24, 10, 24> rng(2);
+    auto const r_uniform = [&rng]() {
+      return static_cast<double>(rng() - rng.min()) / (rng.max() - rng.min());
+    };
+    for (int trial = 0; trial < 100; ++trial) {
+      Vector3d const v1{r_uniform(), r_uniform(), r_uniform()};
+      Vector3d const v2{r_uniform(), r_uniform(), r_uniform()};
+      auto const a = Utils::vector_product(v1, v2) / v1.norm() / v2.norm();
+      auto const v1_v1 = transform_coordinate_cartesian_to_cylinder(v1, a, v1);
+      auto const v2_v1 = transform_coordinate_cartesian_to_cylinder(v2, a, v1);
+      auto const v1_v2 = transform_coordinate_cartesian_to_cylinder(v1, a, v2);
+      Vector3d const v1_v1_ref{v1.norm(), 0.0, 0.0};
+      Vector3d const v2_v1_ref{v2.norm(), Utils::angle_between(v1, v2), 0.0};
+      Vector3d const v1_v2_ref{v1.norm(), -Utils::angle_between(v1, v2), 0.0};
+      for (int i = 0; i < 3; ++i) {
+        BOOST_CHECK_SMALL(v1_v1[i] - v1_v1_ref[i], eps);
+        BOOST_CHECK_SMALL(v2_v1[i] - v2_v1_ref[i], eps);
+        BOOST_CHECK_SMALL(v1_v2[i] - v1_v2_ref[i], eps);
+      }
+    }
   }
 }
 
 BOOST_AUTO_TEST_CASE(cylinder_to_cartesian_test) {
+  constexpr auto eps = 1e-14;
+  auto const cyl = Vector3d{{1.0, Utils::pi() / 4, 2.0}};
+  auto const pos = transform_coordinate_cylinder_to_cartesian(cyl);
+  BOOST_CHECK_SMALL(pos[0] - std::sqrt(2) / 2, eps);
+  BOOST_CHECK_SMALL(pos[1] - std::sqrt(2) / 2, eps);
+  BOOST_CHECK_SMALL(pos[2] - cyl[2], eps);
+}
+
+BOOST_AUTO_TEST_CASE(cylinder_to_cartesian_with_axis_and_orientation_test) {
+  constexpr auto eps = 2e-14;
   Vector3d const cylinder_coord{{1.2, 3.123, 42.0}};
-  auto const transformed_x = transform_coordinate_cylinder_to_cartesian(
-      cylinder_coord, Vector3d{{1, 0, 0}});
-  auto const transformed_y = transform_coordinate_cylinder_to_cartesian(
-      cylinder_coord, Vector3d{{0, 1, 0}});
-  auto const transformed_z = transform_coordinate_cylinder_to_cartesian(
-      cylinder_coord, Vector3d{{0, 0, 1}});
+  auto const e_x = Vector3d{{1., 0., 0.}};
+  auto const e_y = Vector3d{{0., 1., 0.}};
+  auto const e_z = Vector3d{{0., 0., 1.}};
+
+  auto const transformed_x =
+      transform_coordinate_cylinder_to_cartesian(cylinder_coord, e_x, -e_z);
+  auto const transformed_y =
+      transform_coordinate_cylinder_to_cartesian(cylinder_coord, e_y, e_x);
+  auto const transformed_z =
+      transform_coordinate_cylinder_to_cartesian(cylinder_coord, e_z, e_x);
   // We transform from cylinder zu cartesian and have to rotate back. See test
   // cartesian_to_cylinder_test.
-  auto const expected_x =
-      vec_rotate(Vector3d{{0.0, 1.0, 0.0}}, Utils::pi() / 2.0,
-                 transform_coordinate_cylinder_to_cartesian(
-                     cylinder_coord, Vector3d{{0, 0, 1}}));
-  auto const expected_y =
-      vec_rotate(Vector3d{{1.0, 0.0, 0.0}}, -Utils::pi() / 2.0,
-                 transform_coordinate_cylinder_to_cartesian(
-                     cylinder_coord, Vector3d{{0, 0, 1}}));
+  auto const expected_x = vec_rotate(
+      e_y, Utils::pi() / 2.0,
+      transform_coordinate_cylinder_to_cartesian(cylinder_coord, e_z, e_x));
+  auto const expected_y = vec_rotate(
+      e_x, -Utils::pi() / 2.0,
+      transform_coordinate_cylinder_to_cartesian(cylinder_coord, e_z, e_x));
   // x = r * cos(phi); y = r * sin(phi); z = z
   auto const expected_z = Vector3d{
       {cylinder_coord[0] * std::cos(cylinder_coord[1]),
        cylinder_coord[0] * std::sin(cylinder_coord[1]), cylinder_coord[2]}};
   for (int i = 0; i < 3; ++i) {
-    BOOST_CHECK(transformed_x[i] == expected_x[i]);
-    BOOST_CHECK(transformed_y[i] == expected_y[i]);
-    BOOST_CHECK(transformed_z[i] == expected_z[i]);
+    BOOST_CHECK_SMALL(transformed_x[i] - expected_x[i], eps);
+    BOOST_CHECK_SMALL(transformed_y[i] - expected_y[i], eps);
+    BOOST_CHECK_SMALL(transformed_z[i] - expected_z[i], eps);
   }
 }
 
-BOOST_AUTO_TEST_CASE(vector_cart_to_cyl_test) {
-  constexpr auto eps = 1e-13;
-  Vector3d const pos{{1.1, 2.2, 3.3}};
-  auto const axis = (Vector3d{{4.4, 5.5, 6.6}}).normalized();
-  Vector3d const vec{{7.7, 8.8, 9.9}};
+BOOST_AUTO_TEST_CASE(cylinder_to_cartesian_with_axis_with_phi_2_test) {
+  constexpr auto eps = 1e-14;
+  // tilted orthogonal basis
+  auto const y = (Vector3d{{0, 1, -1}}).normalize();
+  auto const z = (Vector3d{{1, 1, 1}}).normalize();
+  auto const x = Utils::vector_product(y, z);
 
-  auto const vec_cyl = transform_vector_cartesian_to_cylinder(vec, axis, pos);
-
-  // cylindrical basis vectors at pos
-  auto const e_z = axis;
-  auto const e_r = (pos - (pos * axis) * axis).normalized();
-  auto const e_phi = Utils::vector_product(e_z, e_r);
-
-  BOOST_CHECK_SMALL(vec_cyl[0] - vec * e_r, eps);
-  BOOST_CHECK_SMALL(vec_cyl[1] - vec * e_phi, eps);
-  BOOST_CHECK_SMALL(vec_cyl[2] - vec * e_z, eps);
+  // check transformation with orientation
+  {
+    auto const x_cyl = transform_coordinate_cartesian_to_cylinder(x, z, y);
+    auto const y_cyl = transform_coordinate_cartesian_to_cylinder(y, z, y);
+    auto const z_cyl = transform_coordinate_cartesian_to_cylinder(z, z, y);
+    auto const x_cart = transform_coordinate_cylinder_to_cartesian(x_cyl, z, y);
+    auto const y_cart = transform_coordinate_cylinder_to_cartesian(y_cyl, z, y);
+    auto const z_cart = transform_coordinate_cylinder_to_cartesian(z_cyl, z, y);
+    for (int i = 0; i < 3; ++i) {
+      BOOST_CHECK_SMALL(x_cart[i] - x[i], eps);
+      BOOST_CHECK_SMALL(y_cart[i] - y[i], eps);
+      BOOST_CHECK_SMALL(z_cart[i] - z[i], eps);
+    }
+  }
+  // check transformation with orientation for another angle
+  {
+    auto const u = vec_rotate(z, Utils::pi() / 3.0, x);
+    auto const v = vec_rotate(z, Utils::pi() / 3.0, y);
+    auto const u_cyl = transform_coordinate_cartesian_to_cylinder(u, z, y);
+    auto const v_cyl = transform_coordinate_cartesian_to_cylinder(v, z, y);
+    auto const u_cart = transform_coordinate_cylinder_to_cartesian(u_cyl, z, y);
+    auto const v_cart = transform_coordinate_cylinder_to_cartesian(v_cyl, z, y);
+    for (int i = 0; i < 3; ++i) {
+      BOOST_CHECK_SMALL(u_cart[i] - u[i], eps);
+      BOOST_CHECK_SMALL(v_cart[i] - v[i], eps);
+    }
+  }
+  // check transformation of random vectors
+  {
+    std::subtract_with_carry_engine<unsigned, 24, 10, 24> rng(2);
+    auto const r_uniform = [&rng]() {
+      return static_cast<double>(rng() - rng.min()) / (rng.max() - rng.min());
+    };
+    for (int trial = 0; trial < 100; ++trial) {
+      Vector3d const v1{r_uniform(), r_uniform(), r_uniform()};
+      Vector3d const v2{r_uniform(), r_uniform(), r_uniform()};
+      auto const a = Utils::vector_product(v1, v2) / v1.norm() / v2.norm();
+      auto const v3 = transform_coordinate_cartesian_to_cylinder(v2, a, v1);
+      auto const v4 = transform_coordinate_cylinder_to_cartesian(v3, a, v1);
+      for (int i = 0; i < 3; ++i) {
+        BOOST_CHECK_SMALL(v4[i] - v2[i], eps);
+      }
+    }
+  }
 }
diff --git a/src/utils/tests/gather_buffer_test.cpp b/src/utils/tests/gather_buffer_test.cpp
index 11036c3d6e8..cfca73b204e 100644
--- a/src/utils/tests/gather_buffer_test.cpp
+++ b/src/utils/tests/gather_buffer_test.cpp
@@ -37,39 +37,6 @@
 using Utils::Mpi::gather_buffer;
 namespace mpi = boost::mpi;
 
-void check_pointer(const mpi::communicator &comm, int root) {
-  if (comm.rank() == root) {
-    auto const n = comm.size();
-    const int total_size = n * (n + 1) / 2;
-
-    std::vector<int> buf(total_size, comm.rank() + 1);
-    auto const ret_size =
-        gather_buffer(buf.data(), comm.rank() + 1, comm, root);
-
-    BOOST_CHECK(ret_size == total_size);
-
-    /* Check order in result */
-    BOOST_CHECK(std::is_sorted(buf.begin(), buf.end()));
-
-    /* Check values */
-    for (int i = 1; i <= n; i++) {
-      std::vector<int>::iterator lower, upper;
-      std::tie(lower, upper) = std::equal_range(buf.begin(), buf.end(), i);
-
-      BOOST_CHECK(i == std::distance(lower, upper));
-    }
-  } else {
-    std::vector<int> buf(comm.rank() + 1, comm.rank() + 1);
-    gather_buffer(buf.data(), buf.size(), comm, root);
-
-    /* Check that buffer is unchanged */
-    BOOST_CHECK(buf.size() == comm.rank() + 1);
-    for (auto const &i : buf) {
-      BOOST_CHECK(i == comm.rank() + 1);
-    }
-  }
-}
-
 void check_vector(const mpi::communicator &comm, int root) {
   std::vector<int> buf(comm.rank() + 1, comm.rank() + 1);
 
@@ -123,28 +90,6 @@ void check_vector_out_of_bounds(const mpi::communicator &comm) {
   }
 }
 
-void check_pointer_out_of_bounds(const mpi::communicator &comm) {
-  /* Check that moving data in the buffer on the root doesn't lead
-   * to an access out of bounds (using a sentinel value) */
-  const auto root = 1;
-  if (comm.rank() == 1) {
-    std::vector<int> buf = {2, 2, 0, -1};
-    gather_buffer(buf.data(), 2, comm, root);
-    BOOST_CHECK(buf.size() == 4);
-    BOOST_CHECK(buf[0] == 1);
-    BOOST_CHECK(buf[1] == 2);
-    BOOST_CHECK(buf[2] == 2);
-    BOOST_CHECK(buf[3] == -1);
-  } else if (comm.rank() == 0) {
-    std::vector<int> buf = {1};
-    gather_buffer(buf.data(), 1, comm, root);
-    BOOST_CHECK(buf[0] == 1);
-  } else {
-    std::vector<int> buf = {};
-    gather_buffer(buf.data(), 0, comm, root);
-  }
-}
-
 void check_vector_empty(const mpi::communicator &comm, int empty) {
   std::vector<int> buf((comm.rank() == empty) ? 0 : 11, comm.rank());
   gather_buffer(buf, comm);
@@ -165,54 +110,6 @@ void check_vector_empty(const mpi::communicator &comm, int empty) {
   }
 }
 
-void check_pointer_empty(const mpi::communicator &comm, int empty) {
-  auto const n_elem = (comm.rank() == empty) ? 0 : 11;
-  std::vector<int> buf(n_elem, comm.rank());
-
-  if (comm.rank() == 0) {
-    buf.resize((comm.size() - 1) * 11);
-  }
-
-  gather_buffer(buf.data(), n_elem, comm);
-
-  if (comm.rank() == 0) {
-    for (int i = 0; i < comm.size(); i++) {
-      std::vector<int>::iterator lower, upper;
-      std::tie(lower, upper) = std::equal_range(buf.begin(), buf.end(), i);
-
-      if (i == empty) {
-        BOOST_CHECK(0 == std::distance(lower, upper));
-      } else {
-        BOOST_CHECK(11 == std::distance(lower, upper));
-      }
-    }
-  }
-}
-
-BOOST_AUTO_TEST_CASE(pointer) {
-  mpi::communicator world;
-  check_pointer(world, 0);
-}
-
-BOOST_AUTO_TEST_CASE(pointer_overlap) {
-  mpi::communicator world;
-  if (world.size() >= 2)
-    check_pointer(world, 1);
-}
-
-BOOST_AUTO_TEST_CASE(pointer_out_of_bounds) {
-  mpi::communicator world;
-  if (world.size() >= 2)
-    check_pointer_out_of_bounds(world);
-}
-
-BOOST_AUTO_TEST_CASE(pointer_root) {
-  mpi::communicator world;
-
-  auto root = (world.size() >= 3) ? world.size() - 2 : world.size() - 1;
-  check_pointer(world, root);
-}
-
 BOOST_AUTO_TEST_CASE(vector) {
   mpi::communicator world;
   check_vector(world, 0);
@@ -250,19 +147,6 @@ BOOST_AUTO_TEST_CASE(vector_empty_root) {
   check_vector_empty(world, root);
 }
 
-BOOST_AUTO_TEST_CASE(pointer_empty) {
-  mpi::communicator world;
-
-  check_pointer_empty(world, 0);
-}
-
-BOOST_AUTO_TEST_CASE(pointer_empty_root) {
-  mpi::communicator world;
-  auto root = (world.size() >= 3) ? world.size() - 2 : world.size() - 1;
-
-  check_pointer_empty(world, root);
-}
-
 BOOST_AUTO_TEST_CASE(non_trivial_type) {
   mpi::communicator world;
 
diff --git a/src/utils/tests/matrix_test.cpp b/src/utils/tests/matrix_test.cpp
index 8030490d2ba..d3a9533cd2f 100644
--- a/src/utils/tests/matrix_test.cpp
+++ b/src/utils/tests/matrix_test.cpp
@@ -1,6 +1,8 @@
 /*
  * Copyright (C) 2018-2019 The ESPResSo project
  *
+ * This file is part of ESPResSo.
+ *
  * ESPResSo is free software: you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
  * the Free Software Foundation, either version 3 of the License, or
diff --git a/src/utils/tests/orthonormal_vec_test.cpp b/src/utils/tests/orthonormal_vec_test.cpp
new file mode 100644
index 00000000000..51f9679dfe2
--- /dev/null
+++ b/src/utils/tests/orthonormal_vec_test.cpp
@@ -0,0 +1,39 @@
+/*
+ * Copyright (C) 2019 The ESPResSo project
+ *
+ * This file is part of ESPResSo.
+ *
+ * ESPResSo is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * ESPResSo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#define BOOST_TEST_MODULE Utils::orthonormal_vec test
+#define BOOST_TEST_DYN_LINK
+#include <boost/test/unit_test.hpp>
+
+#include <utils/Vector.hpp>
+#include <utils/math/orthonormal_vec.hpp>
+
+BOOST_AUTO_TEST_CASE(orthonormal_vec_test) {
+  constexpr auto eps = 1e-14;
+
+  auto const v0 = Utils::Vector3d{{1.1, -2.2, 3.3}};
+  auto v0_orth = Utils::calc_orthonormal_vector(v0);
+  BOOST_CHECK_SMALL(v0 * v0_orth, eps);
+  BOOST_CHECK_SMALL(1 - v0_orth.norm(), eps);
+
+  auto const v1 = Utils::VectorXd<2>{{1., 0.}};
+  auto v1_orth = Utils::calc_orthonormal_vector(v1);
+  BOOST_CHECK_SMALL(v1 * v1_orth, eps);
+  BOOST_CHECK_SMALL(1 - v1_orth.norm(), eps);
+}
\ No newline at end of file
diff --git a/src/utils/tests/vec_rotate_test.cpp b/src/utils/tests/vec_rotate_test.cpp
index 18b9b14ef45..f1d5727a2cd 100644
--- a/src/utils/tests/vec_rotate_test.cpp
+++ b/src/utils/tests/vec_rotate_test.cpp
@@ -22,11 +22,9 @@
 #include <utils/Vector.hpp>
 #include <utils/constants.hpp>
 #include <utils/math/vec_rotate.hpp>
-using Utils::vec_rotate;
 
 #include <cmath>
 #include <limits>
-#include <tuple>
 
 BOOST_AUTO_TEST_CASE(rotation) {
   using std::cos;
@@ -43,20 +41,16 @@ BOOST_AUTO_TEST_CASE(rotation) {
   auto const expected =
       cos(t) * v + sin(t) * vector_product(k, v) + (1. - cos(t)) * (k * v) * k;
 
-  auto const is = vec_rotate(k, t, v);
+  auto const is = Utils::vec_rotate(k, t, v);
   auto const rel_diff = (expected - is).norm() / expected.norm();
 
   BOOST_CHECK(rel_diff < std::numeric_limits<double>::epsilon());
 }
 
-BOOST_AUTO_TEST_CASE(rotation_params) {
-  Utils::Vector3d v1 = {1.0, 0.0, 0.0};
-  Utils::Vector3d v2 = {1.0, 1.0, 0.0};
+BOOST_AUTO_TEST_CASE(angle_between) {
+  Utils::Vector3d const v1 = {1.0, 0.0, 0.0};
+  Utils::Vector3d const v2 = {1.0, 1.0, 0.0};
 
-  double angle;
-  Utils::Vector3d rotation_axis;
-  std::tie(angle, rotation_axis) = Utils::rotation_params(v1, v2);
+  auto const angle = Utils::angle_between(v1, v2);
   BOOST_CHECK_CLOSE(angle, Utils::pi() / 4.0, 1e-7);
-  BOOST_CHECK_SMALL((rotation_axis * v1), 1e-7);
-  BOOST_CHECK_SMALL((rotation_axis * v2), 1e-7);
 }
diff --git a/testsuite/python/CMakeLists.txt b/testsuite/python/CMakeLists.txt
index ec8ff4c9dd6..f13ae95562d 100644
--- a/testsuite/python/CMakeLists.txt
+++ b/testsuite/python/CMakeLists.txt
@@ -57,7 +57,7 @@ endfunction(PYTHON_TEST)
 # Separate features with hyphens, use a period to add an optional flag.
 foreach(
   TEST_COMBINATION
-  lb.cpu-p3m.cpu-lj-therm.lb;lb.gpu-p3m.cpu-lj-therm.lb;ek.gpu;lb.off-therm.npt-int.npt;lb.off-int.sd;lb.off-therm.langevin-int.nvt;lb.off-therm.dpd-int.nvt;lb.off-therm.bd-int.bd;lb.off-therm.sdm-int.sdm
+  lb.cpu-p3m.cpu-lj-therm.lb;lb.gpu-p3m.elc-lj-therm.lb;ek.gpu;lb.off-therm.npt-int.npt;lb.off-int.sd;lb.off-dp3m.cpu-therm.langevin-int.nvt;lb.off-therm.dpd-int.nvt;lb.off-scafacos-therm.bd-int.bd;lb.off-therm.sdm-int.sdm
 )
   if(${TEST_COMBINATION} MATCHES "\\.gpu")
     set(TEST_LABELS "gpu")
@@ -125,9 +125,9 @@ python_test(FILE lb_stokes_sphere.py MAX_NUM_PROC 4 LABELS gpu long)
 python_test(FILE lb_pressure_tensor.py MAX_NUM_PROC 1 LABELS gpu long)
 python_test(FILE ek_fluctuations.py MAX_NUM_PROC 1 LABELS gpu)
 python_test(FILE ek_charged_plate.py MAX_NUM_PROC 1 LABELS gpu)
-python_test(FILE ek_eof_one_species_x.py MAX_NUM_PROC 1 LABELS gpu)
-python_test(FILE ek_eof_one_species_y.py MAX_NUM_PROC 1 LABELS gpu)
-python_test(FILE ek_eof_one_species_z.py MAX_NUM_PROC 1 LABELS gpu)
+python_test(FILE ek_eof_one_species.py MAX_NUM_PROC 1 LABELS gpu SUFFIX x)
+python_test(FILE ek_eof_one_species.py MAX_NUM_PROC 1 LABELS gpu SUFFIX y)
+python_test(FILE ek_eof_one_species.py MAX_NUM_PROC 1 LABELS gpu SUFFIX z)
 python_test(FILE exclusions.py MAX_NUM_PROC 2)
 python_test(FILE langevin_thermostat.py MAX_NUM_PROC 1)
 python_test(FILE langevin_thermostat_stats.py MAX_NUM_PROC 1 LABELS long)
@@ -143,7 +143,7 @@ python_test(FILE integrator_npt_stats.py MAX_NUM_PROC 4 LABELS long)
 python_test(FILE integrator_steepest_descent.py MAX_NUM_PROC 4)
 python_test(FILE dipolar_mdlc_p3m_scafacos_p2nfft.py MAX_NUM_PROC 1)
 python_test(FILE dipolar_direct_summation.py MAX_NUM_PROC 1 LABELS gpu)
-python_test(FILE dipolar_p3m.py MAX_NUM_PROC 1)
+python_test(FILE dipolar_p3m.py MAX_NUM_PROC 2)
 python_test(FILE dipolar_interface.py MAX_NUM_PROC 1 LABELS gpu)
 python_test(FILE lb.py MAX_NUM_PROC 2 LABELS gpu)
 python_test(FILE lb_stats.py MAX_NUM_PROC 2 LABELS gpu long)
@@ -207,7 +207,7 @@ python_test(FILE sigint.py DEPENDENCIES sigint_child.py MAX_NUM_PROC 1)
 python_test(FILE lb_density.py MAX_NUM_PROC 1)
 python_test(FILE observable_chain.py MAX_NUM_PROC 4)
 python_test(FILE mpiio.py MAX_NUM_PROC 4)
-python_test(FILE gpu_availability.py MAX_NUM_PROC 1 LABELS gpu)
+python_test(FILE gpu_availability.py MAX_NUM_PROC 2 LABELS gpu)
 python_test(FILE features.py MAX_NUM_PROC 1)
 python_test(FILE galilei.py MAX_NUM_PROC 32)
 python_test(FILE linear_momentum.py MAX_NUM_PROC 4)
@@ -222,6 +222,10 @@ python_test(FILE rotation.py MAX_NUM_PROC 1)
 python_test(FILE shapes.py MAX_NUM_PROC 1)
 python_test(FILE h5md.py MAX_NUM_PROC 2)
 python_test(FILE mdanalysis.py MAX_NUM_PROC 2)
+python_test(FILE p3m_fft.py MAX_NUM_PROC 6)
+if(${TEST_NP} GREATER_EQUAL 8)
+  python_test(FILE p3m_fft.py MAX_NUM_PROC 8 SUFFIX 8_cores)
+endif()
 python_test(FILE p3m_tuning_exceptions.py MAX_NUM_PROC 1 LABELS gpu)
 python_test(FILE integrator_exceptions.py MAX_NUM_PROC 1)
 python_test(FILE utils.py MAX_NUM_PROC 1)
@@ -244,10 +248,6 @@ add_custom_target(
           ${CMAKE_CURRENT_BINARY_DIR}
   COMMAND
     ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/thermostats_common.py
-    ${CMAKE_CURRENT_BINARY_DIR}
-  COMMAND
-    ${CMAKE_COMMAND} -E copy
-    ${CMAKE_CURRENT_SOURCE_DIR}/ek_eof_one_species_base.py
     ${CMAKE_CURRENT_BINARY_DIR})
 
 add_custom_target(
diff --git a/testsuite/python/cellsystem.py b/testsuite/python/cellsystem.py
index 3725d3c7434..a5a2b5358f9 100644
--- a/testsuite/python/cellsystem.py
+++ b/testsuite/python/cellsystem.py
@@ -24,6 +24,7 @@
 class CellSystem(ut.TestCase):
     system = espressomd.System(box_l=[5.0, 5.0, 5.0])
     system.cell_system.skin = 0.0
+    n_nodes = system.cell_system.get_state()['n_nodes']
 
     def test_cell_system(self):
         self.system.cell_system.set_n_square(use_verlet_lists=False)
@@ -34,15 +35,15 @@ def test_cell_system(self):
         self.assertEqual(
             [s['use_verlet_list'], s['type']], [1, "domain_decomposition"])
 
+    @ut.skipIf(n_nodes == 1, "Skipping test: only runs for n_nodes >= 2")
     def test_node_grid(self):
         self.system.cell_system.set_domain_decomposition()
-        n_nodes = self.system.cell_system.get_state()['n_nodes']
-        if n_nodes == 1:
-            return
-        self.system.cell_system.node_grid = [n_nodes, 1, 1]
-        s = self.system.cell_system.get_state()
-        np.testing.assert_array_equal(
-            s['node_grid'], [n_nodes, 1, 1])
+        for i in range(3):
+            node_grid_ref = [1, 1, 1]
+            node_grid_ref[i] = self.n_nodes
+            self.system.cell_system.node_grid = node_grid_ref
+            node_grid = self.system.cell_system.get_state()['node_grid']
+            np.testing.assert_array_equal(node_grid, node_grid_ref)
 
 
 if __name__ == "__main__":
diff --git a/testsuite/python/collision_detection.py b/testsuite/python/collision_detection.py
index 42edbf0ac28..9ef1851b252 100644
--- a/testsuite/python/collision_detection.py
+++ b/testsuite/python/collision_detection.py
@@ -63,9 +63,9 @@ def test_00_interface_and_defaults(self):
             self.s.collision_detection.mode = "bind_centers"
 
         # Verify exception throwing for unknown collision modes
-        with self.assertRaises(Exception):
-            self.s.collision_detection.set_params(mode=0)
-            self.s.collision_detection.set_params(mode="blahblah")
+        for unknown_mode in (0, "unknown"):
+            with self.assertRaisesRegex(Exception, "Mode not handled"):
+                self.s.collision_detection.set_params(mode=unknown_mode)
 
         # That should work
         self.s.collision_detection.set_params(mode="off")
@@ -496,7 +496,12 @@ def test_glue_to_surface_random(self):
 
         # Collision detection
         self.s.collision_detection.set_params(
-            mode="glue_to_surface", distance=0.11, distance_glued_particle_to_vs=0.02, bond_centers=self.H, bond_vs=self.H2, part_type_vs=self.part_type_vs, part_type_to_attach_vs_to=self.part_type_to_attach_vs_to, part_type_to_be_glued=self.part_type_to_be_glued, part_type_after_glueing=self.part_type_after_glueing)
+            mode="glue_to_surface", distance=0.11,
+            distance_glued_particle_to_vs=0.02, bond_centers=self.H,
+            bond_vs=self.H2, part_type_vs=self.part_type_vs,
+            part_type_to_attach_vs_to=self.part_type_to_attach_vs_to,
+            part_type_to_be_glued=self.part_type_to_be_glued,
+            part_type_after_glueing=self.part_type_after_glueing)
         self.get_state_set_state_consistency()
 
         # Integrate lj liquid
@@ -703,7 +708,7 @@ def verify_triangle_binding(self, distance, first_bond, angle_res):
         expected_angle_bonds = sorted(expected_angle_bonds)
         self.assertEqual(expected_pairs, found_pairs)
 
-        if not expected_angle_bonds == found_angle_bonds:
+        if expected_angle_bonds != found_angle_bonds:
             # Verbose info
             print("expected:", expected_angle_bonds)
             missing = []
diff --git a/testsuite/python/constraint_shape_based.py b/testsuite/python/constraint_shape_based.py
index a58b9fbf610..f3ea78b9456 100644
--- a/testsuite/python/constraint_shape_based.py
+++ b/testsuite/python/constraint_shape_based.py
@@ -107,6 +107,50 @@ def z(y, r1, r2, l): return l / (r1 - r2) * \
         self.assertLess(shape.calc_distance(
             position=[0.0, R1 - (0.5 + sys.float_info.epsilon) * D, 0.25 * LENGTH])[0], 0.0)
 
+    def test_simplepore(self):
+        """
+        Test implementation of simplepore shape.
+
+        """
+        RADIUS = 12.5
+        LENGTH = 15.0
+        CENTER = 3 * [self.box_l / 2]
+        AXIS = [1, 0, 0]
+        SRADIUS = 2
+
+        shape = espressomd.shapes.SimplePore(
+            center=CENTER, axis=AXIS, length=LENGTH, radius=RADIUS,
+            smoothing_radius=SRADIUS)
+
+        # check distances inside cylinder
+        for x in np.linspace(self.box_l / 2 - LENGTH / 2 + SRADIUS,
+                             self.box_l / 2 + LENGTH / 2 - SRADIUS, 10):
+            for y in np.linspace(0, RADIUS, 5):
+                dist = shape.calc_distance(
+                    position=[x, self.box_l / 2 + y, self.box_l / 2])
+                self.assertAlmostEqual(dist[0], RADIUS - y)
+
+        # check distances near the walls
+        for y in np.linspace(0, self.box_l / 2 - RADIUS - SRADIUS, 6):
+            for z in np.linspace(0, self.box_l / 2 - RADIUS - SRADIUS, 6):
+                for x in np.linspace(0, self.box_l / 2 - LENGTH / 2, 6):
+                    dist_to_x = (self.box_l / 2 - LENGTH / 2 - x)
+                    dist = shape.calc_distance(
+                        position=[x, y, self.box_l - z])
+                    np.testing.assert_almost_equal(
+                        np.copy(dist[1]), [-dist_to_x, 0, 0])
+                    dist = shape.calc_distance(
+                        position=[self.box_l - x, self.box_l - y, z])
+                    np.testing.assert_almost_equal(
+                        np.copy(dist[1]), [dist_to_x, 0, 0])
+
+        # check getters
+        self.assertAlmostEqual(shape.radius, RADIUS)
+        self.assertAlmostEqual(shape.length, LENGTH)
+        self.assertAlmostEqual(shape.smoothing_radius, SRADIUS)
+        np.testing.assert_almost_equal(np.copy(shape.axis), AXIS)
+        np.testing.assert_almost_equal(np.copy(shape.center), CENTER)
+
     def test_sphere(self):
         """Checks geometry of an inverted sphere
 
@@ -205,6 +249,8 @@ def test_ellipsoid(self):
         # change ellipsoid parameters instead of creating a new constraint
         e.a = 1.
         e.b = 1.
+        self.assertAlmostEqual(e.a, 1.)
+        self.assertAlmostEqual(e.b, 1.)
 
         radii = np.linspace(1., 6.5, 7)
 
@@ -331,6 +377,18 @@ def test_cylinder(self):
                             dist = -distance
 
                     self.assertAlmostEqual(shape_dist, dist)
+
+        # check getters
+        self.assertAlmostEqual(cylinder_shape_finite.radius, rad)
+        self.assertAlmostEqual(cylinder_shape_finite.length, length)
+        np.testing.assert_almost_equal(
+            np.copy(cylinder_shape_finite.axis), [0, 0, 1])
+        np.testing.assert_almost_equal(
+            np.copy(cylinder_shape_finite.center), 3 * [rad])
+        self.assertFalse(cylinder_shape_finite.open)
+        cylinder_shape_finite.open = True
+        self.assertTrue(cylinder_shape_finite.open)
+
         # Reset
         system.non_bonded_inter[0, 1].lennard_jones.set_params(
             epsilon=0.0, sigma=0.0, cutoff=0.0, shift=0)
@@ -432,6 +490,14 @@ def test_spherocylinder(self):
                     energy = system.analysis.energy()
                     self.assertAlmostEqual(energy["total"], 10. - r)
 
+        # check getters
+        self.assertAlmostEqual(spherocylinder_shape.radius, 10.)
+        self.assertAlmostEqual(spherocylinder_shape.length, 6.0)
+        np.testing.assert_almost_equal(
+            np.copy(spherocylinder_shape.axis), [0, 1, 0])
+        np.testing.assert_almost_equal(
+            np.copy(spherocylinder_shape.center), 3 * [self.box_l / 2.0])
+
         # Reset
         system.non_bonded_inter[0, 1].generic_lennard_jones.set_params(
             epsilon=0., sigma=0., cutoff=0., shift=0., offset=0., e1=0, e2=0, b1=0., b2=0.)
@@ -866,6 +932,13 @@ def test_torus(self):
                         position=phi_rot_point.tolist())
                     self.assertAlmostEqual(shape_dist, distance)
 
+        # check getters
+        self.assertAlmostEqual(torus_shape.radius, radius)
+        self.assertAlmostEqual(torus_shape.tube_radius, tube_radius)
+        np.testing.assert_almost_equal(np.copy(torus_shape.normal), [0, 0, 1])
+        np.testing.assert_almost_equal(
+            np.copy(torus_shape.center), 3 * [self.box_l / 2.0])
+
         # Reset
         system.non_bonded_inter[0, 1].lennard_jones.set_params(
             epsilon=0.0, sigma=0.0, cutoff=0.0, shift=0)
diff --git a/testsuite/python/coulomb_cloud_wall.py b/testsuite/python/coulomb_cloud_wall.py
index 494d66418bb..2fd2ce38ed5 100644
--- a/testsuite/python/coulomb_cloud_wall.py
+++ b/testsuite/python/coulomb_cloud_wall.py
@@ -115,41 +115,6 @@ def test_p3m_gpu(self):
         self.S.integrator.run(0)
         self.compare("p3m_gpu", energy=False, prefactor=2.2)
 
-    @ut.skipIf(not espressomd.has_features(["SCAFACOS"])
-               or 'p3m' not in scafacos.available_methods(),
-               'Skipping test: missing feature SCAFACOS or p3m method')
-    def test_scafacos_p3m(self):
-        self.S.actors.add(
-            espressomd.electrostatics.Scafacos(
-                prefactor=0.5,
-                method_name="p3m",
-                method_params={
-                    "p3m_r_cut": 1.001,
-                    "p3m_grid": 64,
-                    "p3m_cao": 7,
-                    "p3m_alpha": 2.70746}))
-        self.S.integrator.run(0)
-        self.compare("scafacos_p3m", energy=True, prefactor=0.5)
-
-    @ut.skipIf(not espressomd.has_features(["SCAFACOS"])
-               or 'p3m' not in scafacos.available_methods(),
-               'Skipping test: missing feature SCAFACOS or p3m method')
-    def test_scafacos_p3m_tuning(self):
-        # check that the tuning function can be called without throwing
-        # an exception or causing an MPI deadlock
-        self.S.actors.add(
-            espressomd.electrostatics.Scafacos(
-                prefactor=0.5,
-                method_name="p3m",
-                method_params={
-                    "p3m_r_cut": -1.5,
-                    "p3m_grid": 64,
-                    "p3m_cao": 7,
-                    "p3m_alpha": 2.70746}))
-        self.S.integrator.run(0)
-        # check the scafacos script interface
-        self.assertEqual(self.S.actors[-1].get_params()['prefactor'], 0.5)
-
     @ut.skipIf(not espressomd.has_features("SCAFACOS")
                or 'p2nfft' not in scafacos.available_methods(),
                'Skipping test: missing feature SCAFACOS or p2nfft method')
diff --git a/testsuite/python/coulomb_mixed_periodicity.py b/testsuite/python/coulomb_mixed_periodicity.py
index c275d3a9f7e..30f84005926 100644
--- a/testsuite/python/coulomb_mixed_periodicity.py
+++ b/testsuite/python/coulomb_mixed_periodicity.py
@@ -20,7 +20,7 @@
 import unittest_decorators as utx
 import numpy as np
 import espressomd
-from espressomd import electrostatics, electrostatic_extensions, scafacos
+from espressomd import electrostatics, scafacos
 import tests_common
 
 
@@ -44,7 +44,6 @@ def setUp(self):
         self.S.box_l = (10, 10, 10)
         self.S.time_step = 0.01
         self.S.cell_system.skin = 0.
-        self.S.actors.clear()
 
         data = np.genfromtxt(tests_common.abspath(
             "data/coulomb_mixed_periodicity_system.data"))
@@ -61,6 +60,7 @@ def setUp(self):
 
     def tearDown(self):
         self.S.part.clear()
+        self.S.actors.clear()
 
     def compare(self, method_name, energy=True):
         # Compare forces and energy now in the system to stored ones
@@ -84,7 +84,7 @@ def compare(self, method_name, energy=True):
     # Tests for individual methods
 
     @utx.skipIfMissingFeatures(["P3M"])
-    def test_zz_p3mElc(self):
+    def test_elc(self):
         # Make sure, the data satisfies the gap
         for p in self.S.part:
             if p.pos[2] < 0 or p.pos[2] > 9.:
@@ -97,13 +97,11 @@ def test_zz_p3mElc(self):
         self.S.box_l = (10, 10, 10)
 
         p3m = electrostatics.P3M(prefactor=1, accuracy=1e-6, mesh=(64, 64, 64))
+        elc = electrostatics.ELC(p3m_actor=p3m, maxPWerror=1E-6, gap_size=1)
 
-        self.S.actors.add(p3m)
-        elc = electrostatic_extensions.ELC(maxPWerror=1E-6, gap_size=1)
         self.S.actors.add(elc)
         self.S.integrator.run(0)
         self.compare("elc", energy=True)
-        self.S.actors.remove(p3m)
 
     @ut.skipIf(not espressomd.has_features("SCAFACOS")
                or 'p2nfft' not in scafacos.available_methods(),
@@ -125,7 +123,6 @@ def test_scafacos_p2nfft(self):
         self.S.actors.add(scafacos)
         self.S.integrator.run(0)
         self.compare("scafacos_p2nfft", energy=True)
-        self.S.actors.remove(scafacos)
 
 
 if __name__ == "__main__":
diff --git a/testsuite/python/dawaanr-and-bh-gpu.py b/testsuite/python/dawaanr-and-bh-gpu.py
index 9be6a09c992..bc7a6b6f5dc 100644
--- a/testsuite/python/dawaanr-and-bh-gpu.py
+++ b/testsuite/python/dawaanr-and-bh-gpu.py
@@ -17,16 +17,13 @@
 import unittest as ut
 import unittest_decorators as utx
 import numpy as np
+import tests_common
 
 import espressomd
 import espressomd.magnetostatics
 import espressomd.analyze
 import espressomd.cuda_init
-
-
-def stopAll(system):
-    system.part[:].v = np.zeros(3)
-    system.part[:].omega_body = np.zeros(3)
+import espressomd.galilei
 
 
 @utx.skipIfMissingGPU()
@@ -48,34 +45,22 @@ def test(self):
         pf_bh_gpu = 2.34
         pf_dawaanr = 3.524
         ratio_dawaanr_bh_gpu = pf_dawaanr / pf_bh_gpu
-        l = 15
-        self.system.box_l = [l, l, l]
+        self.system.box_l = 3 * [15]
         self.system.periodicity = [0, 0, 0]
         self.system.time_step = 1E-4
         self.system.cell_system.skin = 0.1
 
-        part_dip = np.zeros((3))
-
         for n in [128, 541]:
             dipole_modulus = 1.3
-            # scale the box for a large number of particles:
-            if n > 1000:
-                l *= (n / 541) ** (1 / 3.0)
-            for i in range(n):
-                part_pos = np.array(np.random.random(3)) * l
-                costheta = 2 * np.random.random() - 1
-                sintheta = np.sin(np.arcsin(costheta))
-                phi = 2 * np.pi * np.random.random()
-                part_dip[0] = sintheta * np.cos(phi) * dipole_modulus
-                part_dip[1] = sintheta * np.sin(phi) * dipole_modulus
-                part_dip[2] = costheta * dipole_modulus
-                self.system.part.add(id=i, type=0, pos=part_pos, dip=part_dip,
-                                     v=np.array([0, 0, 0]), omega_body=np.array([0, 0, 0]))
+            part_dip = dipole_modulus * tests_common.random_dipoles(n)
+            part_pos = np.random.random((n, 3)) * self.system.box_l[0]
+            self.system.part.add(pos=part_pos, dip=part_dip)
 
             self.system.non_bonded_inter[0, 0].lennard_jones.set_params(
                 epsilon=10.0, sigma=0.5, cutoff=0.55, shift="auto")
             self.system.thermostat.set_langevin(kT=0.0, gamma=10.0, seed=42)
-            stopAll(self.system)
+            g = espressomd.galilei.GalileiTransform()
+            g.kill_particle_motion(rotation=True)
             self.system.integrator.set_vv()
 
             self.system.non_bonded_inter[0, 0].lennard_jones.set_params(
diff --git a/testsuite/python/dawaanr-and-dds-gpu.py b/testsuite/python/dawaanr-and-dds-gpu.py
index d613dff3bf5..3e67db7e514 100644
--- a/testsuite/python/dawaanr-and-dds-gpu.py
+++ b/testsuite/python/dawaanr-and-dds-gpu.py
@@ -16,13 +16,14 @@
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 import unittest as ut
 import unittest_decorators as utx
-from numpy.random import random
+import tests_common
 import numpy as np
 
 import espressomd
 import espressomd.interactions
 import espressomd.magnetostatics
 import espressomd.analyze
+import espressomd.galilei
 
 
 @utx.skipIfMissingGPU()
@@ -31,37 +32,22 @@ class DDSGPUTest(ut.TestCase):
     # Handle for espresso system
     es = espressomd.System(box_l=[1.0, 1.0, 1.0])
 
-    def stopAll(self):
-        for i in range(len(self.es.part)):
-            self.es.part[i].v = np.array([0.0, 0.0, 0.0])
-            self.es.part[i].omega_body = np.array([0.0, 0.0, 0.0])
-
     @ut.skipIf(es.cell_system.get_state()["n_nodes"] > 1,
                "Skipping test: only runs for n_nodes == 1")
     def test(self):
         pf_dds_gpu = 2.34
         pf_dawaanr = 3.524
         ratio_dawaanr_dds_gpu = pf_dawaanr / pf_dds_gpu
-        l = 15
-        self.es.box_l = [l, l, l]
+        self.es.box_l = 3 * [15]
         self.es.periodicity = [0, 0, 0]
         self.es.time_step = 1E-4
         self.es.cell_system.skin = 0.1
 
-        part_dip = np.zeros((3))
-
         for n in [128, 541]:
             dipole_modulus = 1.3
-            for i in range(n):
-                part_pos = np.array(random(3)) * l
-                costheta = 2 * random() - 1
-                sintheta = np.sin(np.arcsin(costheta))
-                phi = 2 * np.pi * random()
-                part_dip[0] = sintheta * np.cos(phi) * dipole_modulus
-                part_dip[1] = sintheta * np.sin(phi) * dipole_modulus
-                part_dip[2] = costheta * dipole_modulus
-                self.es.part.add(id=i, type=0, pos=part_pos, dip=part_dip,
-                                 v=np.array([0, 0, 0]), omega_body=np.array([0, 0, 0]))
+            part_dip = dipole_modulus * tests_common.random_dipoles(n)
+            part_pos = np.random.random((n, 3)) * self.es.box_l[0]
+            self.es.part.add(pos=part_pos, dip=part_dip)
 
             self.es.non_bonded_inter[0, 0].lennard_jones.set_params(
                 epsilon=10.0, sigma=0.5, cutoff=0.55, shift="auto")
@@ -70,7 +56,8 @@ def test(self):
             self.es.integrator.set_steepest_descent(
                 f_max=0.0, gamma=0.1, max_displacement=0.1)
             self.es.integrator.run(500)
-            self.stopAll()
+            g = espressomd.galilei.GalileiTransform()
+            g.kill_particle_motion(rotation=True)
             self.es.integrator.set_vv()
 
             self.es.non_bonded_inter[0, 0].lennard_jones.set_params(
diff --git a/testsuite/python/dds-and-bh-gpu.py b/testsuite/python/dds-and-bh-gpu.py
index e473e35b818..8aa23acee1b 100644
--- a/testsuite/python/dds-and-bh-gpu.py
+++ b/testsuite/python/dds-and-bh-gpu.py
@@ -23,11 +23,7 @@
 import espressomd.magnetostatics
 import espressomd.analyze
 import espressomd.cuda_init
-
-
-def stopAll(system):
-    system.part[:].v = np.zeros(3)
-    system.part[:].omega_body = np.zeros(3)
+import espressomd.galilei
 
 
 @utx.skipIfMissingGPU()
@@ -70,7 +66,8 @@ def test(self):
             self.system.non_bonded_inter[0, 0].lennard_jones.set_params(
                 epsilon=10.0, sigma=0.5, cutoff=0.55, shift="auto")
             self.system.thermostat.set_langevin(kT=0.0, gamma=10.0, seed=42)
-            stopAll(self.system)
+            g = espressomd.galilei.GalileiTransform()
+            g.kill_particle_motion(rotation=True)
             self.system.integrator.set_vv()
 
             self.system.non_bonded_inter[0, 0].lennard_jones.set_params(
diff --git a/testsuite/python/dipolar_direct_summation.py b/testsuite/python/dipolar_direct_summation.py
index dd41b43b4bb..52de4e0f5c4 100644
--- a/testsuite/python/dipolar_direct_summation.py
+++ b/testsuite/python/dipolar_direct_summation.py
@@ -19,10 +19,11 @@
 import espressomd
 import espressomd.magnetostatics
 import espressomd.magnetostatic_extensions
+import os
 import numpy as np
 import unittest as ut
-from tests_common import abspath
 import unittest_decorators as utx
+from tests_common import abspath, random_dipoles
 OPEN_BOUNDARIES_REF_ENERGY = abspath("data/dipolar_open_boundaries_energy.npy")
 OPEN_BOUNDARIES_REF_ARRAYS = abspath("data/dipolar_open_boundaries_arrays.npy")
 
@@ -34,7 +35,7 @@ class dds(ut.TestCase):
 
     system.time_step = 0.01
     system.cell_system.skin = 0.1
-    system.periodicity = 0, 0, 0
+    system.periodicity = [False, False, False]
 
     def tearDown(self):
         self.system.part.clear()
@@ -107,19 +108,27 @@ def fcs_data(self):
 
     @ut.skipIf(system.cell_system.get_state()["n_nodes"] > 1,
                "Skipping test: only runs for n_nodes == 1")
-    def gen_reference_data(self):
+    def test_gen_reference_data(self):
+        """Check that gen_reference_data() creates both output files."""
+        filepaths = ('dipolar_direct_summation_energy.npy',
+                     'dipolar_direct_summation_arrays.npy')
+        for filepath in filepaths:
+            if os.path.isfile(filepath):
+                os.remove(filepath)
+        self.gen_reference_data(filepaths[0], filepaths[1])
+        for filepath in filepaths:
+            self.assertTrue(os.path.isfile(filepath))
+
+    def gen_reference_data(self, filepath_energy=OPEN_BOUNDARIES_REF_ENERGY,
+                           filepath_arrays=OPEN_BOUNDARIES_REF_ARRAYS):
         system = self.system
+        np.random.seed(42)
 
         # add particles
         N = 20
         dipole_modulus = 1.3
         part_pos = np.random.random((N, 3)) * system.box_l
-        costheta = 2 * np.random.random(N) - 1
-        sintheta = np.sin(np.arcsin(costheta))
-        phi = 2 * np.pi * np.random.random(N)
-        part_dip = np.array([sintheta * np.cos(phi) * dipole_modulus,
-                             sintheta * np.sin(phi) * dipole_modulus,
-                             costheta * dipole_modulus]).T
+        part_dip = dipole_modulus * random_dipoles(N)
         particles = system.part.add(pos=part_pos, dip=part_dip,
                                     rotation=N * [(1, 1, 1)])
 
@@ -137,12 +146,11 @@ def gen_reference_data(self):
         # compute forces and energies for dawaanr
         ref_e, ref_f, ref_t = self.dds_data()
         np.save(
-            OPEN_BOUNDARIES_REF_ENERGY,
-            np.array(
-                [ref_e]),
+            filepath_energy,
+            np.array([ref_e]),
             allow_pickle=False)
         np.save(
-            OPEN_BOUNDARIES_REF_ARRAYS,
+            filepath_arrays,
             np.hstack(
                 (particles.pos_folded,
                  particles.dip,
@@ -190,7 +198,9 @@ def test_dds_gpu(self):
             force_tol=1E-4,
             torque_tol=1E-4)
 
-    @utx.skipIfMissingFeatures("SCAFACOS_DIPOLES")
+    @ut.skipIf(not espressomd.has_features("SCAFACOS_DIPOLES") or
+               "direct" not in espressomd.scafacos.available_methods(),
+               "Skipping test: missing SCAFACOS_DIPOLES or 'direct' method")
     def test_dds_scafacos(self):
         self.check_open_bc(
             self.fcs_data,
diff --git a/testsuite/python/dipolar_interface.py b/testsuite/python/dipolar_interface.py
index 5cf6d5baf2f..33c06aaf822 100644
--- a/testsuite/python/dipolar_interface.py
+++ b/testsuite/python/dipolar_interface.py
@@ -30,8 +30,8 @@ class MagnetostaticsInterface(ut.TestCase):
 
     def setUp(self):
         self.system.box_l = [10., 10., 10.]
-        self.system.part.add(id=0, pos=(0.1, 0.1, 0.1), dip=(1.3, 2.1, -6))
-        self.system.part.add(id=1, pos=(0, 0, 0), dip=(7.3, 6.1, -4))
+        self.system.part.add(pos=(0.0, 0.0, 0.0), dip=(1.3, 2.1, -6))
+        self.system.part.add(pos=(0.1, 0.1, 0.1), dip=(7.3, 6.1, -4))
 
     def tearDown(self):
         self.system.part.clear()
diff --git a/testsuite/python/dipolar_p3m.py b/testsuite/python/dipolar_p3m.py
index 1001f9e46e5..373885a236a 100644
--- a/testsuite/python/dipolar_p3m.py
+++ b/testsuite/python/dipolar_p3m.py
@@ -22,6 +22,7 @@
 import numpy as np
 
 import espressomd.magnetostatics
+import espressomd.magnetostatic_extensions
 
 
 @utx.skipIfMissingFeatures(["DP3M"])
@@ -30,33 +31,13 @@ class MagnetostaticsP3M(ut.TestCase):
 
     def setUp(self):
         self.system.box_l = [10., 10., 10.]
-        self.system.part.add(id=0, pos=(0.1, 0.1, 0.1), dip=(1.3, 2.1, -6))
-        self.system.part.add(id=1, pos=(0, 0, 0), dip=(7.3, 6.1, -4))
+        self.system.part.add(id=0, pos=[4.0, 2.0, 2.0], dip=(1.3, 2.1, -6))
+        self.system.part.add(id=1, pos=[6.0, 2.0, 2.0], dip=(7.3, 6.1, -4))
 
     def tearDown(self):
         self.system.part.clear()
         self.system.actors.clear()
 
-    def ref_values(self, epsilon=np.inf):
-        x = 1. / (1 + 2 * epsilon)
-        dp3m_energy = 1.66706 * x + 1.673333
-        dp3m_torque1 = np.array([-0.5706503 * x + 2.561371,
-                                 -0.1812375 * x + 10.394144,
-                                 -0.2976916 * x + 9.965342])
-        dp3m_torque2 = np.array([+0.3362938 * x + 1.854679,
-                                 -0.2269749 * x - 3.638175,
-                                 +0.5315054 * x + 8.487292])
-        dp3m_force = np.array([-3.54175042, -4.6761059, 9.96632774])
-        alpha, r_cut, mesh, cao = (9.056147262573242, 4.739799499511719, 49, 7)
-        dp3m_params = {'prefactor': 1.1, 'accuracy': 9.995178689932661e-07,
-                       'mesh': mesh, 'mesh_off': [0.5, 0.5, 0.5],
-                       'cao': cao, 'additional_mesh': [0.0, 0.0, 0.0],
-                       'alpha': alpha / 10, 'alpha_L': alpha, 'r_cut': r_cut,
-                       'r_cut_iL': r_cut / self.system.box_l[0],
-                       'cao_cut': 3 * [self.system.box_l[0] / mesh / 2 * cao],
-                       'a': 3 * [self.system.box_l[0] / mesh]}
-        return dp3m_params, dp3m_energy, dp3m_force, dp3m_torque1, dp3m_torque2
-
     if espressomd.has_features("DP3M"):
         test_DP3M = tests_common.generate_test_for_class(
             system, espressomd.magnetostatics.DipolarP3M,
@@ -65,48 +46,95 @@ def ref_values(self, epsilon=np.inf):
 
     def test_dp3m(self):
         self.system.time_step = 0.01
-        self.system.part[0].pos = [1.0, 2.0, 2.0]
-        self.system.part[1].pos = [3.0, 2.0, 2.0]
-        dp3m_params, dp3m_energy, dp3m_force, dp3m_torque1, dp3m_torque2 = self.ref_values()
-        dp3m = espressomd.magnetostatics.DipolarP3M(tune=False, **dp3m_params)
+        prefactor = 1.1
+        box_vol = self.system.volume()
+        p1, p2 = self.system.part[:]
+        dip = np.copy(p1.dip + p2.dip)
+        dp3m_params = {'accuracy': 1e-6,
+                       'mesh': [49, 49, 49],
+                       'cao': 7,
+                       'r_cut': 4.739799499511719,
+                       'alpha': 0.9056147262573242}
+        mdlc_params = {'maxPWerror': 1e-5, 'gap_size': 5.}
+
+        # reference values for energy and force calculated for prefactor = 1.1
+        ref_dp3m_energy = 1.673333
+        ref_dp3m_force = np.array([-3.54175042, -4.6761059, 9.96632774])
+        ref_dp3m_torque1 = np.array([-3.29316117, -13.21245739, -5.33787892])
+        ref_dp3m_torque2 = np.array([3.98103932, -7.47123148, -4.12823244])
+
+        # check metallic case
+        dp3m = espressomd.magnetostatics.DipolarP3M(
+            prefactor=prefactor, epsilon='metallic', tune=False, **dp3m_params)
         self.system.actors.add(dp3m)
-        self.assertAlmostEqual(self.system.analysis.energy()['dipolar'],
-                               dp3m_energy, places=5)
-        # update forces and torques
-        self.system.integrator.run(0)
-        np.testing.assert_allclose(np.copy(self.system.part[0].f),
-                                   dp3m_force, atol=1E-5)
-        np.testing.assert_allclose(np.copy(self.system.part[1].f),
-                                   -dp3m_force, atol=1E-5)
-        np.testing.assert_allclose(np.copy(self.system.part[0].torque_lab),
-                                   dp3m_torque1, atol=1E-5)
-        np.testing.assert_allclose(np.copy(self.system.part[1].torque_lab),
-                                   dp3m_torque2, atol=1E-5)
-
-    def test_dp3m_non_metallic(self):
-        self.system.time_step = 0.01
-        self.system.part[0].pos = [1.0, 2.0, 2.0]
-        self.system.part[1].pos = [3.0, 2.0, 2.0]
-        for epsilon_power in range(-4, 5):
-            epsilon = 10**epsilon_power
-            dp3m_params, dp3m_energy, dp3m_force, dp3m_torque1, dp3m_torque2 = self.ref_values(
-                epsilon)
+        self.system.integrator.run(0, recalc_forces=True)
+        energy = self.system.analysis.energy()['dipolar']
+        tol = 1e-5
+        np.testing.assert_allclose(energy, ref_dp3m_energy, atol=tol)
+        np.testing.assert_allclose(np.copy(p1.f), ref_dp3m_force, atol=tol)
+        np.testing.assert_allclose(np.copy(p2.f), -ref_dp3m_force, atol=tol)
+        np.testing.assert_allclose(
+            np.copy(p1.convert_vector_space_to_body(p1.torque_lab)),
+            ref_dp3m_torque1, atol=tol)
+        np.testing.assert_allclose(
+            np.copy(p2.convert_vector_space_to_body(p2.torque_lab)),
+            ref_dp3m_torque2, atol=tol)
+
+        # keep current values as reference to check for DP3M dipole correction
+        ref_dp3m_energy_metallic = self.system.analysis.energy()['dipolar']
+        ref_dp3m_forces_metallic = np.copy(self.system.part[:].f)
+        ref_dp3m_torque_metallic = np.array([
+            p1.convert_vector_space_to_body(p1.torque_lab),
+            p2.convert_vector_space_to_body(p2.torque_lab)])
+
+        # MDLC cancels out dipole correction
+        mdlc = espressomd.magnetostatic_extensions.DLC(**mdlc_params)
+        self.system.actors.add(mdlc)
+
+        # keep current values as reference to check for MDLC dipole correction
+        self.system.integrator.run(0, recalc_forces=True)
+        ref_mdlc_energy_metallic = self.system.analysis.energy()['dipolar']
+        ref_mdlc_forces_metallic = np.copy(self.system.part[:].f)
+        ref_mdlc_torque_metallic = np.copy(self.system.part[:].torque_lab)
+        self.system.actors.clear()
+
+        # check non-metallic case
+        tol = 1e-10
+        for epsilon in np.power(10., np.arange(-4, 5)):
+            dipole_correction = 4 * np.pi / box_vol / (1 + 2 * epsilon)
+            e_correction = dipole_correction / 2 * np.linalg.norm(dip)**2
+            t_correction = np.cross([p1.dip, p2.dip], dipole_correction * dip)
+            ref_dp3m_energy = ref_dp3m_energy_metallic + prefactor * e_correction
+            ref_dp3m_forces = ref_dp3m_forces_metallic
+            ref_dp3m_torque = ref_dp3m_torque_metallic - prefactor * t_correction
             dp3m = espressomd.magnetostatics.DipolarP3M(
-                tune=False, epsilon=epsilon, **dp3m_params)
+                prefactor=prefactor, epsilon=epsilon, tune=False, **dp3m_params)
             self.system.actors.add(dp3m)
-            self.assertAlmostEqual(self.system.analysis.energy()['dipolar'],
-                                   dp3m_energy, places=5)
-            # update forces and torques
-            self.system.integrator.run(0)
-            np.testing.assert_allclose(np.copy(self.system.part[0].f),
-                                       dp3m_force, atol=1E-5)
-            np.testing.assert_allclose(np.copy(self.system.part[1].f),
-                                       -dp3m_force, atol=1E-5)
-            np.testing.assert_allclose(np.copy(self.system.part[0].torque_lab),
-                                       dp3m_torque1, atol=1E-5)
-            np.testing.assert_allclose(np.copy(self.system.part[1].torque_lab),
-                                       dp3m_torque2, atol=1E-5)
-            self.system.actors.remove(dp3m)
+            self.system.integrator.run(0, recalc_forces=True)
+            dp3m_forces = np.copy(self.system.part[:].f)
+            dp3m_torque = np.array([
+                p1.convert_vector_space_to_body(p1.torque_lab),
+                p2.convert_vector_space_to_body(p2.torque_lab)])
+            dp3m_energy = self.system.analysis.energy()['dipolar']
+            np.testing.assert_allclose(dp3m_forces, ref_dp3m_forces, atol=tol)
+            np.testing.assert_allclose(dp3m_torque, ref_dp3m_torque, atol=tol)
+            np.testing.assert_allclose(dp3m_energy, ref_dp3m_energy, atol=tol)
+
+            # MDLC cancels out dipole correction
+            ref_mdlc_energy = ref_mdlc_energy_metallic
+            ref_mdlc_forces = ref_mdlc_forces_metallic
+            ref_mdlc_torque = ref_mdlc_torque_metallic
+            mdlc = espressomd.magnetostatic_extensions.DLC(**mdlc_params)
+            self.system.actors.add(mdlc)
+            self.system.integrator.run(0, recalc_forces=True)
+            mdlc_forces = np.copy(self.system.part[:].f)
+            mdlc_torque = np.copy(self.system.part[:].torque_lab)
+            mdlc_energy = self.system.analysis.energy()['dipolar']
+            np.testing.assert_allclose(mdlc_forces, ref_mdlc_forces, atol=tol)
+            np.testing.assert_allclose(mdlc_torque, ref_mdlc_torque, atol=tol)
+            np.testing.assert_allclose(mdlc_energy, ref_mdlc_energy, atol=tol)
+
+            self.system.actors.clear()
 
 
 if __name__ == "__main__":
diff --git a/testsuite/python/ek_common.py b/testsuite/python/ek_common.py
index d13765b46f3..b6db63aed52 100644
--- a/testsuite/python/ek_common.py
+++ b/testsuite/python/ek_common.py
@@ -55,7 +55,7 @@ def pressure_tensor_offdiagonal(x, xi, bjerrum_length, force):
 
 # function to calculate the hydrostatic pressure
 
-# Technically, the LB simulates a compressible fluid, whiches pressure
+# Technically, the LB simulates a compressible fluid, whose pressure
 # tensor contains an additional term on the diagonal, proportional to
 # the divergence of the velocity. We neglect this contribution, which
 # creates a small error in the direction normal to the wall, which
@@ -72,22 +72,3 @@ def hydrostatic_pressure(
     offset = ek[int(box_x / (2 * agrid)), int(box_y / (2 * agrid)),
                 int(box_z / (2 * agrid))].pressure[tensor_entry]
     return 0.0 + offset
-
-
-# variant from the nonlinear tests
-def hydrostatic_pressure_non_lin(
-        ek,
-        x,
-        xi,
-        bjerrum_length,
-        tensor_entry,
-        box_x,
-        box_y,
-        box_z,
-        agrid,
-        temperature):
-    offset = ek[int(box_x / (2 * agrid)), int(box_y / (2 * agrid)),
-                int(box_z / (2 * agrid))].pressure[tensor_entry]
-    return temperature * xi * xi * \
-        math.tan(xi * x) * math.tan(xi * x) / \
-        (2.0 * math.pi * bjerrum_length) + offset
diff --git a/testsuite/python/ek_eof_one_species_base.py b/testsuite/python/ek_eof_one_species.py
similarity index 63%
rename from testsuite/python/ek_eof_one_species_base.py
rename to testsuite/python/ek_eof_one_species.py
index a419a8596c2..e7c19634c5e 100644
--- a/testsuite/python/ek_eof_one_species_base.py
+++ b/testsuite/python/ek_eof_one_species.py
@@ -17,29 +17,40 @@
 
 import unittest as ut
 import unittest_decorators as utx
+import pathlib
+
 import sys
 import math
 import numpy as np
+try:
+    import vtk
+    from vtk.util import numpy_support as VN
+    skipIfMissingPythonPackage = utx.no_skip
+except ImportError:
+    skipIfMissingPythonPackage = ut.skip(
+        "Python module vtk not available, skipping test!")
+
 
 import espressomd
 import espressomd.electrokinetics
 import espressomd.shapes
 import ek_common
-from tests_common import DynamicDict
 
 ##########################################################################
 #                          Set up the System                             #
 ##########################################################################
-# Set the slit pore geometry the width is the non-periodic part of the geometry
-# the padding is used to ensure that there is no field inside outside the slit
+# Set the slit pore geometry. The width is the non-periodic part of the
+# geometry. The padding is used to ensure that there is no field outside
+# the slit.
 
-params_base = DynamicDict([
+params_base = dict([
     ('dt', 1.0 / 7),
     ('integration_length', 2300),
     ('agrid', 1. / 3),
     ('density_water', 26.15),
     ('friction', 1.9),
     ('width', 20.0),
+    ('thickness', 3.0),
     ('sigma', -0.04),
     ('padding', 6.0),
     ('force', 0.07),
@@ -47,8 +58,53 @@
     ('viscosity_kinematic', 1.7),
     ('bjerrum_length', 0.8),
     ('sigma', -0.04),
-    ('density_counterions', '-2.0 * sigma / width'),
-    ('valency', 1.0)])
+    ('valency', 1.0),
+])
+params_base['density_counterions'] = -2.0 * \
+    params_base['sigma'] / params_base['width']
+
+axis = "@TEST_SUFFIX@"  # presumably substituted with x, y or z - TODO confirm
+params = {  # geometry settings keyed by the direction of the applied force
+    "x": dict([
+        ('box_x', params_base['thickness']),
+        ('box_y', params_base['thickness']),
+        ('box_z', params_base['width'] + 2 * params_base['padding']),
+        ('ext_force_density', [params_base['force'], 0.0, 0.0]),
+        ('wall_normal_1', [0, 0, 1]),
+        ('wall_normal_2', [0, 0, -1]),
+        ('periodic_dirs', (0, 1)),
+        ('non_periodic_dir', 2),
+        ('n_roll_index', 0),
+        ('calculated_pressure_xy', 0.0),
+        ('calculated_pressure_yz', 0.0)
+    ]),
+    "y": dict([
+        ('box_x', params_base['width'] + 2 * params_base['padding']),
+        ('box_y', params_base['thickness']),
+        ('box_z', params_base['thickness']),
+        ('ext_force_density', [0.0, params_base['force'], 0.0]),
+        ('wall_normal_1', [1, 0, 0]),
+        ('wall_normal_2', [-1, 0, 0]),
+        ('periodic_dirs', (1, 2)),
+        ('non_periodic_dir', 0),
+        ('n_roll_index', 1),
+        ('calculated_pressure_xz', 0.0),
+        ('calculated_pressure_yz', 0.0)
+    ]),
+    "z": dict([
+        ('box_x', params_base['thickness']),
+        ('box_y', params_base['width'] + 2 * params_base['padding']),
+        ('box_z', params_base['thickness']),
+        ('ext_force_density', [0.0, 0.0, params_base['force']]),
+        ('wall_normal_1', [0, 1, 0]),
+        ('wall_normal_2', [0, -1, 0]),
+        ('periodic_dirs', (0, 2)),
+        ('non_periodic_dir', 1),
+        ('n_roll_index', 2),
+        ('calculated_pressure_xy', 0.0),
+        ('calculated_pressure_xz', 0.0)
+    ])
+}[axis]  # keep only the settings for the selected axis
 
 
 def bisection():
@@ -110,16 +166,28 @@ class ek_eof_one_species(ut.TestCase):
     system = espressomd.System(box_l=[1.0, 1.0, 1.0])
     xi = bisection()
 
-    def run_test(self, params):
-        system = self.system
+    def parse_vtk(self, filepath, name, shape):
+        """Read point-data array ``name`` from a structured-points VTK file."""
+        reader = vtk.vtkStructuredPointsReader()
+        reader.SetFileName(filepath)
+        reader.ReadAllVectorsOn()
+        reader.ReadAllScalarsOn()
+        reader.Update()
+        data = reader.GetOutput()
+        points = data.GetPointData()
+        # order='F': reshape using Fortran (column-major) index order
+        return VN.vtk_to_numpy(points.GetArray(name)).reshape(shape, order='F')
+
+    @classmethod
+    def setUpClass(cls):
+        system = cls.system
         system.box_l = [params['box_x'], params['box_y'], params['box_z']]
         system.time_step = params_base['dt']
-        system.thermostat.turn_off()
         system.cell_system.skin = 0.1
         system.thermostat.turn_off()
 
         # Set up the (LB) electrokinetics fluid
-        ek = espressomd.electrokinetics.Electrokinetics(
+        ek = cls.ek = espressomd.electrokinetics.Electrokinetics(
             agrid=params_base['agrid'],
             lb_density=params_base['density_water'],
             viscosity=params_base['viscosity_kinematic'],
@@ -129,7 +197,7 @@ def run_test(self, params):
             params_base['temperature'],
             stencil="linkcentered")
 
-        counterions = espressomd.electrokinetics.Species(
+        counterions = cls.counterions = espressomd.electrokinetics.Species(
             density=params_base['density_counterions'],
             D=0.3,
             valency=params_base['valency'],
@@ -156,6 +224,7 @@ def run_test(self, params):
         # Integrate the system
         system.integrator.run(params_base['integration_length'])
 
+    def test(self):
         # compare the various quantities to the analytic results
         total_velocity_difference = 0.0
         total_density_difference = 0.0
@@ -166,6 +235,9 @@ def run_test(self, params):
         total_pressure_difference_yz = 0.0
         total_pressure_difference_xz = 0.0
 
+        system = self.system
+        ek = self.ek
+        counterions = self.counterions
         for i in range(
                 int(system.box_l[params['non_periodic_dir']] / params_base['agrid'])):
             if (i *
@@ -301,3 +373,90 @@ def run_test(self, params):
                         "Pressure accuracy yz component not achieved")
         self.assertLess(total_pressure_difference_xz, 1.0e-04,
                         "Pressure accuracy xz component not achieved")
+
+    @skipIfMissingPythonPackage
+    def test_vtk(self):
+        """Write VTK files, read them back and compare with the EK state."""
+        ek = self.ek
+        counterions = self.counterions
+        grid_dims = list(
+            map(int, np.round(self.system.box_l / params_base['agrid'])))
+
+        # write VTK files
+        vtk_root = f"vtk_out/ek_eof_{axis}"
+        pathlib.Path(vtk_root).mkdir(parents=True, exist_ok=True)
+        path_vtk_boundary = f"{vtk_root}/boundary.vtk"
+        path_vtk_velocity = f"{vtk_root}/velocity.vtk"
+        path_vtk_potential = f"{vtk_root}/potential.vtk"
+        path_vtk_lbdensity = f"{vtk_root}/lbdensity.vtk"
+        path_vtk_lbforce = f"{vtk_root}/lbforce.vtk"
+        path_vtk_density = f"{vtk_root}/density.vtk"
+        path_vtk_flux = f"{vtk_root}/flux.vtk"
+        path_vtk_flux_link = f"{vtk_root}/flux_link.vtk"
+        if espressomd.has_features('EK_DEBUG'):
+            path_vtk_flux_fluc = f"{vtk_root}/flux_fluc.vtk"
+        ek.write_vtk_boundary(path_vtk_boundary)
+        ek.write_vtk_velocity(path_vtk_velocity)
+        ek.write_vtk_potential(path_vtk_potential)
+        ek.write_vtk_density(path_vtk_lbdensity)
+        ek.write_vtk_lbforce(path_vtk_lbforce)
+        counterions.write_vtk_density(path_vtk_density)
+        counterions.write_vtk_flux(path_vtk_flux)
+        if espressomd.has_features('EK_DEBUG'):
+            counterions.write_vtk_flux_fluc(path_vtk_flux_fluc)
+        counterions.write_vtk_flux_link(path_vtk_flux_link)
+
+        # load VTK files to check they are correctly formatted
+        get_vtk = self.parse_vtk
+        vtk_boundary = get_vtk(path_vtk_boundary, "boundary", grid_dims)
+        vtk_velocity = get_vtk(path_vtk_velocity, "velocity", grid_dims + [3])
+        vtk_potential = get_vtk(path_vtk_potential, "potential", grid_dims)
+        vtk_lbdensity = get_vtk(path_vtk_lbdensity, "density_lb", grid_dims)
+        get_vtk(path_vtk_lbforce, "lbforce", grid_dims + [3])
+        vtk_density = get_vtk(path_vtk_density, "density_1", grid_dims)
+        vtk_flux = get_vtk(path_vtk_flux, "flux_1", grid_dims + [3])
+        if espressomd.has_features('EK_DEBUG'):
+            get_vtk(path_vtk_flux_fluc, "flux_fluc_1", grid_dims + [4])
+        get_vtk(path_vtk_flux_link, "flux_link_1", grid_dims + [13])
+
+        # check VTK files against the EK grid
+        species_density = np.zeros(grid_dims)
+        species_flux = np.zeros(grid_dims + [3])
+        ek_potential = np.zeros(grid_dims)
+        ek_velocity = np.zeros(grid_dims + [3])
+        for i in range(grid_dims[0]):
+            for j in range(grid_dims[1]):
+                for k in range(grid_dims[2]):
+                    index = np.array([i, j, k])
+                    species_density[i, j, k] = counterions[index].density
+                    species_flux[i, j, k] = counterions[index].flux
+                    ek_potential[i, j, k] = ek[index].potential
+                    ek_velocity[i, j, k] = ek[index].velocity
+
+        np.testing.assert_allclose(vtk_velocity, ek_velocity, atol=1e-6)
+        np.testing.assert_allclose(vtk_potential, ek_potential, atol=1e-6)
+        np.testing.assert_allclose(vtk_density, species_density, atol=1e-6)
+        np.testing.assert_allclose(vtk_flux, species_flux, atol=1e-6)
+
+        # check VTK files against the EK parameters
+        dens = params_base['density_water']
+        left_dist = int(params_base['padding'] / params_base['agrid'])
+        right_dist = int(-params_base['padding'] / params_base['agrid'])
+        thickness = int(params_base['thickness'] / params_base['agrid'])
+        i = np.roll([0, 0, right_dist], params['n_roll_index'])
+        j = np.roll([thickness, thickness, left_dist], params['n_roll_index'])
+        mask_left = np.zeros(grid_dims, dtype=bool)
+        mask_left[:j[0], :j[1], :j[2]] = True
+        mask_right = np.zeros(grid_dims, dtype=bool)
+        mask_right[i[0]:, i[1]:, i[2]:] = True
+        mask_outside = np.logical_or(mask_left, mask_right)
+        mask_inside = np.logical_not(mask_outside)
+        np.testing.assert_allclose(vtk_lbdensity[mask_inside], dens, atol=1e-4)
+        np.testing.assert_allclose(vtk_lbdensity[mask_outside], 0, atol=1e-6)
+        np.testing.assert_allclose(vtk_boundary[mask_left], 1, atol=1e-6)
+        np.testing.assert_allclose(vtk_boundary[mask_right], 2, atol=1e-6)
+        np.testing.assert_allclose(vtk_boundary[mask_inside], 0, atol=1e-6)
+
+
+if __name__ == "__main__":
+    ut.main()
diff --git a/testsuite/python/ek_eof_one_species_x.py b/testsuite/python/ek_eof_one_species_x.py
deleted file mode 100644
index 4d06ecaeab4..00000000000
--- a/testsuite/python/ek_eof_one_species_x.py
+++ /dev/null
@@ -1,45 +0,0 @@
-# Copyright (C) 2011-2019 The ESPResSo project
-#
-# This file is part of ESPResSo.
-#
-# ESPResSo is free software: you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-#
-# ESPResSo is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program.  If not, see <http://www.gnu.org/licenses/>.
-
-import unittest as ut
-
-from ek_eof_one_species_base import ek_eof_one_species
-from ek_eof_one_species_base import params_base
-
-params_x = dict([
-    ('box_x', 3.0),
-    ('box_y', 3.0),
-    ('box_z', params_base['width'] + 2 * params_base['padding']),
-    ('ext_force_density', [params_base['force'], 0.0, 0.0]),
-    ('wall_normal_1', [0, 0, 1]),
-    ('wall_normal_2', [0, 0, -1]),
-    ('periodic_dirs', (0, 1)),
-    ('non_periodic_dir', 2),
-    ('n_roll_index', 0),
-    ('calculated_pressure_xy', 0.0),
-    ('calculated_pressure_yz', 0.0)
-])
-
-
-class eof_x(ek_eof_one_species):
-
-    def test(self):
-        self.run_test(params_x)
-
-
-if __name__ == "__main__":
-    ut.main()
diff --git a/testsuite/python/ek_eof_one_species_y.py b/testsuite/python/ek_eof_one_species_y.py
deleted file mode 100644
index 7e72950a1f3..00000000000
--- a/testsuite/python/ek_eof_one_species_y.py
+++ /dev/null
@@ -1,45 +0,0 @@
-# Copyright (C) 2011-2019 The ESPResSo project
-#
-# This file is part of ESPResSo.
-#
-# ESPResSo is free software: you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-#
-# ESPResSo is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program.  If not, see <http://www.gnu.org/licenses/>.
-
-import unittest as ut
-
-from ek_eof_one_species_base import ek_eof_one_species
-from ek_eof_one_species_base import params_base
-
-params_y = dict([
-    ('box_x', params_base['width'] + 2 * params_base['padding']),
-    ('box_y', 3.0),
-    ('box_z', 3.0),
-    ('ext_force_density', [0.0, params_base['force'], 0.0]),
-    ('wall_normal_1', [1, 0, 0]),
-    ('wall_normal_2', [-1, 0, 0]),
-    ('periodic_dirs', (1, 2)),
-    ('non_periodic_dir', 0),
-    ('n_roll_index', 1),
-    ('calculated_pressure_xz', 0.0),
-    ('calculated_pressure_yz', 0.0)
-])
-
-
-class eof_y(ek_eof_one_species):
-
-    def test(self):
-        self.run_test(params_y)
-
-
-if __name__ == "__main__":
-    ut.main()
diff --git a/testsuite/python/ek_eof_one_species_z.py b/testsuite/python/ek_eof_one_species_z.py
deleted file mode 100644
index 2bade76def7..00000000000
--- a/testsuite/python/ek_eof_one_species_z.py
+++ /dev/null
@@ -1,45 +0,0 @@
-# Copyright (C) 2011-2019 The ESPResSo project
-#
-# This file is part of ESPResSo.
-#
-# ESPResSo is free software: you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-#
-# ESPResSo is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program.  If not, see <http://www.gnu.org/licenses/>.
-
-import unittest as ut
-
-from ek_eof_one_species_base import ek_eof_one_species
-from ek_eof_one_species_base import params_base
-
-params_z = dict([
-    ('box_x', 3.0),
-    ('box_y', params_base['width'] + 2 * params_base['padding']),
-    ('box_z', 3.0),
-    ('ext_force_density', [0.0, 0.0, params_base['force']]),
-    ('wall_normal_1', [0, 1, 0]),
-    ('wall_normal_2', [0, -1, 0]),
-    ('periodic_dirs', (0, 2)),
-    ('non_periodic_dir', 1),
-    ('n_roll_index', 2),
-    ('calculated_pressure_xy', 0.0),
-    ('calculated_pressure_xz', 0.0)
-])
-
-
-class eof_z(ek_eof_one_species):
-
-    def test(self):
-        self.run_test(params_z)
-
-
-if __name__ == "__main__":
-    ut.main()
diff --git a/testsuite/python/elc.py b/testsuite/python/elc.py
index c1d9a9085ab..32fe7754935 100644
--- a/testsuite/python/elc.py
+++ b/testsuite/python/elc.py
@@ -16,8 +16,7 @@
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 import unittest as ut
 import unittest_decorators as utx
-import espressomd
-from espressomd import electrostatics, electrostatic_extensions
+import espressomd.electrostatics
 
 import numpy as np
 
@@ -33,32 +32,31 @@ class ElcTest(ut.TestCase):
     system.cell_system.skin = 0.0
 
     def test_finite_potential_drop(self):
-        s = self.system
+        system = self.system
 
-        p1 = s.part.add(pos=[0, 0, 1], q=+1)
-        p2 = s.part.add(pos=[0, 0, 9], q=-1)
+        p1 = system.part.add(pos=[0, 0, 1], q=+1)
+        p2 = system.part.add(pos=[0, 0, 9], q=-1)
 
-        s.actors.add(
-            electrostatics.P3M(
-                # zero is not allowed
-                prefactor=1e-100,
-                mesh=32,
-                cao=5,
-                accuracy=1e-3,
-            ))
-
-        s.actors.add(
-            electrostatic_extensions.ELC(
-                gap_size=GAP[2],
-                maxPWerror=1e-3,
-                delta_mid_top=-1,
-                delta_mid_bot=-1,
-                const_pot=1,
-                pot_diff=POTENTIAL_DIFFERENCE,
-            ))
+        p3m = espressomd.electrostatics.P3M(
+            # zero is not allowed
+            prefactor=1e-100,
+            mesh=32,
+            cao=5,
+            accuracy=1e-3,
+        )
+        elc = espressomd.electrostatics.ELC(
+            p3m_actor=p3m,
+            gap_size=GAP[2],
+            maxPWerror=1e-3,
+            delta_mid_top=-1,
+            delta_mid_bot=-1,
+            const_pot=1,
+            pot_diff=POTENTIAL_DIFFERENCE,
+        )
+        system.actors.add(elc)
 
         # Calculated energy
-        U_elc = s.analysis.energy()['coulomb']
+        U_elc = system.analysis.energy()['coulomb']
 
         # Expected E-Field is voltage drop over the box
         E_expected = POTENTIAL_DIFFERENCE / (BOX_L[2] - GAP[2])
@@ -67,7 +65,7 @@ def test_finite_potential_drop(self):
 
         self.assertAlmostEqual(U_elc, U_expected)
 
-        s.integrator.run(0)
+        system.integrator.run(0)
         self.assertAlmostEqual(E_expected, p1.f[2] / p1.q)
         self.assertAlmostEqual(E_expected, p2.f[2] / p2.q)
 
@@ -76,14 +74,14 @@ def test_finite_potential_drop(self):
         p1.pos = [BOX_L[0] / 2, BOX_L[1] / 2, BOX_L[2] - GAP[2] / 2]
         with self.assertRaises(Exception):
             self.system.analysis.energy()
-        with self.assertRaises(Exception):
-            self.integrator.run(2)
+        with self.assertRaisesRegex(Exception, 'entered ELC gap region'):
+            self.system.integrator.run(2)
         # negative direction
         p1.pos = [BOX_L[0] / 2, BOX_L[1] / 2, -GAP[2] / 2]
         with self.assertRaises(Exception):
             self.system.analysis.energy()
-        with self.assertRaises(Exception):
-            self.integrator.run(2)
+        with self.assertRaisesRegex(Exception, 'entered ELC gap region'):
+            self.system.integrator.run(2)
 
 
 if __name__ == "__main__":
diff --git a/testsuite/python/elc_vs_analytic.py b/testsuite/python/elc_vs_analytic.py
index 8f186608061..d5bfbb28764 100644
--- a/testsuite/python/elc_vs_analytic.py
+++ b/testsuite/python/elc_vs_analytic.py
@@ -19,7 +19,6 @@
 import espressomd
 import numpy as np
 import espressomd.electrostatics
-from espressomd import electrostatic_extensions
 
 
 @utx.skipIfMissingFeatures(["P3M"])
@@ -62,12 +61,11 @@ def test_elc(self):
                                             accuracy=self.accuracy,
                                             mesh=[58, 58, 70],
                                             cao=4)
-        self.system.actors.add(p3m)
-
-        elc = electrostatic_extensions.ELC(gap_size=self.elc_gap,
-                                           maxPWerror=self.accuracy,
-                                           delta_mid_bot=self.delta_mid_bot,
-                                           delta_mid_top=self.delta_mid_top)
+        elc = espressomd.electrostatics.ELC(p3m_actor=p3m,
+                                            gap_size=self.elc_gap,
+                                            maxPWerror=self.accuracy,
+                                            delta_mid_bot=self.delta_mid_bot,
+                                            delta_mid_top=self.delta_mid_top)
         self.system.actors.add(elc)
 
         elc_results = self.scan()
diff --git a/testsuite/python/electrostaticInteractions.py b/testsuite/python/electrostaticInteractions.py
index ee676971d4d..a59b0da1992 100644
--- a/testsuite/python/electrostaticInteractions.py
+++ b/testsuite/python/electrostaticInteractions.py
@@ -27,14 +27,13 @@
 @utx.skipIfMissingFeatures(["ELECTROSTATICS"])
 class ElectrostaticInteractionsTests(ut.TestCase):
     # Handle to espresso system
-    system = espressomd.System(box_l=[1.0, 1.0, 1.0])
+    system = espressomd.System(box_l=[20., 20., 20.])
 
     def setUp(self):
-        self.system.box_l = [20, 20, 20]
         self.system.time_step = 0.01
 
-        self.system.part.add(id=0, pos=(1.0, 2.0, 2.0), q=1)
-        self.system.part.add(id=1, pos=(3.0, 2.0, 2.0), q=-1)
+        self.system.part.add(id=0, pos=(9.0, 2.0, 2.0), q=1)
+        self.system.part.add(id=1, pos=(11.0, 2.0, 2.0), q=-1)
 
     def tearDown(self):
         self.system.part.clear()
@@ -78,51 +77,53 @@ def calc_rf_potential(self, r, rf_params):
 
     @utx.skipIfMissingFeatures(["P3M"])
     def test_p3m(self):
-        prefactor = 1.1
-        self.system.part[0].pos = [1.0, 2.0, 2.0]
-        self.system.part[1].pos = [3.0, 2.0, 2.0]
-        # results, reference values for energy and force only calculated for
-        # prefactor = 1
-        p3m_energy = -0.501062398379 * prefactor
-        p3m_force = 2.48921612e-01 * prefactor
-        p3m = espressomd.electrostatics.P3M(prefactor=prefactor,
-                                            accuracy=9.910945054074526e-08,
-                                            mesh=[22, 22, 22],
-                                            cao=7,
-                                            r_cut=8.906249999999998,
-                                            alpha=0.387611049779351,
-                                            tune=False)
-        self.system.actors.add(p3m)
-        self.assertAlmostEqual(self.system.analysis.energy()['coulomb'],
-                               p3m_energy, places=5)
-        # need to update forces
-        self.system.integrator.run(0)
-        np.testing.assert_allclose(np.copy(self.system.part[0].f),
-                                   [p3m_force, 0, 0], atol=1E-4)
-        np.testing.assert_allclose(np.copy(self.system.part[1].f),
-                                   [-p3m_force, 0, 0], atol=1E-5)
-
-    @utx.skipIfMissingFeatures(["P3M"])
-    def test_p3m_non_metallic(self):
         prefactor = 1.1
         box_vol = self.system.volume()
-        self.system.part[0].pos = [1.0, 2.0, 2.0]
-        self.system.part[1].pos = [3.0, 2.0, 2.0]
-        for epsilon_power in range(-4, 5):
-            epsilon = 10**epsilon_power
-            p3m_energy = np.pi / box_vol * 16 / (1 + 2 * epsilon) - 0.501
-            p3m_energy *= prefactor
-            p3m = espressomd.electrostatics.P3M(prefactor=prefactor,
-                                                accuracy=9.910945054074526e-08,
-                                                mesh=[22, 22, 22],
-                                                cao=7,
-                                                epsilon=epsilon,
-                                                r_cut=8.906249999999998,
-                                                alpha=0.387611049779351,
-                                                tune=False)
+        p1, p2 = self.system.part[:]
+        dip = np.copy(p1.q * p1.pos + p2.q * p2.pos)
+        p3m_params = {'accuracy': 1e-7,
+                      'mesh': [22, 22, 22],
+                      'cao': 7,
+                      'r_cut': 8.906249999999998,
+                      'alpha': 0.387611049779351}
+
+        # reference values for energy and force calculated for prefactor = 1
+        ref_energy = -0.501062398379 * prefactor
+        ref_force1 = [0.248921612 * prefactor, 0, 0]
+        ref_force2 = [-ref_force1[0], 0, 0]
+
+        # check metallic case
+        p3m = espressomd.electrostatics.P3M(
+            prefactor=prefactor, epsilon='metallic', tune=False, **p3m_params)
+        self.system.actors.add(p3m)
+        self.system.integrator.run(0, recalc_forces=True)
+        p3m_energy = self.system.analysis.energy()['coulomb']
+        tol = 1e-5
+        np.testing.assert_allclose(p3m_energy, ref_energy, atol=tol)
+        np.testing.assert_allclose(np.copy(p1.f), ref_force1, atol=tol)
+        np.testing.assert_allclose(np.copy(p2.f), ref_force2, atol=tol)
+
+        # keep current values as reference to check for P3M dipole correction
+        ref_energy_metallic = self.system.analysis.energy()['coulomb']
+        ref_forces_metallic = np.copy(self.system.part[:].f)
+        self.system.actors.remove(p3m)
+
+        # check non-metallic case
+        tol = 1e-10
+        for epsilon in np.power(10., np.arange(-4, 5)):
+            dipole_correction = 4 * np.pi / box_vol / (1 + 2 * epsilon)
+            energy_correction = dipole_correction * np.linalg.norm(dip)**2
+            forces_correction = np.outer([p1.q, p2.q], dipole_correction * dip)
+            ref_energy = ref_energy_metallic + prefactor * energy_correction
+            ref_forces = ref_forces_metallic - prefactor * forces_correction
+            p3m = espressomd.electrostatics.P3M(
+                prefactor=prefactor, epsilon=epsilon, tune=False, **p3m_params)
             self.system.actors.add(p3m)
-            self.assertAlmostEqual(self.system.analysis.energy()['coulomb'],
-                                   p3m_energy, places=3)
+            self.system.integrator.run(0, recalc_forces=True)
+            p3m_forces = np.array([p1.f, p2.f])
+            p3m_energy = self.system.analysis.energy()['coulomb']
+            np.testing.assert_allclose(p3m_energy, ref_energy, atol=tol)
+            np.testing.assert_allclose(p3m_forces, ref_forces, atol=tol)
             self.system.actors.remove(p3m)
 
     def test_dh(self):
diff --git a/testsuite/python/engine_langevin.py b/testsuite/python/engine_langevin.py
index 5b3f9cf9eb5..a6969bb1b8a 100644
--- a/testsuite/python/engine_langevin.py
+++ b/testsuite/python/engine_langevin.py
@@ -52,7 +52,7 @@ def z_v(t, z0):
 
         S.part.add(id=0, pos=pos_0, swimming={"v_swim": v_swim})
         S.part.add(id=1, pos=pos_1, swimming={"f_swim": f_swim})
-        S.part[:].rotation = 1, 1, 1
+        S.part[:].rotation = (1, 1, 1)
 
         S.thermostat.set_langevin(kT=temp, gamma=gamma, seed=42)
 
diff --git a/testsuite/python/field_test.py b/testsuite/python/field_test.py
index 3cf32e30658..18c56e33868 100644
--- a/testsuite/python/field_test.py
+++ b/testsuite/python/field_test.py
@@ -89,6 +89,14 @@ def test_linear_electric_potential(self):
         self.assertAlmostEqual(self.system.analysis.energy()['total'],
                                self.system.analysis.energy()['external_fields'])
 
+        np.testing.assert_allclose(
+            electric_field.call_method("_eval_field", x=[0, 0, 0]), phi0)
+        np.testing.assert_allclose(
+            electric_field.call_method("_eval_field", x=[3, 2, 1]),
+            np.dot(-E, [3, 2, 1]) + phi0)
+        np.testing.assert_allclose(
+            electric_field.call_method("_eval_jacobian", x=[3, 2, 1]), -E)
+
     @utx.skipIfMissingFeatures("ELECTROSTATICS")
     def test_electric_plane_wave(self):
         E0 = np.array([1., -2., 3.])
@@ -146,10 +154,18 @@ def test_potential_field(self):
             box, h, self.potential)
 
         F = constraints.PotentialField(field=field_data, grid_spacing=h,
+                                       particle_scales={1: 0.0},
                                        default_scale=scaling)
 
         p = self.system.part.add(pos=[0, 0, 0])
+        self.system.part.add(pos=[1, 0, 0])
         self.system.constraints.add(F)
+        self.assertAlmostEqual(F.default_scale, scaling, delta=1e-9)
+        self.assertEqual(F.particle_scales, {1: 0.0})
+        with self.assertRaisesRegex(RuntimeError, 'Parameter default_scale is read-only'):
+            F.default_scale = 2.0
+        with self.assertRaisesRegex(RuntimeError, 'Parameter particle_scales is read-only'):
+            F.particle_scales = {0: 0.0}
 
         for i in product(*map(range, 3 * [10])):
             x = (h * i)
@@ -198,10 +214,17 @@ def test_force_field(self):
         field_data = constraints.ForceField.field_from_fn(box, h, self.force)
 
         F = constraints.ForceField(field=field_data, grid_spacing=h,
+                                   particle_scales={1: 0.0},
                                    default_scale=scaling)
 
         p = self.system.part.add(pos=[0, 0, 0])
         self.system.constraints.add(F)
+        self.assertAlmostEqual(F.default_scale, scaling, delta=1e-9)
+        self.assertEqual(F.particle_scales, {1: 0.0})
+        with self.assertRaisesRegex(RuntimeError, 'Parameter default_scale is read-only'):
+            F.default_scale = 2.0
+        with self.assertRaisesRegex(RuntimeError, 'Parameter particle_scales is read-only'):
+            F.particle_scales = {0: 0.0}
 
         for i in product(*map(range, 3 * [10])):
             x = (h * i)
@@ -226,6 +249,8 @@ def test_flow_field(self):
 
         p = self.system.part.add(pos=[0, 0, 0], v=[1, 2, 3])
         self.system.constraints.add(F)
+        with self.assertRaisesRegex(RuntimeError, 'Parameter gamma is read-only'):
+            F.gamma = 2.0
 
         for i in product(*map(range, 3 * [10])):
             x = (h * i)
diff --git a/testsuite/python/gpu_availability.py b/testsuite/python/gpu_availability.py
index 03d30cb44d1..e8638b768d8 100644
--- a/testsuite/python/gpu_availability.py
+++ b/testsuite/python/gpu_availability.py
@@ -17,21 +17,62 @@
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 #
 import unittest as ut
+import unittest_decorators as utx
 import espressomd
 
 
 class GPUAvailability(ut.TestCase):
 
     """Tests consistency of GPU availability reporting."""
+    system = espressomd.System(box_l=[1, 1, 1])
 
     def test(self):
         if espressomd.has_features("CUDA"):
-            system = espressomd.System(box_l=[1, 1, 1])
-            self.assertEqual(system.cuda_init_handle.device_list != {},
+            self.assertEqual(self.system.cuda_init_handle.list_devices() != {},
                              espressomd.gpu_available())
+            self.assertEqual(
+                self.system.cuda_init_handle.list_devices_properties() != {},
+                espressomd.gpu_available())
         else:
             self.assertFalse(espressomd.gpu_available())
 
+    @utx.skipIfMissingFeatures("CUDA")
+    def test_exceptions(self):
+        handle, pattern = self.system.cuda_init_handle, 'CUDA error: '
+        if espressomd.gpu_available():
+            n_devices = len(handle.list_devices())
+            with self.assertRaisesRegex(RuntimeError, pattern):
+                handle.device = n_devices + 1  # index past the device count
+        else:
+            with self.assertRaisesRegex(RuntimeError, pattern):
+                handle.device  # reading the property is expected to raise
+            with self.assertRaisesRegex(RuntimeError, pattern):
+                handle.device = 0
+
+    @utx.skipIfMissingGPU()
+    def test_list_devices(self):
+        """Cross-check the two GPU listings and the active device."""
+        handle = self.system.cuda_init_handle
+        names_by_id = handle.list_devices()
+        props_nested = handle.list_devices_properties()
+        # exactly one top-level entry is expected in the properties dict
+        self.assertEqual(len(props_nested), 1)
+        props_by_id = next(iter(props_nested.values()))
+        prop_keys = {'name', 'compute_capability', 'cores', 'total_memory'}
+        # both listings must expose the same device ids
+        self.assertEqual(names_by_id.keys(), props_by_id.keys())
+        for gpu_id in names_by_id:
+            self.assertEqual(props_by_id[gpu_id].keys(), prop_keys)
+            self.assertEqual(props_by_id[gpu_id]['name'], names_by_id[gpu_id])
+        # the currently active device must be listed with sane properties
+        active_id = handle.device
+        self.assertIn(active_id, props_by_id)
+        active = props_by_id[active_id]
+        self.assertGreater(active['cores'], 0)
+        self.assertGreater(active['total_memory'], 0)
+        self.assertGreaterEqual(active['compute_capability'][0], 3)
+        self.assertGreaterEqual(active['compute_capability'][1], 0)
+
 
 if __name__ == "__main__":
     ut.main()
diff --git a/testsuite/python/icc.py b/testsuite/python/icc.py
index 7e98d658cfa..f97d5d88cfa 100644
--- a/testsuite/python/icc.py
+++ b/testsuite/python/icc.py
@@ -17,98 +17,162 @@
 import unittest as ut
 import unittest_decorators as utx
 import espressomd
+import numpy as np
 
 
-@utx.skipIfMissingFeatures(["P3M", "EXTERNAL_FORCES"])
+@utx.skipIfMissingFeatures(["ELECTROSTATICS", "EXTERNAL_FORCES"])
 class test_icc(ut.TestCase):
+    system = espressomd.System(box_l=[10, 10, 10])
+
+    def tearDown(self):
+        self.system.actors.clear()  # reset the actor list of the shared system
+        self.system.part.clear()  # then remove all particles
+
+    def add_icc_particles(self, side_num_particles,
+                          initial_charge, z_position):
+        """
+        Add a square lattice of fixed ICC particles at height z_position.
+
+        Returns the result of ``part.add``, the normals and the areas.
+        """
+        box_x, box_y = self.system.box_l[:2]
+        n_part = side_num_particles**2
+        # each particle is assigned an equal share of the xy box area
+        areas = np.full(n_part, box_x * box_y / n_part)
+        # every normal is the unit vector along +z
+        normals = np.tile([0., 0., 1.], (n_part, 1))
+
+        # lattice coordinates start at 0 and exclude the upper box edge
+        grid_x = np.linspace(0, box_x, side_num_particles, endpoint=False)
+        grid_y = np.linspace(0, box_y, side_num_particles, endpoint=False)
+        mesh_x, mesh_y = np.meshgrid(grid_x, grid_y)
+        mesh_z = np.full_like(mesh_x, z_position)
+        positions = np.stack((mesh_x, mesh_y, mesh_z), axis=-1).reshape(-1, 3)
+
+        # the fix flag is enabled along all three axes
+        particles = self.system.part.add(
+            pos=positions,
+            q=np.full(n_part, initial_charge),
+            fix=[(True, True, True)] * n_part,
+        )
+        return particles, normals, areas
+
+    def common_setup(self, kwargs, error):
+        """Adding ICC with the ``kwargs`` overrides must raise ``error``."""
+        from espressomd.electrostatic_extensions import ICC
+
+        self.tearDown()
+        icc_parts, icc_normals, icc_areas = self.add_icc_particles(2, 0.01, 0)
+        # baseline parameters; entries of ``kwargs`` take precedence
+        icc_kwargs = dict(
+            n_icc=len(icc_parts),
+            normals=icc_normals,
+            areas=icc_areas,
+            epsilons=np.ones_like(icc_areas),
+            first_id=icc_parts.id[0],
+            check_neutrality=False)
+        icc_kwargs.update(kwargs)
+        icc_actor = ICC(**icc_kwargs)
+        with self.assertRaisesRegex(Exception, error):
+            self.system.actors.add(icc_actor)
+
+    def test_params(self):
+        """Each invalid ICC parameter must be rejected with its message."""
+        cases = [({"n_icc": -1}, 'ICC: invalid number of particles'),
+                 ({"first_id": -1}, 'ICC: invalid first_id'),
+                 ({"max_iterations": -1}, 'ICC: invalid max_iterations'),
+                 ({"convergence": -1}, 'ICC: invalid convergence value'),
+                 ({"relaxation": -1}, 'ICC: invalid relaxation value'),
+                 ({"relaxation": 2.1}, 'ICC: invalid relaxation value'),
+                 ({"eps_out": -1}, 'ICC: invalid eps_out'),
+                 ({"ext_field": 0}, 'A single value was given but 3 were expected')]
+        for overrides, message in cases:
+            self.common_setup(overrides, message)
+
+    def test_core_params(self):
+        from espressomd.electrostatic_extensions import ICC
+
+        self.tearDown()
+        part_slice, normals, areas = self.add_icc_particles(5, 0.01, 0)
 
-    def runTest(self):
+        params = {"n_icc": len(part_slice),
+                  "normals": normals,
+                  "areas": areas,
+                  "epsilons": np.ones_like(areas),
+                  "first_id": part_slice.id[0],
+                  "check_neutrality": False}
+
+        icc = ICC(**params)
+        self.system.actors.add(icc)
+
+        icc_params = icc.get_params()
+        for key, value in params.items():
+            np.testing.assert_allclose(value, np.copy(icc_params[key]))
+
+    @utx.skipIfMissingFeatures(["P3M"])
+    def test_dipole_system(self):
         from espressomd.electrostatics import P3M
         from espressomd.electrostatic_extensions import ICC
 
-        S = espressomd.System(box_l=[1.0, 1.0, 1.0])
-        # Parameters
-        box_l = 20.0
-        nicc = 10
-        q_test = 10.0
-        q_dist = 5.0
-
-        # System
-        S.box_l = [box_l, box_l, box_l + 5.0]
-        S.cell_system.skin = 0.4
-        S.time_step = 0.01
-
-        # ICC particles
-        nicc_per_electrode = nicc * nicc
-        nicc_tot = 2 * nicc_per_electrode
-        iccArea = box_l * box_l / nicc_per_electrode
-
-        iccNormals = []
-        iccAreas = []
-        iccSigmas = []
-        iccEpsilons = []
-
-        l = box_l / nicc
-        for xi in range(nicc):
-            for yi in range(nicc):
-                S.part.add(pos=[l * xi, l * yi, 0], q=-0.0001, fix=[1, 1, 1])
-                iccNormals.append([0, 0, 1])
-
-        for xi in range(nicc):
-            for yi in range(nicc):
-                S.part.add(pos=[l * xi, l * yi, box_l],
-                           q=0.0001, fix=[1, 1, 1])
-                iccNormals.append([0, 0, -1])
-
-        iccAreas.extend([iccArea] * nicc_tot)
-        iccSigmas.extend([0] * nicc_tot)
-        iccEpsilons.extend([10000000] * nicc_tot)
-
-        # Test Dipole
-        b2 = box_l * 0.5
-        S.part.add(pos=[b2, b2, b2 - q_dist / 2], q=q_test, fix=[1, 1, 1])
-        S.part.add(pos=[b2, b2, b2 + q_dist / 2], q=-q_test, fix=[1, 1, 1])
-
-        # Actors
+        BOX_L = 20.
+        BOX_SPACE = 5.
+
+        self.tearDown()
+        self.system.box_l = [BOX_L, BOX_L, BOX_L + BOX_SPACE]
+        self.system.cell_system.skin = 0.4
+        self.system.time_step = 0.01
+
+        N_ICC_SIDE_LENGTH = 10
+        DIPOLE_DISTANCE = 5.0
+        DIPOLE_CHARGE = 10.0
+
+        part_slice_lower, normals_lower, areas_lower = self.add_icc_particles(
+            N_ICC_SIDE_LENGTH, -0.0001, 0.)
+        part_slice_upper, normals_upper, areas_upper = self.add_icc_particles(
+            N_ICC_SIDE_LENGTH, 0.0001, BOX_L)
+
+        assert (part_slice_upper.id[-1] - part_slice_lower.id[0] +
+                1) == 2 * N_ICC_SIDE_LENGTH**2, "ICC particles not continuous"
+
+        normals = np.vstack((normals_lower, -normals_upper))
+        areas = np.hstack((areas_lower, areas_upper))
+        epsilons = np.full_like(areas, 1e8)
+        sigmas = np.zeros_like(areas)
+
+        icc = ICC(n_icc=2 * N_ICC_SIDE_LENGTH**2,
+                  normals=normals,
+                  areas=areas,
+                  epsilons=epsilons,
+                  sigmas=sigmas,
+                  convergence=1e-6,
+                  max_iterations=100,
+                  first_id=part_slice_lower.id[0],
+                  eps_out=1.,
+                  relaxation=0.75,
+                  ext_field=[0, 0, 0])
+
+        # Dipole in the center of the simulation box
+        BOX_L_HALF = BOX_L / 2
+
+        self.system.part.add(pos=[BOX_L_HALF, BOX_L_HALF, BOX_L_HALF - DIPOLE_DISTANCE / 2],
+                             q=DIPOLE_CHARGE, fix=[True, True, True])
+        self.system.part.add(pos=[BOX_L_HALF, BOX_L_HALF, BOX_L_HALF + DIPOLE_DISTANCE / 2],
+                             q=-DIPOLE_CHARGE, fix=[True, True, True])
+
         p3m = P3M(prefactor=1, mesh=32, cao=7, accuracy=1e-5)
-        icc = ICC(
-            n_icc=nicc_tot,
-            convergence=1e-6,
-            relaxation=0.75,
-            ext_field=[0, 0, 0],
-            max_iterations=100,
-            first_id=0,
-            eps_out=1,
-            normals=iccNormals,
-            areas=iccAreas,
-            sigmas=iccSigmas,
-            epsilons=iccEpsilons)
-
-        S.actors.add(p3m)
-        S.actors.add(icc)
-
-        # Run
-        S.integrator.run(0)
-
-        # Analyze
-        QL = sum(S.part[:nicc_per_electrode].q)
-        QR = sum(S.part[nicc_per_electrode:nicc_tot].q)
-
-        testcharge_dipole = q_test * q_dist
-        induced_dipole = 0.5 * (abs(QL) + abs(QR)) * box_l
-
-        # Result
-        self.assertAlmostEqual(1, induced_dipole / testcharge_dipole, places=4)
 
-        # Test applying changes
-        enegry_pre_change = S.analysis.energy()['total']
-        pressure_pre_change = S.analysis.pressure()['total']
-        icc.set_params(sigmas=[2.0] * nicc_tot)
-        icc.set_params(epsilons=[20.0] * nicc_tot)
-        enegry_post_change = S.analysis.energy()['total']
-        pressure_post_change = S.analysis.pressure()['total']
-        self.assertNotAlmostEqual(enegry_pre_change, enegry_post_change)
-        self.assertNotAlmostEqual(pressure_pre_change, pressure_post_change)
+        self.system.actors.add(p3m)
+        self.system.actors.add(icc)
+
+        self.system.integrator.run(0)
+
+        charge_lower = sum(part_slice_lower.q)
+        charge_upper = sum(part_slice_upper.q)
+
+        testcharge_dipole = DIPOLE_CHARGE * DIPOLE_DISTANCE
+        induced_dipole = 0.5 * (abs(charge_lower) + abs(charge_upper)) * BOX_L
+
+        self.assertAlmostEqual(1, induced_dipole / testcharge_dipole, places=4)
 
 
 if __name__ == "__main__":
diff --git a/testsuite/python/lb_electrohydrodynamics.py b/testsuite/python/lb_electrohydrodynamics.py
index a716e5a8d5c..126f8f351dc 100644
--- a/testsuite/python/lb_electrohydrodynamics.py
+++ b/testsuite/python/lb_electrohydrodynamics.py
@@ -40,9 +40,6 @@ def setUp(self):
         self.s.time_step = self.params['time_step']
         self.s.cell_system.skin = self.params['skin']
 
-        for i in self.s.actors:
-            self.s.actors.remove(i)
-
         self.lbf = self.lb.LBFluid(
             visc=self.params['viscosity'],
             dens=self.params['dens'],
@@ -56,6 +53,9 @@ def setUp(self):
             LB_fluid=self.lbf,
             gamma=self.params['friction'])
 
+    def tearDown(self):
+        self.s.actors.clear()
+
     def test(self):
         s = self.s
 
diff --git a/testsuite/python/lb_poiseuille_cylinder.py b/testsuite/python/lb_poiseuille_cylinder.py
index 981c0c23e28..0607a60641f 100644
--- a/testsuite/python/lb_poiseuille_cylinder.py
+++ b/testsuite/python/lb_poiseuille_cylinder.py
@@ -18,6 +18,7 @@
 import unittest_decorators as utx
 import numpy as np
 
+import espressomd.math
 import espressomd.lb
 import espressomd.lbboundaries
 import espressomd.observables
@@ -81,7 +82,8 @@ class LBPoiseuilleCommon:
     system = espressomd.System(box_l=[BOX_L] * 3)
     system.time_step = TIME_STEP
     system.cell_system.skin = 0.4 * AGRID
-    params = {'axis': [0, 0, 1]}
+    params = {'axis': [0, 0, 1],
+              'orientation': [1, 0, 0]}
 
     def prepare(self):
         """
@@ -150,8 +152,10 @@ def prepare_obs(self):
         else:
             obs_center = [BOX_L / 2.0, BOX_L / 2.0, 0.0]
         local_obs_params = OBS_PARAMS.copy()
-        local_obs_params['center'] = obs_center
-        local_obs_params['axis'] = self.params['axis']
+        ctp = espressomd.math.CylindricalTransformationParameters(center=obs_center,
+                                                                  axis=self.params['axis'],
+                                                                  orientation=self.params['orientation'])
+        local_obs_params['transform_params'] = ctp
         obs = espressomd.observables.CylindricalLBVelocityProfile(
             **local_obs_params)
         self.accumulator = espressomd.accumulators.MeanVarianceCalculator(
@@ -178,16 +182,19 @@ def check_observable(self):
 
     def test_x(self):
         self.params['axis'] = [1, 0, 0]
+        self.params['orientation'] = [0, 0, -1]
         self.compare_to_analytical()
         self.check_observable()
 
     def test_y(self):
         self.params['axis'] = [0, 1, 0]
+        self.params['orientation'] = [1, 0, 0]
         self.compare_to_analytical()
         self.check_observable()
 
     def test_z(self):
         self.params['axis'] = [0, 0, 1]
+        self.params['orientation'] = [1, 0, 0]
         self.compare_to_analytical()
         self.check_observable()
 
diff --git a/testsuite/python/observable_cylindrical.py b/testsuite/python/observable_cylindrical.py
index 2e6e7a79ec2..84b33d810f2 100644
--- a/testsuite/python/observable_cylindrical.py
+++ b/testsuite/python/observable_cylindrical.py
@@ -18,6 +18,7 @@
 import unittest as ut
 import espressomd
 import espressomd.observables
+import espressomd.math
 import tests_common
 
 
@@ -31,13 +32,15 @@ class TestCylindricalObservable(ut.TestCase):
     system.time_step = 0.01
     system.cell_system.skin = 0.4
 
+    cyl_transform_params = espressomd.math.CylindricalTransformationParameters(
+        center=3 * [7.5], axis=[1 / np.sqrt(2), 1 / np.sqrt(2), 0], orientation=[0, 0, 1])
+
     params = {
-        'ids': list(range(100)),
-        'center': [7.5, 7.5, 7.5],  # center of the histogram
-        'axis': 'y',
-        'n_r_bins': 4,  # number of bins in r
-        'n_phi_bins': 4,  # -*- in phi
-        'n_z_bins': 4,  # -*- in z
+        'ids': None,
+        'transform_params': cyl_transform_params,
+        'n_r_bins': 4,
+        'n_phi_bins': 3,
+        'n_z_bins': 4,
         'min_r': 0.0,
         'min_phi': -np.pi,
         'min_z': -5.0,
@@ -46,180 +49,162 @@ class TestCylindricalObservable(ut.TestCase):
         'max_z': 5.0,
     }
 
+    v_r = 0.6
+    v_phi = 0.7
+    v_z = 0.8
+
     def tearDown(self):
         self.system.part.clear()
 
-    def swap_axis(self, arr, axis):
-        if axis == 'x':
-            arr = np.dot(
-                tests_common.rotation_matrix([0, 1, 0], np.pi / 2.0), arr)
-        elif axis == 'y':
-            arr = np.dot(
-                tests_common.rotation_matrix([1, 0, 0], -np.pi / 2.0), arr)
-        return arr
-
-    def swap_axis_inverse(self, arr, axis):
-        if axis == 'x':
-            arr = np.dot(
-                tests_common.rotation_matrix([0, 1, 0], -np.pi / 2.0), arr)
-        elif axis == 'y':
-            arr = np.dot(
-                tests_common.rotation_matrix([1, 0, 0], np.pi / 2.0), arr)
-        return arr
-
-    def pol_coords(self):
-        positions = np.zeros((len(self.params['ids']), 3))
-        for i, p in enumerate(self.system.part):
-            tmp = p.pos - np.array(self.params['center'])
-            tmp = self.swap_axis_inverse(tmp, self.params['axis'])
-            positions[
-                i, :] = tests_common.transform_pos_from_cartesian_to_polar_coordinates(tmp)
-        return positions
-
-    def set_particles(self):
-        self.system.part.clear()
-        # Parameters for an ellipse.
-        a = 1.0  # semi minor-axis length
-        b = 2.0  # semi major-axis length
-        # Choose the cartesian velocities such that each particle gets the same
-        # v_r, v_phi and v_z, respectively.
-        self.v_r = .75
-        self.v_phi = 2.5
-        self.v_z = 1.5
-        for i in self.params['ids']:
+    def calc_ellipsis_pos_vel(
+            self, n_part, z_min, z_max, semi_x=1., semi_y=1.):
+        """
+        Calculate positions on an elliptical corkscrew line.
+        Calculate cartesian velocities that lead to a
+        constant velocity in cylindrical coordinates
+        """
+
+        zs = np.linspace(z_min, z_max, num=n_part)
+        angles = np.linspace(-0.99 * np.pi, 0.999 * np.pi, num=n_part)
+
+        positions = []
+        velocities = []
+
+        for angle, z in zip(angles, zs):
             position = np.array(
-                [a * np.cos(i * 2.0 * np.pi / (len(self.params['ids']) + 1)),
-                 b * np.sin(i * 2.0 * np.pi / (len(self.params['ids']) + 1)),
-                 i * (self.params['max_z'] - self.params['min_z']) /
-                 (len(self.params['ids']) + 1) - self.params['center'][2]])
-
-            e_z = np.array([0, 0, 1])
-            e_r = position - (position * e_z) * e_z
-            e_r /= np.linalg.norm(e_r)
-            e_phi = np.cross(e_z, e_r)
-            velocity = e_r * self.v_r + e_phi * self.v_phi + e_z * self.v_z 
-
-            velocity = self.swap_axis(velocity, self.params['axis'])
-            position = self.swap_axis(position, self.params['axis'])
-            position += np.array(self.params['center'])
-            self.system.part.add(id=i, pos=position, v=velocity)
-
-    def calculate_numpy_histogram(self):
-        pol_positions = self.pol_coords()
+                [semi_x * np.cos(angle),
+                 semi_y * np.sin(angle),
+                 z])
+
+            e_r, e_phi, e_z = tests_common.get_cylindrical_basis_vectors(
+                position)
+            velocity = self.v_r * e_r + self.v_phi * e_phi + self.v_z * e_z
+
+            positions.append(position)
+            velocities.append(velocity)
+
+        return np.array(positions), np.array(velocities)
+
+    def align_with_observable_frame(self, vec):
+        """
+        Rotate vectors from the original box frame to the frame of the observables.
+        """
+
+        # align original z to observable z
+        vec = tests_common.rodrigues_rot(vec, [1, -1, 0], -np.pi / 2.)
+        # original x now points along [1,-1,-sqrt(2)]
+
+        # align original x to observable orientation
+        vec = tests_common.rodrigues_rot(vec, [1, 1, 0], -3. / 4. * np.pi)
+        return vec
+
+    def setup_system_get_np_hist(self):
+        """
+        Pick positions and velocities in the original box frame
+        and calculate the np histogram.
+        Then rotate and move the positions and velocities
+        to the frame of the observables.
+        After calculating the core observables, the result should be
+        the same as the np histogram obtained from the original box frame.
+        """
+
+        positions, velocities = self.calc_ellipsis_pos_vel(100, 0.99 *
+                                                           self.params['min_z'], 0.9 *
+                                                           self.params['max_z'], semi_x=0.9 *
+                                                           self.params['max_r'], semi_y=0.2 *
+                                                           self.params['max_r'])
+
+        # first, get the numpy histogram of the cylinder coordinates
+        pos_cyl = []
+        for pos in positions:
+            pos_cyl.append(
+                tests_common.transform_pos_from_cartesian_to_polar_coordinates(pos))
         np_hist, np_edges = tests_common.get_histogram(
-            pol_positions, self.params, 'cylindrical')
-        return np_hist, np_edges
-
-    def normalize_with_bin_volume(self, histogram):
-        bin_volume = tests_common.get_cylindrical_bin_volume(
-            self.params['n_r_bins'],
-            self.params['n_phi_bins'],
-            self.params['n_z_bins'],
-            self.params['min_r'],
-            self.params['max_r'],
-            self.params['min_phi'],
-            self.params['max_phi'],
-            self.params['min_z'],
-            self.params['max_z'])
-        for i in range(self.params['n_r_bins']):
-            histogram[i, :, :] /= bin_volume[i]
-        return histogram
-
-    def density_profile_test(self):
-        self.set_particles()
-        # Set up the Observable.
-        local_params = self.params.copy()
-        if self.params['axis'] == 'x':
-            local_params['axis'] = [1.0, 0.0, 0.0]
-        elif self.params['axis'] == 'y':
-            local_params['axis'] = [0.0, 1.0, 0.0]
-        else:
-            local_params['axis'] = [0.0, 0.0, 1.0]
-        obs = espressomd.observables.CylindricalDensityProfile(**local_params)
-        core_hist = obs.calculate()
-        core_edges = obs.call_method("edges")
-        np_hist, np_edges = self.calculate_numpy_histogram()
-        np_hist = self.normalize_with_bin_volume(np_hist)
-        np.testing.assert_array_almost_equal(np_hist, core_hist)
-        for i in range(3):
-            np.testing.assert_array_almost_equal(np_edges[i], core_edges[i])
-        self.assertEqual(np.prod(obs.shape()), len(np_hist.flatten()))
-
-    def velocity_profile_test(self):
-        self.set_particles()
-        # Set up the Observable.
-        local_params = self.params.copy()
-        if self.params['axis'] == 'x':
-            local_params['axis'] = [1.0, 0.0, 0.0]
-        elif self.params['axis'] == 'y':
-            local_params['axis'] = [0.0, 1.0, 0.0]
-        else:
-            local_params['axis'] = [0.0, 0.0, 1.0]
-        obs = espressomd.observables.CylindricalVelocityProfile(**local_params)
-        core_hist = obs.calculate()
+            np.array(pos_cyl), self.params, 'cylindrical')
+        np_dens = tests_common.normalize_cylindrical_hist(
+            np_hist.copy(), self.params)
+
+        # now align the positions and velocities with the frame of reference
+        # used in the observables
+        pos_aligned = []
+        vel_aligned = []
+        for pos, vel in zip(positions, velocities):
+            pos_aligned.append(
+                self.align_with_observable_frame(pos) +
+                self.cyl_transform_params.center)
+            vel_aligned.append(self.align_with_observable_frame(vel))
+        self.system.part.add(pos=pos_aligned, v=vel_aligned)
+        self.params['ids'] = self.system.part[:].id
+
+        return np_dens, np_edges
+
+    def check_edges(self, observable, np_edges):
+        core_edges = observable.call_method("edges")
+        for core_edge, np_edge in zip(core_edges, np_edges):
+            np.testing.assert_array_almost_equal(core_edge, np_edge)
+
+    def test_density_profile(self):
+        """
+        Check that the result from the observable (in its own frame)
+        matches the np result from the box frame
+        """
+        np_dens, np_edges = self.setup_system_get_np_hist()
+
+        cyl_dens_prof = espressomd.observables.CylindricalDensityProfile(
+            **self.params)
+        core_hist = cyl_dens_prof.calculate()
+        np.testing.assert_array_almost_equal(np_dens, core_hist)
+        self.check_edges(cyl_dens_prof, np_edges)
+
+    def test_vel_profile(self):
+        """
+        Check that the result from the observable (in its own frame)
+        matches the np result from the box frame
+        """
+        np_dens, np_edges = self.setup_system_get_np_hist()
+        cyl_vel_prof = espressomd.observables.CylindricalVelocityProfile(
+            **self.params)
+        core_hist = cyl_vel_prof.calculate()
         core_hist_v_r = core_hist[:, :, :, 0]
         core_hist_v_phi = core_hist[:, :, :, 1]
         core_hist_v_z = core_hist[:, :, :, 2]
-        np_hist, _ = self.calculate_numpy_histogram()
-        for x in np.nditer(np_hist, op_flags=['readwrite']):
-            if x[...] > 0.0:
-                x[...] /= x[...]
-        np.testing.assert_array_almost_equal(np_hist * self.v_r, core_hist_v_r)
+        np_hist_binary = np_dens
+        np_hist_binary[np.nonzero(np_hist_binary)] = 1
+        np.testing.assert_array_almost_equal(
+            np_hist_binary * self.v_r, core_hist_v_r)
         np.testing.assert_array_almost_equal(
-            np_hist * self.v_phi, core_hist_v_phi)
-        np.testing.assert_array_almost_equal(np_hist * self.v_z, core_hist_v_z)
-        self.assertEqual(np.prod(obs.shape()), len(np_hist.flatten()) * 3)
-
-    def flux_density_profile_test(self):
-        self.set_particles()
-        # Set up the Observable.
-        local_params = self.params.copy()
-        if self.params['axis'] == 'x':
-            local_params['axis'] = [1.0, 0.0, 0.0]
-        elif self.params['axis'] == 'y':
-            local_params['axis'] = [0.0, 1.0, 0.0]
-        else:
-            local_params['axis'] = [0.0, 0.0, 1.0]
-        obs = espressomd.observables.CylindricalFluxDensityProfile(
-            **local_params)
-        core_hist = obs.calculate()
+            np_hist_binary * self.v_phi, core_hist_v_phi)
+        np.testing.assert_array_almost_equal(
+            np_hist_binary * self.v_z, core_hist_v_z)
+        self.check_edges(cyl_vel_prof, np_edges)
+
+    def test_flux_density_profile(self):
+        """
+        Check that the result from the observable (in its own frame)
+        matches the np result from the box frame
+        """
+        np_dens, np_edges = self.setup_system_get_np_hist()
+        cyl_flux_dens = espressomd.observables.CylindricalFluxDensityProfile(
+            **self.params)
+        core_hist = cyl_flux_dens.calculate()
         core_hist_v_r = core_hist[:, :, :, 0]
         core_hist_v_phi = core_hist[:, :, :, 1]
         core_hist_v_z = core_hist[:, :, :, 2]
-        np_hist, _ = self.calculate_numpy_histogram()
-        np_hist = self.normalize_with_bin_volume(np_hist)
-        np.testing.assert_array_almost_equal(np_hist * self.v_r, core_hist_v_r)
+        np.testing.assert_array_almost_equal(np_dens * self.v_r, core_hist_v_r)
         np.testing.assert_array_almost_equal(
-            np_hist * self.v_phi, core_hist_v_phi)
-        np.testing.assert_array_almost_equal(np_hist * self.v_z, core_hist_v_z)
-        self.assertEqual(np.prod(obs.shape()), len(np_hist.flatten()) * 3)
-
-    def test_hist_x(self):
-        self.params['axis'] = 'x'
-        self.velocity_profile_test()
-        self.flux_density_profile_test()
-        self.density_profile_test()
-
-    def test_hist_y(self):
-        self.params['axis'] = 'y'
-        self.velocity_profile_test()
-        self.flux_density_profile_test()
-        self.density_profile_test()
-
-    def test_hist_z(self):
-        self.params['axis'] = 'z'
-        self.velocity_profile_test()
-        self.flux_density_profile_test()
-        self.density_profile_test()
+            np_dens * self.v_phi, core_hist_v_phi)
+        np.testing.assert_array_almost_equal(np_dens * self.v_z, core_hist_v_z)
+        self.check_edges(cyl_flux_dens, np_edges)
 
     def test_cylindrical_pid_profile_interface(self):
-        # test setters and getters
+        """
+        Test setters and getters of the script interface
+        """
         params = self.params.copy()
         params['n_r_bins'] = 4
         params['n_phi_bins'] = 6
         params['n_z_bins'] = 8
         params['ids'] = [0, 1]
-        params['axis'] = [0.0, 1.0, 0.0]
         self.system.part.add(id=0, pos=[0, 0, 0], type=0)
         self.system.part.add(id=1, pos=[0, 0, 0], type=1)
         observable = espressomd.observables.CylindricalDensityProfile(**params)
@@ -266,15 +251,14 @@ def test_cylindrical_pid_profile_interface(self):
         self.assertEqual(observable.max_z, 9)
         obs_bin_edges = observable.bin_edges()
         np.testing.assert_array_equal(obs_bin_edges[-1, -1, -1], [7, 8, 9])
-        # check center
-        np.testing.assert_array_equal(
-            np.copy(observable.center), params['center'])
-        observable.center = [3, 2, 1]
-        np.testing.assert_array_equal(np.copy(observable.center), [3, 2, 1])
-        # check axis
-        np.testing.assert_array_equal(np.copy(observable.axis), params['axis'])
-        observable.axis = [6, 5, 4]
-        np.testing.assert_array_equal(np.copy(observable.axis), [6, 5, 4])
+        # check center, axis, orientation
+        ctp = espressomd.math.CylindricalTransformationParameters(
+            center=[1, 2, 3], axis=[0, 1, 0], orientation=[0, 0, 1])
+        observable.transform_params = ctp
+
+        for attr_name in ['center', 'axis', 'orientation']:
+            np.testing.assert_array_almost_equal(np.copy(ctp.__getattr__(attr_name)),
+                                                 np.copy(observable.transform_params.__getattr__(attr_name)))
 
 
 if __name__ == "__main__":
diff --git a/testsuite/python/observable_cylindricalLB.py b/testsuite/python/observable_cylindricalLB.py
index e2ba5a665c6..0d1f72edfc8 100644
--- a/testsuite/python/observable_cylindricalLB.py
+++ b/testsuite/python/observable_cylindricalLB.py
@@ -18,210 +18,158 @@
 import unittest as ut
 import unittest_decorators as utx
 import espressomd
+import espressomd.math
 import espressomd.observables
 import espressomd.lb
 import tests_common
 
 
-AGRID = 1.0
-VISC = 2.7
-DENS = 1.7
-TIME_STEP = 0.1
-LB_PARAMS = {'agrid': AGRID,
-             'dens': DENS,
-             'visc': VISC,
-             'tau': TIME_STEP,
-             }
-
-
 class CylindricalLBObservableCommon:
 
     """
-    Testcase for the CylindricalLBObservable.
+    Testcase for the CylindricalLBObservables.
 
     """
     lbf = None
-    system = espressomd.System(box_l=(10, 10, 10))
+    system = espressomd.System(box_l=3 * [15])
     system.time_step = 0.01
     system.cell_system.skin = 0.4
     positions = []
 
+    lb_params = {'agrid': 1.,
+                 'dens': 1.2,
+                 'visc': 2.7,
+                 'tau': 0.1,
+                 }
+    cyl_transform_params = espressomd.math.CylindricalTransformationParameters(
+        center=3 * [7], axis=[1, 0, 0], orientation=[0, 0, 1])
+
     params = {
-        'ids': list(range(10)),
-        'center': [5.0, 5.0, 5.0],  # center of the histogram
-        'axis': 'y',
-        'n_r_bins': 10,  # number of bins in r
-        'n_phi_bins': 2,  # -*- in phi
-        'n_z_bins': 2,  # -*- in z
+        'ids': None,
+        'transform_params': cyl_transform_params,
+        'n_r_bins': 4,
+        'n_phi_bins': 3,
+        'n_z_bins': 5,
         'min_r': 0.0,
         'min_phi': -np.pi,
-        'min_z': -5.0,
-        'max_r': 5.0,
+        'min_z': -6.0,
+        'max_r': 6.0,
         'max_phi': np.pi,
-        'max_z': 5.0,
+        'max_z': 6.0,
     }
 
-    def tearDown(self):
-        self.system.part.clear()
-
-    def swap_axis(self, arr, axis):
-        if axis == 'x':
-            arr = np.dot(tests_common.rotation_matrix(
-                [0, 1, 0], np.pi / 2.0), arr)
-        elif axis == 'y':
-            arr = np.dot(tests_common.rotation_matrix(
-                [1, 0, 0], -np.pi / 2.0), arr)
-        return arr
-
-    def swap_axis_inverse(self, arr, axis):
-        if axis == 'x':
-            arr = np.dot(tests_common.rotation_matrix(
-                [0, 1, 0], -np.pi / 2.0), arr)
-        elif axis == 'y':
-            arr = np.dot(tests_common.rotation_matrix(
-                [1, 0, 0], np.pi / 2.0), arr)
-        return arr
-
-    def pol_coords(self):
-        positions = np.zeros((len(self.positions), 3))
-        for i, p in enumerate(self.positions):
-            tmp = p - np.array(self.params['center'])
-            tmp = self.swap_axis_inverse(tmp, self.params['axis'])
-            positions[i, :] = tests_common.transform_pos_from_cartesian_to_polar_coordinates(
-                tmp)
-        return positions
-
-    def set_particles(self):
-        self.system.part.clear()
-        self.system.part.add(pos=self.positions)
-
-    def set_fluid_velocity(self):
-        del self.positions[:]
-        # Choose the cartesian velocities such that each particle gets the same
-        # v_r, v_phi and v_z, respectively.
-        self.v_r = .75
-        self.v_phi = 2.5
-        self.v_z = 1.5
-        node_positions = np.arange(-4.5, 5.0, 1.0)
-        for i, _ in enumerate(node_positions):
-            position = np.array(
-                [node_positions[i], node_positions[i], node_positions[i]])
-
-            e_z = np.array([0, 0, 1])
-            e_r = position - (position * e_z) * e_z
-            e_r /= np.linalg.norm(e_r)
-            e_phi = np.cross(e_z, e_r)
-
-            velocity = e_r * self.v_r + e_phi * self.v_phi + e_z * self.v_z
-
-            velocity = self.swap_axis(velocity, self.params['axis'])
-            position = self.swap_axis(position, self.params['axis'])
-            position += np.array(self.params['center'])
-            self.positions.append(position)
-            self.lbf[np.array(position, dtype=int)].velocity = velocity
-
-    def normalize_with_bin_volume(self, histogram):
-        bin_volume = tests_common.get_cylindrical_bin_volume(
-            self.params['n_r_bins'],
-            self.params['n_phi_bins'],
-            self.params['n_z_bins'],
-            self.params['min_r'],
-            self.params['max_r'],
-            self.params['min_phi'],
-            self.params['max_phi'],
-            self.params['min_z'],
-            self.params['max_z'])
-        # Normalization
-        for i in range(self.params['n_r_bins']):
-            histogram[i, :, :] /= bin_volume[i]
-        return histogram
-
-    def LB_fluxdensity_profile_test(self):
-        self.set_fluid_velocity()
-        self.set_particles()
-        # Set up the Observable.
-        local_params = self.params.copy()
-        if self.params['axis'] == 'x':
-            local_params['axis'] = [1.0, 0.0, 0.0]
-        elif self.params['axis'] == 'y':
-            local_params['axis'] = [0.0, 1.0, 0.0]
-        else:
-            local_params['axis'] = [0.0, 0.0, 1.0]
-        p = espressomd.observables.CylindricalLBFluxDensityProfileAtParticlePositions(
-            **local_params)
-        core_hist = p.calculate()
-        core_hist_v_r = core_hist[:, :, :, 0]
-        core_hist_v_phi = core_hist[:, :, :, 1]
-        core_hist_v_z = core_hist[:, :, :, 2]
-        core_edges = p.call_method("edges")
-        self.pol_positions = self.pol_coords()
+    v_r = 0.02
+    v_phi = 0.04
+    v_z = 0.03
+
+    def calc_vel_at_pos(self, positions):
+        """
+        In cylindrical coordinates, all velocities are the same.
+        In cartesian they depend on the position.
+        The cartesian velocities are calculated here.
+        """
+
+        vels = []
+        for pos in positions:
+            e_r, e_phi, e_z = tests_common.get_cylindrical_basis_vectors(pos)
+            velocity = self.v_r * e_r + self.v_phi * e_phi + self.v_z * e_z
+            vels.append(velocity)
+        return vels
+
+    def align_with_observable_frame(self, vec):
+        """
+        Rotate vectors from the original box frame to
+        the frame of the observables.
+        """
+
+        # align original z to observable z
+        vec = tests_common.rodrigues_rot(vec, [0, 1, 0], np.pi / 2.)
+        # original x now points along [0,0,-1]
+
+        # align original x to observable orientation
+        vec = tests_common.rodrigues_rot(vec, [1, 0, 0], np.pi)
+        return vec
+
+    def setup_system_get_np_hist(self):
+        """
+        Pick positions and velocities in the original box frame and
+        calculate the np histogram. Then rotate and move the positions
+        and velocities to the frame of the observables.
+        After calculating the core observables, the result should be
+        the same as the np histogram obtained from the original box frame.
+        """
+
+        nodes = np.array(np.meshgrid([1, 2], [1, 2], [
+                         1, 1, 1, 1, 2])).T.reshape(-1, 3)
+        positions = nodes + 3 * [0.5]
+        velocities = self.calc_vel_at_pos(positions)
+
+        # get the histogram from numpy
+        pos_cyl = []
+        for pos in positions:
+            pos_cyl.append(
+                tests_common.transform_pos_from_cartesian_to_polar_coordinates(pos))
         np_hist, np_edges = tests_common.get_histogram(
-            self.pol_positions, self.params, 'cylindrical')
-        np_hist = self.normalize_with_bin_volume(np_hist)
-        np.testing.assert_array_almost_equal(np_hist * self.v_r, core_hist_v_r)
+            np.array(pos_cyl), self.params, 'cylindrical')
+
+        # the particles only determine the evaluation points, not the values of
+        # the observables
+        np_hist[np.nonzero(np_hist)] = 1
+
+        # now align the positions and velocities with the frame of reference
+        # used in the observables
+        pos_aligned = []
+        vel_aligned = []
+        for pos, vel in zip(positions, velocities):
+            pos_aligned.append(
+                self.align_with_observable_frame(pos) +
+                self.cyl_transform_params.center)
+            vel_aligned.append(self.align_with_observable_frame(vel))
+        node_aligned = np.array(
+            np.rint(
+                np.array(pos_aligned) -
+                3 *
+                [0.5]),
+            dtype=int)
+        self.system.part.add(pos=pos_aligned, v=vel_aligned)
+        self.params['ids'] = self.system.part[:].id
+
+        for node, vel in zip(node_aligned, vel_aligned):
+            self.lbf[node].velocity = vel
+
+        return np_hist, np_edges
+
+    def check_edges(self, observable, np_edges):
+        core_edges = observable.call_method("edges")
+        for core_edge, np_edge in zip(core_edges, np_edges):
+            np.testing.assert_array_almost_equal(core_edge, np_edge)
+
+    def test_cylindrical_lb_vel_profile_obs(self):
+        """
+        Check that the result from the observable (in its own frame)
+        matches the np result from the box frame
+        """
+
+        np_hist_binary, np_edges = self.setup_system_get_np_hist()
+        vel_obs = espressomd.observables.CylindricalLBVelocityProfileAtParticlePositions(
+            **self.params)
+        core_hist_v = vel_obs.calculate()
+        core_hist_v_r = core_hist_v[:, :, :, 0]
+        core_hist_v_phi = core_hist_v[:, :, :, 1]
+        core_hist_v_z = core_hist_v[:, :, :, 2]
         np.testing.assert_array_almost_equal(
-            np_hist * self.v_phi, core_hist_v_phi)
-        np.testing.assert_array_almost_equal(np_hist * self.v_z, core_hist_v_z)
-        for i in range(3):
-            np.testing.assert_array_almost_equal(np_edges[i], core_edges[i])
-        self.assertEqual(np.prod(p.shape()), len(np_hist.flatten()) * 3)
-
-    def LB_velocity_profile_at_particle_positions_test(self):
-        self.set_fluid_velocity()
-        self.set_particles()
-        # Set up the Observable.
-        local_params = self.params.copy()
-        if self.params['axis'] == 'x':
-            local_params['axis'] = [1.0, 0.0, 0.0]
-        elif self.params['axis'] == 'y':
-            local_params['axis'] = [0.0, 1.0, 0.0]
-        else:
-            local_params['axis'] = [0.0, 0.0, 1.0]
-        p = espressomd.observables.CylindricalLBVelocityProfileAtParticlePositions(
-            **local_params)
-        core_hist = p.calculate()
-        core_hist_v_r = core_hist[:, :, :, 0]
-        core_hist_v_phi = core_hist[:, :, :, 1]
-        core_hist_v_z = core_hist[:, :, :, 2]
-        self.pol_positions = self.pol_coords()
-        np_hist, _ = np.histogramdd(
-            self.pol_positions,
-            bins=(self.params['n_r_bins'],
-                  self.params['n_phi_bins'],
-                  self.params['n_z_bins']),
-            range=[(self.params['min_r'],
-                    self.params['max_r']),
-                   (self.params['min_phi'],
-                    self.params['max_phi']),
-                   (self.params['min_z'],
-                    self.params['max_z'])])
-        for x in np.nditer(np_hist, op_flags=['readwrite']):
-            if x[...] > 0.0:
-                x[...] /= x[...]
-        np.testing.assert_array_almost_equal(np_hist * self.v_r, core_hist_v_r)
+            np_hist_binary * self.v_r, core_hist_v_r)
         np.testing.assert_array_almost_equal(
-            np_hist * self.v_phi, core_hist_v_phi)
-        np.testing.assert_array_almost_equal(np_hist * self.v_z, core_hist_v_z)
-        self.assertEqual(np.prod(p.shape()), len(np_hist.flatten()) * 3)
-
-    def perform_tests(self):
-        self.LB_fluxdensity_profile_test()
-        self.LB_velocity_profile_at_particle_positions_test()
-
-    def test_x_axis(self):
-        self.params['axis'] = 'x'
-        self.perform_tests()
-
-    def test_y_axis(self):
-        self.params['axis'] = 'y'
-        self.perform_tests()
-
-    def test_z_axis(self):
-        self.params['axis'] = 'z'
-        self.perform_tests()
+            np_hist_binary * self.v_phi, core_hist_v_phi)
+        np.testing.assert_array_almost_equal(
+            np_hist_binary * self.v_z, core_hist_v_z)
+        self.check_edges(vel_obs, np_edges)
 
     def test_cylindrical_lb_profile_interface(self):
-        # test setters and getters
+        """
+        Test setters and getters of the script interface
+        """
+
         params = self.params.copy()
         params['n_r_bins'] = 4
         params['n_phi_bins'] = 6
@@ -269,25 +217,20 @@ def test_cylindrical_lb_profile_interface(self):
         self.assertEqual(observable.max_z, 9)
         obs_bin_edges = observable.bin_edges()
         np.testing.assert_array_equal(obs_bin_edges[-1, -1, -1], [7, 8, 9])
-        # check center
-        np.testing.assert_array_equal(
-            np.copy(observable.center), params['center'])
-        observable.center = [3, 2, 1]
-        np.testing.assert_array_equal(np.copy(observable.center), [3, 2, 1])
-        # check axis
-        np.testing.assert_array_equal(np.copy(observable.axis), params['axis'])
-        observable.axis = [6, 5, 4]
-        np.testing.assert_array_equal(np.copy(observable.axis), [6, 5, 4])
-        # check sampling_density
-        self.assertEqual(observable.sampling_density, 2)
-        observable.sampling_density = 3
-        self.assertEqual(observable.sampling_density, 3)
+        # check center, axis, orientation
+        ctp = espressomd.math.CylindricalTransformationParameters(
+            center=[1, 2, 3], axis=[0, 1, 0], orientation=[0, 0, 1])
+        observable.transform_params = ctp
+
+        for attr_name in ['center', 'axis', 'orientation']:  # values read back must match ctp
+            np.testing.assert_array_almost_equal(np.copy(getattr(ctp, attr_name)),
+                                                 np.copy(getattr(observable.transform_params, attr_name)))
 
 
 class CylindricalLBObservableCPU(ut.TestCase, CylindricalLBObservableCommon):
 
     def setUp(self):
-        self.lbf = espressomd.lb.LBFluid(**LB_PARAMS)
+        self.lbf = espressomd.lb.LBFluid(**self.lb_params)
         self.system.actors.add(self.lbf)
 
     def tearDown(self):
@@ -295,16 +238,47 @@ def tearDown(self):
         self.system.actors.remove(self.lbf)
         self.system.part.clear()
 
+    def test_cylindrical_lb_flux_density_obs(self):
+        """
+        Check that the result from the flux density observable (in its
+        own frame) matches the NumPy result from the box frame.
+
+        The expected flux density is the binary NumPy histogram multiplied
+        by the LB density and by the expected velocity component.
+
+        Only for CPU because density interpolation is not implemented for
+        GPU LB.
+        """
+
+        np_hist_binary, np_edges = self.setup_system_get_np_hist()
+
+        flux_obs = espressomd.observables.CylindricalLBFluxDensityProfileAtParticlePositions(
+            **self.params)
+        core_hist_fl = flux_obs.calculate()
+
+        def expected_flux(velocity):
+            return np_hist_binary * self.lb_params['dens'] * velocity
+
+        # the last axis of the result holds the r, phi and z components;
+        # compare one component at a time
+        expected_velocities = (self.v_r, self.v_phi, self.v_z)
+        for component, velocity in enumerate(expected_velocities):
+            np.testing.assert_array_almost_equal(
+                expected_flux(velocity),
+                core_hist_fl[:, :, :, component])
+
+        # let check_edges compare the observable with the NumPy bin edges
+        self.check_edges(flux_obs, np_edges)
+
 
 @utx.skipIfMissingGPU()
 class CylindricalLBObservableGPU(ut.TestCase, CylindricalLBObservableCommon):
 
     def setUp(self):
-        self.lbf = espressomd.lb.LBFluidGPU(**LB_PARAMS)
+        self.lbf = espressomd.lb.LBFluidGPU(**self.lb_params)
         self.system.actors.add(self.lbf)
 
     def tearDown(self):
-        del self.positions[:]
         self.system.actors.remove(self.lbf)
         self.system.part.clear()
 
diff --git a/testsuite/python/p3m_fft.py b/testsuite/python/p3m_fft.py
new file mode 100644
index 00000000000..872a4ec59b8
--- /dev/null
+++ b/testsuite/python/p3m_fft.py
@@ -0,0 +1,159 @@
+#
+# Copyright (C) 2020-2021 The ESPResSo project
+#
+# This file is part of ESPResSo.
+#
+# ESPResSo is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# ESPResSo is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+
+import espressomd
+import numpy as np
+import unittest as ut
+import unittest_decorators as utx
+import tests_common
+
+# Fixed P3M parameter sets; the tests below use them with tune=False.
+P3M_PARAMS = [
+    {'cao': 7, 'r_cut': 3.103065490722656, 'alpha': 1.228153768561588, 'mesh': 48},
+    {'cao': 7, 'r_cut': 4.477272033691406, 'alpha': 0.845808585620971, 'mesh': 32},
+    {'cao': 7, 'r_cut': 2.393871545791626, 'alpha': 1.599093835130641, 'mesh': 64},
+]
+
+# Maps the number of nodes to a list of (node_grid, P3M parameters) pairs.
+FFT_PLANS = {
+    1: [([1, 1, 1], P3M_PARAMS[1])],
+    2: [([2, 1, 1], P3M_PARAMS[1])],
+    3: [([3, 1, 1], P3M_PARAMS[0])],
+    4: [([2, 2, 1], P3M_PARAMS[1]),
+        ([4, 1, 1], P3M_PARAMS[2])],
+    6: [([3, 2, 1], P3M_PARAMS[0])],
+    8: [([2, 2, 2], P3M_PARAMS[1]),
+        ([4, 2, 1], P3M_PARAMS[2])],
+}
+
+
+@utx.skipIfMissingFeatures(["LENNARD_JONES", "P3M"])
+class FFT_test(ut.TestCase):
+    """
+    Check the P3M energy for several node grids and check that P3M and
+    dipolar P3M raise an exception for unsorted node grids.
+    """
+
+    system = espressomd.System(box_l=[10., 10., 10.])
+    original_node_grid = tuple(system.cell_system.node_grid)
+    n_nodes = system.cell_system.get_state()["n_nodes"]
+
+    def setUp(self):
+        """Restore the box, the node grid and the time step."""
+        self.system.box_l = [10., 10., 10.]
+        self.system.cell_system.node_grid = self.original_node_grid
+        self.system.time_step = 0.01
+
+    def tearDown(self):
+        """Remove all actors and all particles."""
+        self.system.actors.clear()
+        self.system.part.clear()
+
+    def minimize(self):
+        """
+        Relax the particles with a Lennard-Jones interaction and 100
+        steepest descent steps, then select the integrator with
+        ``set_vv()`` and set the Lennard-Jones epsilon to 0.
+        """
+        self.system.non_bonded_inter[0, 0].lennard_jones.set_params(
+            epsilon=1.0, sigma=1.0, cutoff=2**(1.0 / 6.0), shift="auto")
+        self.system.integrator.set_steepest_descent(
+            f_max=1, gamma=0.01, max_displacement=0.01)
+        self.system.integrator.run(100)
+        self.system.integrator.set_vv()
+        self.system.non_bonded_inter[0, 0].lennard_jones.set_params(
+            epsilon=0.0, sigma=1.0, cutoff=2)
+
+    def add_charged_particles(self):
+        """
+        Add 200 pairs of charges -1 and +1 at seeded random positions,
+        then call :meth:`minimize`.
+        """
+        np.random.seed(seed=42)
+        num_pairs = 200
+        positions = np.random.random((2 * num_pairs, 3))
+        self.system.part.add(pos=positions * self.system.box_l,
+                             q=num_pairs * [-1, 1])
+        self.minimize()
+
+    def add_magnetic_particles(self):
+        """
+        Add 200 particles at seeded random positions with dipole moments
+        from ``tests_common.random_dipoles``, then call
+        :meth:`minimize`.
+        """
+        np.random.seed(seed=42)
+        num_part = 200
+        positions = np.random.random((num_part, 3))
+        dipoles = tests_common.random_dipoles(num_part)
+        self.system.part.add(pos=positions * self.system.box_l,
+                             dip=dipoles, rotation=num_part * [(1, 1, 1)])
+        self.minimize()
+
+    @ut.skipIf(n_nodes not in FFT_PLANS, f"no FFT plan for {n_nodes} threads")
+    def test_fft_plans(self):
+        """
+        For every node grid registered for the current number of nodes,
+        check the P3M Coulomb energy against the reference value.
+        """
+        import espressomd.electrostatics
+        # the reference energy is the same for all plans
+        ref_energy = -75.871906
+        self.add_charged_particles()
+        for node_grid, p3m_params in FFT_PLANS[self.n_nodes]:
+            self.system.cell_system.node_grid = node_grid
+            solver = espressomd.electrostatics.P3M(
+                prefactor=2, accuracy=1e-6, tune=False, **p3m_params)
+            self.system.actors.add(solver)
+            p3m_energy = self.system.analysis.energy()['coulomb']
+            self.system.actors.clear()
+            np.testing.assert_allclose(p3m_energy, ref_energy, rtol=1e-4)
+
+    @utx.skipIfMissingFeatures("P3M")
+    @ut.skipIf(n_nodes < 2 or n_nodes >= 8, "only runs for 2 <= n_nodes <= 7")
+    def test_unsorted_node_grid_exception_p3m(self):
+        """
+        Check that adding P3M with a reversed node grid raises an exception.
+        """
+        import espressomd.electrostatics
+        self.add_charged_particles()
+        unsorted_node_grid = self.system.cell_system.node_grid[::-1]
+        self.system.cell_system.node_grid = unsorted_node_grid
+        solver = espressomd.electrostatics.P3M(prefactor=2, accuracy=1e-2)
+        with self.assertRaisesRegex(Exception, 'P3M: tuning failed: ERROR: P3M_init: node grid must be sorted, largest first'):
+            self.system.actors.add(solver)
+
+    @utx.skipIfMissingFeatures("DP3M")
+    @ut.skipIf(n_nodes < 2 or n_nodes >= 8, "only runs for 2 <= n_nodes <= 7")
+    def test_unsorted_node_grid_exception_dp3m(self):
+        """
+        Check that adding dipolar P3M with a reversed node grid raises an exception.
+        """
+        import espressomd.magnetostatics
+        self.add_magnetic_particles()
+        unsorted_node_grid = self.system.cell_system.node_grid[::-1]
+        self.system.cell_system.node_grid = unsorted_node_grid
+        solver = espressomd.magnetostatics.DipolarP3M(
+            prefactor=2, accuracy=1e-2)
+        with self.assertRaisesRegex(Exception, 'P3M: tuning failed: ERROR: dipolar P3M_init: node grid must be sorted, largest first'):
+            self.system.actors.add(solver)
+
+
+if __name__ == "__main__":
+    ut.main()
diff --git a/testsuite/python/p3m_tuning_exceptions.py b/testsuite/python/p3m_tuning_exceptions.py
index c3f27d3aec1..e3fbc4c5d65 100644
--- a/testsuite/python/p3m_tuning_exceptions.py
+++ b/testsuite/python/p3m_tuning_exceptions.py
@@ -34,7 +34,7 @@ def add_charged_particles(self):
         self.system.part.add(pos=[[0, 0, 0], [.5, .5, .5]], q=[-1, 1])
 
     def add_magnetic_particles(self):
-        self.system.part.add(pos=[[0, 0, 0], [.5, .5, .5]],
+        self.system.part.add(pos=[[0.01, 0.01, 0.01], [.5, .5, .5]],
                              rotation=2 * [(1, 1, 1)], dip=2 * [(1, 0, 0)])
 
     ##################################################
@@ -49,7 +49,7 @@ def test_01_time_not_set_p3m_gpu(self):
         self.add_charged_particles()
 
         solver = espressomd.electrostatics.P3MGPU(prefactor=2, accuracy=1e-2)
-        with self.assertRaisesRegex(Exception, 'python_p3m_adaptive_tune: ERROR: time_step not set'):
+        with self.assertRaisesRegex(Exception, 'P3M: tuning failed: ERROR: time_step not set'):
             self.system.actors.add(solver)
 
     @utx.skipIfMissingFeatures("P3M")
@@ -59,7 +59,7 @@ def test_01_time_not_set_p3m_cpu(self):
         self.add_charged_particles()
 
         solver = espressomd.electrostatics.P3M(prefactor=2, accuracy=1e-2)
-        with self.assertRaisesRegex(Exception, 'python_p3m_adaptive_tune: ERROR: time_step not set'):
+        with self.assertRaisesRegex(Exception, 'P3M: tuning failed: ERROR: time_step not set'):
             self.system.actors.add(solver)
 
     @utx.skipIfMissingFeatures("DP3M")
@@ -70,7 +70,7 @@ def test_01_time_not_set_dp3m_cpu(self):
 
         solver = espressomd.magnetostatics.DipolarP3M(
             prefactor=2, accuracy=1e-2)
-        with self.assertRaisesRegex(Exception, 'python_dp3m_adaptive_tune: ERROR: time_step not set'):
+        with self.assertRaisesRegex(Exception, 'P3M: tuning failed: ERROR: time_step not set'):
             self.system.actors.add(solver)
 
     ##############################################
@@ -85,7 +85,7 @@ def test_02_no_particles_p3m_gpu(self):
         self.system.time_step = 0.01
 
         solver = espressomd.electrostatics.P3MGPU(prefactor=2, accuracy=1e-2)
-        with self.assertRaisesRegex(Exception, 'python_p3m_adaptive_tune: ERROR: no charged particles in the system'):
+        with self.assertRaisesRegex(Exception, 'P3M: tuning failed: ERROR: no charged particles in the system'):
             self.system.actors.add(solver)
 
     @utx.skipIfMissingFeatures("P3M")
@@ -95,7 +95,7 @@ def test_02_no_particles_p3m_cpu(self):
         self.system.time_step = 0.01
 
         solver = espressomd.electrostatics.P3M(prefactor=2, accuracy=1e-2)
-        with self.assertRaisesRegex(Exception, 'python_p3m_adaptive_tune: ERROR: no charged particles in the system'):
+        with self.assertRaisesRegex(Exception, 'P3M: tuning failed: ERROR: no charged particles in the system'):
             self.system.actors.add(solver)
 
     @utx.skipIfMissingFeatures("DP3M")
@@ -106,7 +106,7 @@ def test_02_no_particles_dp3m_cpu(self):
 
         solver = espressomd.magnetostatics.DipolarP3M(
             prefactor=2, accuracy=1e-2)
-        with self.assertRaisesRegex(Exception, 'python_dp3m_adaptive_tune: ERROR: no dipolar particles in the system'):
+        with self.assertRaisesRegex(Exception, 'P3M: tuning failed: ERROR: no dipolar particles in the system'):
             self.system.actors.add(solver)
 
     #######################################
@@ -124,7 +124,7 @@ def test_03_non_cubic_box_p3m_gpu(self):
 
         solver = espressomd.electrostatics.P3MGPU(
             prefactor=2, accuracy=1e-2, epsilon=1)
-        with self.assertRaisesRegex(Exception, 'python_p3m_adaptive_tune: ERROR: non-metallic epsilon requires cubic box'):
+        with self.assertRaisesRegex(Exception, 'P3M: tuning failed: ERROR: non-metallic epsilon requires cubic box'):
             self.system.actors.add(solver)
 
     @utx.skipIfMissingFeatures("P3M")
@@ -137,7 +137,7 @@ def test_03_non_cubic_box_p3m_cpu(self):
 
         solver = espressomd.electrostatics.P3M(
             prefactor=2, accuracy=1e-2, epsilon=1)
-        with self.assertRaisesRegex(Exception, 'python_p3m_adaptive_tune: ERROR: non-metallic epsilon requires cubic box'):
+        with self.assertRaisesRegex(Exception, 'P3M: tuning failed: ERROR: non-metallic epsilon requires cubic box'):
             self.system.actors.add(solver)
 
     @utx.skipIfMissingFeatures("DP3M")
@@ -150,9 +150,108 @@ def test_03_non_cubic_box_dp3m_cpu(self):
 
         solver = espressomd.magnetostatics.DipolarP3M(
             prefactor=2, accuracy=1e-2)
-        with self.assertRaisesRegex(Exception, 'python_dp3m_adaptive_tune: ERROR: dipolar P3M requires a cubic box'):
+        with self.assertRaisesRegex(Exception, 'P3M: tuning failed: ERROR: dipolar P3M requires a cubic box'):
             self.system.actors.add(solver)
 
+    ##########################################
+    # block of tests with invalid parameters #
+    ##########################################
+
+    def check_invalid_params(self, solver_class, **custom_params):
+        """
+        Check that adding ``solver_class`` with any one invalid parameter
+        raises a ``RuntimeError``; ``custom_params`` override the defaults.
+        """
+        valid_params = {
+            'prefactor': 2, 'accuracy': .01, 'tune': False, 'cao': 1,
+            'r_cut': 0.373, 'alpha': 3.81, 'mesh': (8, 8, 8),
+            'mesh_off': [-1, -1, -1], **custom_params}
+        # (parameter name, invalid value, expected error message)
+        invalid_params = [
+            ('cao', 0, 'P3M: invalid cao'),
+            ('cao', 8, 'P3M: invalid cao'),
+            ('r_cut', -2.0, 'P3M: invalid r_cut'),
+            ('alpha', -2.0, 'P3M: invalid alpha'),
+            ('accuracy', -2.0, 'P3M: invalid accuracy'),
+            ('mesh', (-1, -1, -1), 'P3M: invalid mesh size'),
+            ('mesh', (0, 0, 0), 'P3M: cao larger than mesh size'),
+            ('mesh_off', (-2, 1, 1), 'P3M: invalid mesh offset'),
+        ]
+        for key, invalid_value, err_msg in invalid_params:
+            solver = solver_class(**{**valid_params, key: invalid_value})
+            with self.assertRaisesRegex(RuntimeError, err_msg):
+                self.system.actors.add(solver)
+            self.system.actors.clear()
+
+    @utx.skipIfMissingFeatures("P3M")
+    def test_04_invalid_params_p3m_cpu(self):
+        import espressomd.electrostatics
+
+        self.system.time_step = 0.01
+        self.add_charged_particles()
+
+        self.check_invalid_params(espressomd.electrostatics.P3M)
+
+    @utx.skipIfMissingGPU()
+    @utx.skipIfMissingFeatures("P3M")
+    def test_04_invalid_params_p3m_gpu(self):
+        import espressomd.electrostatics
+
+        self.system.time_step = 0.01
+        self.add_charged_particles()
+
+        self.check_invalid_params(espressomd.electrostatics.P3MGPU,
+                                  mesh=3 * [28], alpha=0.3548, r_cut=4.4434)
+
+    @utx.skipIfMissingFeatures("DP3M")
+    def test_04_invalid_params_dp3m_cpu(self):
+        import espressomd.magnetostatics
+
+        self.system.time_step = 0.01
+        self.add_magnetic_particles()
+
+        self.check_invalid_params(espressomd.magnetostatics.DipolarP3M)
+
+    @utx.skipIfMissingFeatures("P3M")
+    def test_04_invalid_params_p3m_elc_cpu(self):
+        import espressomd.electrostatics
+
+        self.system.time_step = 0.01
+        self.add_charged_particles()
+
+        solver_p3m = espressomd.electrostatics.P3M(
+            prefactor=2, accuracy=0.01, tune=False, cao=1,
+            r_cut=0.373, alpha=3.81, mesh=(8, 8, 8))
+        solver_elc = espressomd.electrostatics.ELC(
+            p3m_actor=solver_p3m, gap_size=1.2 * self.system.box_l[2],
+            maxPWerror=0.01)
+        with self.assertRaisesRegex(Exception, "gap size too large"):
+            self.system.actors.add(solver_elc)
+
+        self.system.actors.clear()
+        solver_dh = espressomd.electrostatics.DH(
+            prefactor=1.2, kappa=0.8, r_cut=2.0)
+        solver_elc = espressomd.electrostatics.ELC(
+            p3m_actor=solver_dh, gap_size=1, maxPWerror=0.01)
+        with self.assertRaisesRegex(ValueError, "p3m_actor has to be a P3M solver"):
+            self.system.actors.add(solver_elc)
+
+    @utx.skipIfMissingGPU()
+    @utx.skipIfMissingFeatures("P3M")
+    def test_04_invalid_params_p3m_elc_gpu(self):
+        import espressomd.electrostatics
+
+        self.system.time_step = 0.01
+        self.add_charged_particles()
+
+        solver_p3m = espressomd.electrostatics.P3MGPU(
+            prefactor=2, accuracy=0.01, tune=False, cao=1,
+            r_cut=4.4434, alpha=0.3548, mesh=(28, 28, 28))
+        solver_elc = espressomd.electrostatics.ELC(
+            p3m_actor=solver_p3m, gap_size=1, maxPWerror=0.01)
+        with self.assertRaisesRegex(ValueError, "ELC is not set up to work with the GPU P3M"):
+            self.system.actors.add(solver_elc)
+
     ###########################################################
     # block of tests where tuning should not throw exceptions #
     ###########################################################
@@ -179,12 +278,20 @@ def test_09_no_errors_p3m_cpu(self):
         self.system.time_step = 0.01
         self.add_charged_particles()
 
-        solver = espressomd.electrostatics.P3M(prefactor=2, accuracy=1e-2,
-                                               epsilon='metallic')
-        try:
-            self.system.actors.add(solver)
-        except Exception as err:
-            self.fail('tuning raised Exception("' + str(err) + '")')
+        solver = espressomd.electrostatics.P3M(prefactor=2, accuracy=0.1)
+        valid_params = {
+            'mesh_off': solver.default_params()['mesh_off'],  # sentinel
+            'cao': 2, 'r_cut': 3.18, 'mesh': 8}
+
+        # tuning with cao or r_cut or mesh constrained, or without constraints
+        for key, value in valid_params.items():
+            solver = espressomd.electrostatics.P3M(
+                prefactor=2, accuracy=1e-2, epsilon=0.0, **{key: value})
+            try:
+                self.system.actors.add(solver)
+            except Exception as err:
+                self.fail('tuning raised Exception("' + str(err) + '")')
+            self.system.actors.clear()
 
     @utx.skipIfMissingFeatures("DP3M")
     def test_09_no_errors_dp3m_cpu(self):
@@ -194,11 +301,61 @@ def test_09_no_errors_dp3m_cpu(self):
         self.add_magnetic_particles()
 
         solver = espressomd.magnetostatics.DipolarP3M(
-            prefactor=2, accuracy=1e-2)
+            prefactor=2, accuracy=0.1)
+        valid_params = {
+            'mesh_off': solver.default_params()['mesh_off'],  # sentinel
+            'cao': 1, 'r_cut': 3.28125, 'mesh': 5}
+
+        # tuning with cao or r_cut or mesh constrained, or without constraints
+        for key, value in valid_params.items():
+            solver = espressomd.magnetostatics.DipolarP3M(
+                prefactor=2, accuracy=1e-2, **{key: value})
+            try:
+                self.system.actors.add(solver)
+            except Exception as err:
+                self.fail('tuning raised Exception("' + str(err) + '")')
+            self.system.actors.clear()
+
+    @utx.skipIfMissingFeatures("P3M")
+    def test_09_no_errors_p3m_cpu_rescale_mesh(self):
+        import espressomd.electrostatics
+
+        self.system.box_l = [10., 15., 20.]
+        self.system.time_step = 0.01
+        self.add_charged_particles()
+
+        solver = espressomd.electrostatics.P3M(prefactor=2, accuracy=1e-2,
+                                               epsilon='metallic',
+                                               mesh=[8, -1, -1])
+        try:
+            self.system.actors.add(solver)
+        except Exception as err:
+            self.fail('tuning raised Exception("' + str(err) + '")')
+        tuned_mesh = solver.get_params()['mesh']
+        self.assertEqual(tuned_mesh[0], 8)
+        self.assertEqual(tuned_mesh[1], 12)
+        self.assertEqual(tuned_mesh[2], 16)
+
+    @utx.skipIfMissingGPU()
+    @utx.skipIfMissingFeatures("P3M")
+    def test_09_no_errors_p3m_gpu_rescale_mesh(self):
+        import espressomd.electrostatics
+
+        self.system.box_l = [10., 15., 20.]
+        self.system.time_step = 0.01
+        self.add_charged_particles()
+
+        solver = espressomd.electrostatics.P3MGPU(prefactor=2, accuracy=1e-1,
+                                                  epsilon='metallic',
+                                                  mesh=[20, -1, -1])
         try:
             self.system.actors.add(solver)
         except Exception as err:
             self.fail('tuning raised Exception("' + str(err) + '")')
+        tuned_mesh = solver.get_params()['mesh']
+        self.assertEqual(tuned_mesh[0], 20)
+        self.assertEqual(tuned_mesh[1], 30)
+        self.assertEqual(tuned_mesh[2], 40)
 
 
 if __name__ == "__main__":
diff --git a/testsuite/python/save_checkpoint.py b/testsuite/python/save_checkpoint.py
index a9b4698dee3..ec7b9996d66 100644
--- a/testsuite/python/save_checkpoint.py
+++ b/testsuite/python/save_checkpoint.py
@@ -21,6 +21,7 @@
 import espressomd
 import espressomd.checkpointing
 import espressomd.electrostatics
+import espressomd.magnetostatics
 import espressomd.interactions
 import espressomd.virtual_sites
 import espressomd.accumulators
@@ -36,8 +37,10 @@
 modes = {x for mode in set("@TEST_COMBINATION@".upper().split('-'))
          for x in [mode, mode.split('.')[0]]}
 
-# use a box with 3 different dimensions
+# use a box with 3 different dimensions, unless DipolarP3M is used
 system = espressomd.System(box_l=[12.0, 14.0, 16.0])
+if 'DP3M' in modes:
+    system.box_l = 3 * [np.max(system.box_l)]
 system.cell_system.skin = 0.1
 system.time_step = 0.01
 system.min_global_cut = 2.0
@@ -89,15 +92,21 @@
     ek.add_species(ek_species)
     system.actors.add(ek)
 
-system.part.add(pos=[1.0] * 3)
-system.part.add(pos=[1.0, 1.0, 2.0])
+p1 = system.part.add(pos=[1.0] * 3)
+p2 = system.part.add(pos=[1.0, 1.0, 2.0])
+
+if espressomd.has_features('ELECTROSTATICS'):
+    p1.q = 1
+    p2.q = -1
+
+if espressomd.has_features('DIPOLES'):
+    p1.dip = (1.3, 2.1, -6)
+    p2.dip = (7.3, 6.1, -4)
 
 if espressomd.has_features('EXCLUSIONS'):
     system.part.add(pos=[2.0] * 3, exclusions=[0, 1])
 
-if espressomd.has_features('P3M') and 'P3M.CPU' in modes:
-    system.part[0].q = 1
-    system.part[1].q = -1
+if espressomd.has_features('P3M') and 'P3M' in modes:
     p3m = espressomd.electrostatics.P3M(
         prefactor=1.0,
         accuracy=0.1,
@@ -106,7 +115,16 @@
         alpha=1.0,
         r_cut=1.0,
         tune=False)
-    system.actors.add(p3m)
+    if 'P3M.CPU' in modes:
+        system.actors.add(p3m)
+    elif 'P3M.ELC' in modes:
+        elc = espressomd.electrostatics.ELC(
+            p3m_actor=p3m,
+            gap_size=6.0,
+            maxPWerror=0.1,
+            delta_mid_top=0.9,
+            delta_mid_bot=0.1)
+        system.actors.add(elc)
 
 obs = espressomd.observables.ParticlePositions(ids=[0, 1])
 acc_mean_variance = espressomd.accumulators.MeanVarianceCalculator(obs=obs)
@@ -117,7 +135,7 @@
 acc_mean_variance.update()
 acc_time_series.update()
 acc_correlator.update()
-system.part[0].pos = [1.0, 2.0, 3.0]
+p1.pos = [1.0, 2.0, 3.0]
 acc_mean_variance.update()
 acc_time_series.update()
 acc_correlator.update()
@@ -138,7 +156,8 @@
     system.box_l, np.ones(3), lambda x: np.linalg.norm(10 * np.ones(3) - x))
 checkpoint.register("pot_field_data")
 system.constraints.add(constraints.PotentialField(
-    field=pot_field_data, grid_spacing=np.ones(3), default_scale=1.6))
+    field=pot_field_data, grid_spacing=np.ones(3), default_scale=1.6,
+    particle_scales={5: 6.0}))
 vec_field_data = constraints.ForceField.field_from_fn(
     system.box_l, np.ones(3), lambda x: 10 * np.ones(3) - x)
 checkpoint.register("vec_field_data")
@@ -181,7 +200,7 @@
 if espressomd.has_features(['VIRTUAL_SITES', 'VIRTUAL_SITES_RELATIVE']):
     system.virtual_sites = espressomd.virtual_sites.VirtualSitesRelative(
         have_quaternion=True)
-    system.part[1].vs_auto_relate_to(0)
+    p2.vs_auto_relate_to(p1)
 
 if espressomd.has_features(['LENNARD_JONES']) and 'LJ' in modes:
     system.non_bonded_inter[0, 0].lennard_jones.set_params(
@@ -191,27 +210,67 @@
 
 harmonic_bond = espressomd.interactions.HarmonicBond(r_0=0.0, k=1.0)
 system.bonded_inter.add(harmonic_bond)
-system.part[1].add_bond((harmonic_bond, 0))
+p2.add_bond((harmonic_bond, p1))
 if 'THERM.LB' not in modes:
     thermalized_bond = espressomd.interactions.ThermalizedBond(
         temp_com=0.0, gamma_com=0.0, temp_distance=0.2, gamma_distance=0.5,
         r_cut=2, seed=51)
     system.bonded_inter.add(thermalized_bond)
-    system.part[1].add_bond((thermalized_bond, 0))
+    p2.add_bond((thermalized_bond, p1))
 checkpoint.register("system")
 checkpoint.register("acc_mean_variance")
 checkpoint.register("acc_time_series")
 checkpoint.register("acc_correlator")
 # calculate forces
 system.integrator.run(0)
-particle_force0 = np.copy(system.part[0].f)
-particle_force1 = np.copy(system.part[1].f)
+particle_force0 = np.copy(p1.f)
+particle_force1 = np.copy(p2.f)
 checkpoint.register("particle_force0")
 checkpoint.register("particle_force1")
 if espressomd.has_features("COLLISION_DETECTION"):
     system.collision_detection.set_params(
         mode="bind_centers", distance=0.11, bond_centers=harmonic_bond)
 
+if espressomd.has_features('DP3M') and 'DP3M' in modes:
+    dp3m = espressomd.magnetostatics.DipolarP3M(
+        prefactor=1.,
+        epsilon=2.,
+        mesh_off=[0.5, 0.5, 0.5],
+        r_cut=2.4,
+        cao=1,
+        mesh=[8, 8, 8],
+        alpha=12,
+        accuracy=0.01,
+        tune=False)
+    system.actors.add(dp3m)
+
+if espressomd.has_features('SCAFACOS') and 'SCAFACOS' in modes \
+        and 'p3m' in espressomd.scafacos.available_methods():
+    system.actors.add(espressomd.electrostatics.Scafacos(
+        prefactor=0.5,
+        method_name="p3m",
+        method_params={
+            "p3m_r_cut": 1.0,
+            "p3m_grid": 64,
+            "p3m_cao": 7,
+            "p3m_alpha": 2.084652}))
+
+if espressomd.has_features('SCAFACOS_DIPOLES') and 'SCAFACOS' in modes \
+        and 'p2nfft' in espressomd.scafacos.available_methods():
+    system.actors.add(espressomd.magnetostatics.Scafacos(
+        prefactor=1.2,
+        method_name='p2nfft',
+        method_params={
+            "p2nfft_verbose_tuning": "0",
+            "pnfft_N": "32,32,32",
+            "pnfft_n": "32,32,32",
+            "pnfft_window_name": "bspline",
+            "pnfft_m": "4",
+            "p2nfft_ignore_tolerance": "1",
+            "pnfft_diff_ik": "0",
+            "p2nfft_r_cut": "11",
+            "p2nfft_alpha": "0.37"}))
+
 if LB_implementation:
     m = np.pi / 12
     nx = int(np.round(system.box_l[0] / lbf.get_params()["agrid"]))
diff --git a/testsuite/python/scafacos_interface.py b/testsuite/python/scafacos_interface.py
index 07d76194bb6..d71790149d6 100644
--- a/testsuite/python/scafacos_interface.py
+++ b/testsuite/python/scafacos_interface.py
@@ -50,6 +50,9 @@ def test_available_methods(self):
         for method in available_methods:
             self.assertIn(method, scafacos_methods)
 
+    @ut.skipIf(not espressomd.has_features('SCAFACOS') or
+               'p3m' not in espressomd.scafacos.available_methods(),
+               'Skipping test: missing ScaFaCoS p3m method')
     def test_actor_exceptions(self):
         system = self.system
 
@@ -71,6 +74,9 @@ def test_actor_exceptions(self):
                 prefactor=1, method_name="p3m", method_params={}))
         system.actors.clear()
 
+    @ut.skipIf(not espressomd.has_features('SCAFACOS') or
+               'p3m' not in espressomd.scafacos.available_methods(),
+               'Skipping test: missing ScaFaCoS p3m method')
     def test_actor_coulomb(self):
         system = self.system
 
@@ -79,6 +85,7 @@ def test_actor_coulomb(self):
             method_name="p3m",
             method_params={
                 "p3m_r_cut": 1.0,
+                "p3m_alpha": 2.799269,
                 "p3m_grid": 32,
                 "p3m_cao": 7}))
         actor = system.actors[0]
@@ -86,9 +93,12 @@ def test_actor_coulomb(self):
         self.assertEqual(params["prefactor"], 0.5)
         self.assertEqual(params["method_name"], "p3m")
         self.assertEqual(params["method_params"],
-                         {'p3m_cao': '7', 'p3m_r_cut': '1.0', 'p3m_grid': '32'})
+                         {'p3m_cao': '7', 'p3m_r_cut': '1.0',
+                          'p3m_grid': '32', 'p3m_alpha': '2.799269'})
 
-    @utx.skipIfMissingFeatures(["SCAFACOS_DIPOLES"])
+    @ut.skipIf(not espressomd.has_features('SCAFACOS_DIPOLES') or
+               'p2nfft' not in espressomd.scafacos.available_methods(),
+               'Skipping test: missing ScaFaCoS p2nfft method')
     def test_actor_dipoles(self):
         system = self.system
 
@@ -128,6 +138,7 @@ def p3m_data(self):
             accuracy=1e-5,
             cao=7,
             mesh=48,
+            r_cut=1.88672,
             epsilon="metallic")
         system.actors.add(dp3m)
 
@@ -146,11 +157,18 @@ def fcs_data(self):
 
         scafacos_coulomb = espressomd.electrostatics.Scafacos(
             prefactor=0.5,
-            method_name="p3m",
+            method_name="p2nfft",
             method_params={
-                "p3m_r_cut": 1.0,
-                "p3m_grid": 32,
-                "p3m_cao": 7})
+                "p2nfft_verbose_tuning": 0,
+                "pnfft_N": "32,32,32",
+                "pnfft_n": "32,32,32",
+                "tolerance_field": "5e-4",
+                "pnfft_window_name": "bspline",
+                "pnfft_m": "4",
+                "p2nfft_ignore_tolerance": "1",
+                "pnfft_diff_ik": "0",
+                "p2nfft_r_cut": "1.0",
+                "p2nfft_alpha": "2.92"})
         system.actors.add(scafacos_coulomb)
 
         scafacos_dipoles = espressomd.magnetostatics.Scafacos(
@@ -178,7 +196,10 @@ def fcs_data(self):
 
         return (ref_E_coulomb, ref_E_dipoles, ref_forces, ref_torques)
 
-    @utx.skipIfMissingFeatures(["SCAFACOS_DIPOLES", "LENNARD_JONES"])
+    @utx.skipIfMissingFeatures("LENNARD_JONES")
+    @ut.skipIf(not espressomd.has_features('SCAFACOS_DIPOLES') or
+               'p2nfft' not in espressomd.scafacos.available_methods(),
+               'Skipping test: missing SCAFACOS_DIPOLES or p2nfft method')
     def test_electrostatics_plus_magnetostatics(self):
         # check that two instances of ScaFaCoS can be used
         system = self.system
@@ -206,7 +227,7 @@ def test_electrostatics_plus_magnetostatics(self):
 
         self.assertAlmostEqual(fcs_E_coulomb, p3m_E_coulomb, delta=1e-4)
         self.assertAlmostEqual(fcs_E_dipoles, p3m_E_dipoles, delta=1e-4)
-        np.testing.assert_allclose(fcs_forces, p3m_forces, rtol=1e-3)
+        np.testing.assert_allclose(fcs_forces, p3m_forces, rtol=1e-2)
         np.testing.assert_allclose(fcs_torques, p3m_torques, rtol=1e-3)
 
 
diff --git a/testsuite/python/shapes.py b/testsuite/python/shapes.py
index 51e3f034319..063ed0618fe 100644
--- a/testsuite/python/shapes.py
+++ b/testsuite/python/shapes.py
@@ -27,9 +27,22 @@ def test_Union(self):
         union = espressomd.shapes.Union()
         wall1 = espressomd.shapes.Wall(normal=[0, 0, 1], dist=0)
         wall2 = espressomd.shapes.Wall(normal=[0, 0, -1], dist=-10)
+        self.assertTrue(union.call_method('empty'))
         union.add([wall1, wall2])
+        self.assertFalse(union.call_method('empty'))
         self.assertEqual(union.size(), 2)
 
+        # check object retrieval
+        pwall1, pwall2 = union.call_method('get_elements')
+        self.assertIsInstance(pwall1, espressomd.shapes.Wall)
+        self.assertIsInstance(pwall2, espressomd.shapes.Wall)
+        np.testing.assert_almost_equal(
+            np.copy(pwall1.normal), np.copy(wall1.normal))
+        np.testing.assert_almost_equal(
+            np.copy(pwall2.normal), np.copy(wall2.normal))
+        np.testing.assert_almost_equal(pwall1.dist, wall1.dist)
+        np.testing.assert_almost_equal(pwall2.dist, wall2.dist)
+
         self.assertAlmostEqual(union.calc_distance(
             position=[1, 2, 4.5])[0], 4.5)
         self.assertAlmostEqual(union.calc_distance(
@@ -41,6 +54,7 @@ def test_Union(self):
         with self.assertRaises(ValueError):
             union.calc_distance(position=[1, 2, 11.5])
         union.clear()
+        self.assertTrue(union.call_method('empty'))
         self.assertEqual(union.size(), 0)
         self.assertEqual(union.calc_distance(position=[1, 2, 6.5])[0], np.inf)
 
diff --git a/testsuite/python/test_checkpoint.py b/testsuite/python/test_checkpoint.py
index 9ba36f2e049..34eedc6954f 100644
--- a/testsuite/python/test_checkpoint.py
+++ b/testsuite/python/test_checkpoint.py
@@ -22,6 +22,9 @@
 
 import espressomd
 import espressomd.checkpointing
+import espressomd.electrostatics
+import espressomd.magnetostatics
+import espressomd.scafacos
 import espressomd.virtual_sites
 import espressomd.integrate
 from espressomd.shapes import Sphere, Wall
@@ -44,6 +47,16 @@ def setUpClass(cls):
                 '.', '__'),
             checkpoint_path="@CMAKE_CURRENT_BINARY_DIR@")
         cls.checkpoint.load(0)
+        cls.ref_box_l = np.array([12.0, 14.0, 16.0])
+        if 'DP3M' in modes:
+            cls.ref_box_l = np.array([16.0, 16.0, 16.0])
+
+    def get_active_actor_of_type(self, actor_type):
+        for actor in system.actors.active_actors:
+            if isinstance(actor, actor_type):
+                return actor
+        self.fail(
+            f"system doesn't have an actor of type {actor_type.__name__}")
 
     @ut.skipIf(not LB, "Skipping test due to missing mode.")
     def test_LB(self):
@@ -107,30 +120,20 @@ def test_EK(self):
             self.assertIn(key, state)
             self.assertAlmostEqual(reference[key], state[key], delta=1E-5)
         state_species = ek_species.get_params()
-        reference_species = {'density': 0.4, 'D': 0.02, 'valency': 0.3}
+        reference_species = {'density': 0.4, 'D': 0.02, 'valency': 0.3,
+                             'ext_force_density': [0.01, -0.08, 0.06]}
         for key in reference_species:
             self.assertIn(key, state_species)
-            self.assertAlmostEqual(
+            np.testing.assert_allclose(
                 reference_species[key],
                 state_species[key],
-                delta=1E-5)
-        self.assertAlmostEqual(
-            state_species['ext_force_density'][0],
-            0.01,
-            delta=1E-5)
-        self.assertAlmostEqual(
-            state_species['ext_force_density'][1],
-            -0.08,
-            delta=1E-5)
-        self.assertAlmostEqual(
-            state_species['ext_force_density'][2],
-            0.06,
-            delta=1E-5)
+                atol=1E-5)
 
     def test_variables(self):
         self.assertEqual(system.cell_system.skin, 0.1)
         self.assertEqual(system.time_step, 0.01)
         self.assertEqual(system.min_global_cut, 2.0)
+        np.testing.assert_allclose(np.copy(system.box_l), self.ref_box_l)
 
     def test_part(self):
         np.testing.assert_allclose(
@@ -317,12 +320,93 @@ def test_correlator(self):
             system.auto_update_accumulators[2].result(),
             expected)
 
+    @utx.skipIfMissingFeatures('DP3M')
+    @ut.skipIf('DP3M.CPU' not in modes,
+               "Skipping test due to missing combination.")
+    def test_dp3m(self):
+        actor = self.get_active_actor_of_type(
+            espressomd.magnetostatics.DipolarP3M)
+        state = actor.get_params()
+        reference = {'prefactor': 1.0, 'accuracy': 0.01, 'mesh': 3 * [8],
+                     'cao': 1, 'alpha': 12.0, 'r_cut': 2.4, 'tune': False,
+                     'mesh_off': [0.5, 0.5, 0.5], 'epsilon': 2.0}
+        for key in reference:
+            self.assertIn(key, state)
+            np.testing.assert_almost_equal(state[key], reference[key],
+                                           err_msg=f'for parameter {key}')
+
     @utx.skipIfMissingFeatures('P3M')
     @ut.skipIf('P3M.CPU' not in modes,
                "Skipping test due to missing combination.")
     def test_p3m(self):
-        self.assertTrue(any(isinstance(actor, espressomd.electrostatics.P3M)
-                            for actor in system.actors.active_actors))
+        actor = self.get_active_actor_of_type(espressomd.electrostatics.P3M)
+        state = actor.get_params()
+        reference = {'prefactor': 1.0, 'accuracy': 0.1, 'mesh': 3 * [10],
+                     'cao': 1, 'alpha': 1.0, 'r_cut': 1.0, 'tune': False}
+        for key in reference:
+            self.assertIn(key, state)
+            np.testing.assert_almost_equal(state[key], reference[key],
+                                           err_msg=f'for parameter {key}')
+
+    @utx.skipIfMissingFeatures('P3M')
+    @ut.skipIf('P3M.ELC' not in modes,
+               "Skipping test due to missing combination.")
+    def test_elc(self):
+        actor = self.get_active_actor_of_type(espressomd.electrostatics.ELC)
+        elc_state = actor.get_params()
+        p3m_state = elc_state['p3m_actor'].get_params()
+        p3m_reference = {'prefactor': 1.0, 'accuracy': 0.1, 'mesh': 3 * [10],
+                         'cao': 1, 'alpha': 1.0, 'r_cut': 1.0, 'tune': False}
+        elc_reference = {'gap_size': 6.0, 'maxPWerror': 0.1,
+                         'delta_mid_top': 0.9, 'delta_mid_bot': 0.1}
+        for key in elc_reference:
+            self.assertIn(key, elc_state)
+            np.testing.assert_almost_equal(elc_state[key], elc_reference[key],
+                                           err_msg=f'for parameter {key}')
+        for key in p3m_reference:
+            self.assertIn(key, p3m_state)
+            np.testing.assert_almost_equal(p3m_state[key], p3m_reference[key],
+                                           err_msg=f'for parameter {key}')
+
+    @ut.skipIf(not espressomd.has_features('SCAFACOS') or
+               'SCAFACOS' not in modes or
+               'p3m' not in espressomd.scafacos.available_methods(),
+               "Skipping test due to missing combination or p3m method.")
+    def test_scafacos(self):
+        actor = self.get_active_actor_of_type(
+            espressomd.electrostatics.Scafacos)
+        state = actor.get_params()
+        reference = {'prefactor': 0.5, 'method_name': 'p3m',
+                     'method_params': {
+                         'p3m_cao': '7',
+                         'p3m_r_cut': '1.0',
+                         'p3m_grid': '64',
+                         'p3m_alpha': '2.084652'}}
+        for key in reference:
+            self.assertEqual(state[key], reference[key], msg=f'for {key}')
+
+    @ut.skipIf(not espressomd.has_features('SCAFACOS_DIPOLES') or
+               'SCAFACOS' not in modes or
+               'p2nfft' not in espressomd.scafacos.available_methods(),
+               "Skipping test due to missing combination or p2nfft method.")
+    def test_scafacos_dipoles(self):
+        actor = self.get_active_actor_of_type(
+            espressomd.magnetostatics.Scafacos)
+        state = actor.get_params()
+        reference = {'prefactor': 1.2, 'method_name': 'p2nfft',
+                     'method_params': {
+                         "p2nfft_verbose_tuning": "0",
+                         "pnfft_N": "32,32,32",
+                         "pnfft_n": "32,32,32",
+                         "pnfft_window_name": "bspline",
+                         "pnfft_m": "4",
+                         "p2nfft_ignore_tolerance": "1",
+                         "pnfft_diff_ik": "0",
+                         "p2nfft_r_cut": "11",
+                         "p2nfft_alpha": "0.37"}}
+        for key in reference:
+            self.assertIn(key, state)
+            self.assertEqual(state[key], reference[key], msg=f'for {key}')
 
     @utx.skipIfMissingFeatures('COLLISION_DETECTION')
     def test_collision_detection(self):
@@ -350,6 +434,7 @@ def test_constraints(self):
         self.assertEqual(len(system.constraints),
                          8 - int(not espressomd.has_features("ELECTROSTATICS")))
         c = system.constraints
+        ref_shape = self.ref_box_l.astype(int) + 2
 
         self.assertIsInstance(c[0].shape, Sphere)
         self.assertAlmostEqual(c[0].shape.radius, 0.1, delta=1E-10)
@@ -370,8 +455,9 @@ def test_constraints(self):
         self.assertAlmostEqual(c[4].gamma, 2.3, delta=1E-10)
 
         self.assertIsInstance(c[5], constraints.PotentialField)
-        self.assertEqual(c[5].field.shape, (14, 16, 18, 1))
+        self.assertEqual(c[5].field.shape, tuple(list(ref_shape) + [1]))
         self.assertAlmostEqual(c[5].default_scale, 1.6, delta=1E-10)
+        self.assertAlmostEqual(c[5].particle_scales[5], 6.0, delta=1E-10)
         np.testing.assert_allclose(np.copy(c[5].origin), [-0.5, -0.5, -0.5])
         np.testing.assert_allclose(np.copy(c[5].grid_spacing), np.ones(3))
         ref_pot = constraints.PotentialField(
@@ -380,7 +466,7 @@ def test_constraints(self):
                                    atol=1e-10)
 
         self.assertIsInstance(c[6], constraints.ForceField)
-        self.assertEqual(c[6].field.shape, (14, 16, 18, 3))
+        self.assertEqual(c[6].field.shape, tuple(list(ref_shape) + [3]))
         self.assertAlmostEqual(c[6].default_scale, 1.4, delta=1E-10)
         np.testing.assert_allclose(np.copy(c[6].origin), [-0.5, -0.5, -0.5])
         np.testing.assert_allclose(np.copy(c[6].grid_spacing), np.ones(3))
diff --git a/testsuite/python/tests_common.py b/testsuite/python/tests_common.py
index 82b0b386bf5..72bce32f6fc 100644
--- a/testsuite/python/tests_common.py
+++ b/testsuite/python/tests_common.py
@@ -134,7 +134,7 @@ def abspath(path):
 
 
 def transform_pos_from_cartesian_to_polar_coordinates(pos):
-    """Transform the given cartesian coordinates to polar coordinates.
+    """Transform the given cartesian coordinates to cylindrical coordinates.
 
     Parameters
     ----------
@@ -167,33 +167,26 @@ def transform_vel_from_cartesian_to_polar_coordinates(pos, vel):
         (pos[0] * vel[1] - pos[1] * vel[0]) / np.sqrt(pos[0]**2 + pos[1]**2), vel[2]])
 
 
+def get_cylindrical_basis_vectors(pos):
+    phi = transform_pos_from_cartesian_to_polar_coordinates(pos)[1]
+    e_r = np.array([np.cos(phi), np.sin(phi), 0.])
+    e_phi = np.array([-np.sin(phi), np.cos(phi), 0.])
+    e_z = np.array([0., 0., 1.])
+    return e_r, e_phi, e_z
+
+
 def convert_vec_body_to_space(system, part, vec):
     A = rotation_matrix_quat(system, part)
     return np.dot(A.transpose(), vec)
 
 
-def rotation_matrix(axis, theta):
+def rodrigues_rot(vec, axis, angle):
     """
-    Return the rotation matrix associated with counterclockwise rotation about
-    the given axis by theta radians.
-
-    Parameters
-    ----------
-    axis : array_like :obj:`float`
-        Axis to rotate around.
-    theta : :obj:`float`
-        Rotation angle.
-
+    https://en.wikipedia.org/wiki/Rodrigues%27_rotation_formula#Statement
     """
-    axis = np.asarray(axis)
-    axis = axis / np.sqrt(np.dot(axis, axis))
-    a = np.cos(theta / 2.0)
-    b, c, d = -axis * np.sin(theta / 2.0)
-    aa, bb, cc, dd = a * a, b * b, c * c, d * d
-    bc, ad, ac, ab, bd, cd = b * c, a * d, a * c, a * b, b * d, c * d
-    return np.array([[aa + bb - cc - dd, 2 * (bc + ad), 2 * (bd - ac)],
-                     [2 * (bc - ad), aa + cc - bb - dd, 2 * (cd + ab)],
-                     [2 * (bd + ac), 2 * (cd - ab), aa + dd - bb - cc]])
+    axis = np.asarray(axis, dtype=float) / np.linalg.norm(axis)  # not in-place
+    return np.cos(angle) * vec + np.sin(angle) * np.cross(axis, vec) + \
+        (1 - np.cos(angle)) * np.dot(axis, vec) * axis
 
 
 def rotation_matrix_quat(system, part):
@@ -225,55 +218,41 @@ def rotation_matrix_quat(system, part):
     return A
 
 
-def get_cylindrical_bin_volume(
-        n_r_bins,
-        n_phi_bins,
-        n_z_bins,
-        min_r,
-        max_r,
-        min_phi,
-        max_phi,
-        min_z,
-        max_z):
+def normalize_cylindrical_hist(histogram, cyl_obs_params):
     """
-    Return the bin volumes for a cylindrical histogram.
+    Normalize a histogram in cylindrical coordinates by its bin volumes,
+    in place, and return it. Helper to test cylindrical histogram observables.
 
     Parameters
     ----------
-    n_r_bins : :obj:`float`
-        Number of bins in ``r`` direction.
-    n_phi_bins : :obj:`float`
-        Number of bins in ``phi`` direction.
-    n_z_bins : :obj:`float`
-        Number of bins in ``z`` direction.
-    min_r : :obj:`float`
-        Minimum considered value in ``r`` direction.
-    max_r : :obj:`float`
-        Maximum considered value in ``r`` direction.
-    min_phi : :obj:`float`
-        Minimum considered value in ``phi`` direction.
-    max_phi : :obj:`float`
-        Maximum considered value in ``phi`` direction.
-    min_z : :obj:`float`
-        Minimum considered value in ``z`` direction.
-    max_z : :obj:`float`
-        Maximum considered value in ``z`` direction.
+    histogram : :obj:`ndarray` of :obj:`float`
+        Histogram to normalize; the first of its (at least 3) axes is ``r``.
+    cyl_obs_params : :obj:`dict`
+        Common parameters of the cylindrical histogram observables. Must
+        provide ``n_X_bins``, ``min_X`` and ``max_X`` for X in r, phi, z.
+    """
 
-    Returns
-    -------
-    array_like
-        Bin volumes.
+    n_r_bins = cyl_obs_params['n_r_bins']
+    n_phi_bins = cyl_obs_params['n_phi_bins']
+    n_z_bins = cyl_obs_params['n_z_bins']
+    min_r = cyl_obs_params['min_r']
+    max_r = cyl_obs_params['max_r']
+    min_phi = cyl_obs_params['min_phi']
+    max_phi = cyl_obs_params['max_phi']
+    min_z = cyl_obs_params['min_z']
+    max_z = cyl_obs_params['max_z']
 
-    """
     bin_volume = np.zeros(n_r_bins)
     r_bin_size = (max_r - min_r) / n_r_bins
     phi_bin_size = (max_phi - min_phi) / n_phi_bins
     z_bin_size = (max_z - min_z) / n_z_bins
     for i in range(n_r_bins):
-        bin_volume[i] = np.pi * ((min_r + r_bin_size * (i + 1))**2.0 -
-                                 (min_r + r_bin_size * i)**2.0) * \
+        bin_volume = np.pi * ((min_r + r_bin_size * (i + 1))**2.0 -
+                              (min_r + r_bin_size * i)**2.0) * \
             phi_bin_size / (2.0 * np.pi) * z_bin_size
-    return bin_volume
+        histogram[i, :, :] /= bin_volume
+
+    return histogram
 
 
 def get_histogram(pos, obs_params, coord_system, **kwargs):
@@ -638,13 +617,6 @@ def gay_berne_potential(r_ij, u_i, u_j, epsilon_0, sigma_0, mu, nu, k_1, k_2):
     return 4. * epsilon * (rr**-12 - rr**-6)
 
 
-class DynamicDict(dict):
-
-    def __getitem__(self, key):
-        value = super().__getitem__(key)
-        return eval(value, self) if isinstance(value, str) else value
-
-
 def count_fluid_nodes(lbf):
     """Counts the non-boundary nodes in the passed lb fluid instance."""
 
@@ -654,3 +626,29 @@ def count_fluid_nodes(lbf):
             fluid_nodes += 1
 
     return fluid_nodes
+
+
+def random_dipoles(n_particles):
+    """
+    Generate random unit dipoles with isotropically distributed orientations.
+
+    Parameters
+    ----------
+    n_particles : :obj:`int`
+        Number of dipoles to generate.
+
+    Returns
+    -------
+    (N,3) array_like of :obj:`float`
+        One unit vector per row.
+
+    """
+    # uniform cos(theta) and uniform phi sample the unit sphere uniformly
+    cos_theta = 2 * np.random.random(n_particles) - 1
+    # theta is in [0, pi], hence sin(theta) is the non-negative root
+    sin_theta = np.sqrt(1 - cos_theta**2)
+    phi = 2 * np.pi * np.random.random(n_particles)
+    dip = np.array([sin_theta * np.cos(phi),
+                    sin_theta * np.sin(phi),
+                    cos_theta]).T
+    return dip
diff --git a/testsuite/python/utils.py b/testsuite/python/utils.py
index dbd6e88d75a..0a19d4b5a94 100644
--- a/testsuite/python/utils.py
+++ b/testsuite/python/utils.py
@@ -56,7 +56,7 @@ def test_is_valid_type(self):
         self.assertTrue(utils.is_valid_type(
             np.array([12], dtype=int)[0], int))
         self.assertTrue(utils.is_valid_type(
-            np.array([12], dtype=np.long)[0], int))
+            np.array([12], dtype=int)[0], int))
         self.assertTrue(utils.is_valid_type(
             np.array([1.], dtype=float)[0], float))
         self.assertTrue(utils.is_valid_type(
diff --git a/testsuite/python/virtual_sites_tracers_common.py b/testsuite/python/virtual_sites_tracers_common.py
index 45d7a791348..e36d98a56f0 100644
--- a/testsuite/python/virtual_sites_tracers_common.py
+++ b/testsuite/python/virtual_sites_tracers_common.py
@@ -108,9 +108,9 @@ def compute_angle(self):
         n1 = n1 / norm1
         n2 = n2 / norm2
 
-        cos_alpha = np.dot(n1, n2)
-        if cos_alpha > 1:
-            cos_alpha = 1
+        # round-off can push the dot product of unit vectors slightly outside
+        # of [-1, 1], where arccos returns nan
+        cos_alpha = np.clip(np.dot(n1, n2), -1., 1.)
         alpha = np.arccos(cos_alpha)
         return alpha
 
diff --git a/testsuite/scripts/samples/test_load_checkpoint.py b/testsuite/scripts/samples/test_load_checkpoint.py
index f3904590b4c..a4176768981 100644
--- a/testsuite/scripts/samples/test_load_checkpoint.py
+++ b/testsuite/scripts/samples/test_load_checkpoint.py
@@ -19,21 +19,19 @@
 import importlib_wrapper
 
 
-def shorten_loop(code):
-    breakpoint = "while True:"
-    assert breakpoint in code
-    code = code.replace(breakpoint, "for _ in range(6):", 1)
-    return code
-
-
 sample, skipIfMissingFeatures = importlib_wrapper.configure_and_import(
-    "@SAMPLES_DIR@/load_checkpoint.py", substitutions=shorten_loop)
+    "@SAMPLES_DIR@/load_checkpoint.py")
 
 
 @skipIfMissingFeatures
 class Sample(ut.TestCase):
     system = sample.system
 
+    def test_file_generation(self):
+        self.assertEqual(set(sample.checkpoint.get_registered_objects()),
+                         {'myvar', 'system', 'p3m'})
+        self.assertEqual(sample.myvar, "some script variable (updated value)")
+
 
 if __name__ == "__main__":
     ut.main()
diff --git a/testsuite/scripts/tutorials/test_active_matter__rectification_simulation.py b/testsuite/scripts/tutorials/test_active_matter__rectification_simulation.py
index 5a6ace54591..fe2b21c2dd5 100644
--- a/testsuite/scripts/tutorials/test_active_matter__rectification_simulation.py
+++ b/testsuite/scripts/tutorials/test_active_matter__rectification_simulation.py
@@ -20,9 +20,11 @@
 import os
 import numpy as np
 
+np.random.seed(40)
+
 tutorial, skipIfMissingFeatures = importlib_wrapper.configure_and_import(
     "@TUTORIALS_DIR@/active_matter/solutions/rectification_simulation.py",
-    cmd_arguments=[6.0], PROD_STEPS=100, PROD_LENGTH=150)
+    cmd_arguments=[6.0], PROD_STEPS=100, PROD_LENGTH=100)
 
 
 @skipIfMissingFeatures
diff --git a/testsuite/scripts/tutorials/test_raspberry_electrophoresis.py b/testsuite/scripts/tutorials/test_raspberry_electrophoresis.py
index 85176a36151..d1c17fe33fa 100644
--- a/testsuite/scripts/tutorials/test_raspberry_electrophoresis.py
+++ b/testsuite/scripts/tutorials/test_raspberry_electrophoresis.py
@@ -18,29 +18,29 @@
 import unittest as ut
 import importlib_wrapper
 import numpy as np
+np.random.seed(42)
 
 tutorial, skipIfMissingFeatures = importlib_wrapper.configure_and_import(
     "@TUTORIALS_DIR@/raspberry_electrophoresis/raspberry_electrophoresis.py",
-    gpu=True, box_l=20., E=0.25, num_iterations=200, num_steps_per_iteration=250)
+    gpu=True, box_l=20., num_iterations=20, num_steps_per_iteration=20)
 
 
 @skipIfMissingFeatures
 class Tutorial(ut.TestCase):
+    '''Check the raspberry travels a longer distance on the x-axis'''
     system = tutorial.system
 
     def test_trajectory_sample(self):
         trajectory = np.loadtxt('posVsTime_sample.dat')[:, 1:4]
-        # the raspberry should have traveled mostly on the x-axis
-        dist = np.abs(trajectory[-1, :] - trajectory[0, :])
-        self.assertGreater(dist[0], dist[1])
-        self.assertGreater(dist[0], dist[2])
+        x, y, z = np.abs(trajectory[-1, :] - trajectory[0, :])
+        self.assertGreater(x, y)
+        self.assertGreater(x, z)
 
     def test_trajectory_simulated(self):
         trajectory = np.loadtxt('posVsTime.dat')[:, 1:4]
-        # the raspberry should have traveled mostly on the x-axis,
-        # but due to insufficient sampling, it's not always the case
-        dist = np.abs(trajectory[-1, :] - trajectory[0, :])
-        self.assertGreater(dist[0], np.min(dist[1:]))
+        x, y, z = np.abs(trajectory[-1, :] - trajectory[0, :])
+        self.assertGreater(x, y)
+        self.assertGreater(x, z)
 
 
 if __name__ == "__main__":