From 0a763c5c145fec4c898589dd948e5d7a689be8f6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jean-No=C3=ABl=20Grad?= <jgrad@icp.uni-stuttgart.de>
Date: Wed, 13 Nov 2024 19:19:07 +0100
Subject: [PATCH 1/3] Improve LB GPU field accessors performance

---
 .../templates/FieldAccessors.tmpl.cu          | 56 ++++++++----
 .../templates/FieldAccessors.tmpl.cuh         |  2 +
 .../templates/FieldAccessors.tmpl.h           | 21 +++++
 .../src/lattice_boltzmann/LBWalberlaImpl.hpp  |  4 +-
 .../FieldAccessorsDoublePrecision.h           | 51 +++++++++++
 .../FieldAccessorsDoublePrecisionCUDA.cu      | 87 +++++++++++++------
 .../FieldAccessorsDoublePrecisionCUDA.cuh     |  1 +
 .../FieldAccessorsSinglePrecision.h           | 51 +++++++++++
 .../FieldAccessorsSinglePrecisionCUDA.cu      | 87 +++++++++++++------
 .../FieldAccessorsSinglePrecisionCUDA.cuh     |  1 +
 testsuite/python/lb_pressure_tensor.py        | 25 +++---
 11 files changed, 301 insertions(+), 85 deletions(-)

diff --git a/maintainer/walberla_kernels/templates/FieldAccessors.tmpl.cu b/maintainer/walberla_kernels/templates/FieldAccessors.tmpl.cu
index c9f8ae4dc9..df0be6acad 100644
--- a/maintainer/walberla_kernels/templates/FieldAccessors.tmpl.cu
+++ b/maintainer/walberla_kernels/templates/FieldAccessors.tmpl.cu
@@ -899,7 +899,7 @@ namespace Force {
 namespace MomentumDensity
 {
 // LCOV_EXCL_START
-    __global__ void kernel_sum(
+    __global__ void kernel_get(
         gpu::FieldAccessor< {{dtype}} > pdf,
         gpu::FieldAccessor< {{dtype}} > force,
         {{dtype}} * RESTRICT out )
@@ -914,7 +914,7 @@ namespace MomentumDensity
             {% endfor -%}
             {{momentum_density_getter | substitute_force_getter_cu | indent(8) }}
             {% for i in range(D) -%}
-                out[{{i}}u] += md_{{i}};
+                out[{{i}}u] = md_{{i}};
             {% endfor %}
         }
     }
@@ -924,19 +924,22 @@ namespace MomentumDensity
         gpu::GPUField< {{dtype}} > const * pdf_field,
         gpu::GPUField< {{dtype}} > const * force_field )
     {
-        thrust::device_vector< {{dtype}} > dev_data({{D}}u, {{dtype}} {0});
+        auto const ci = pdf_field->xyzSize();
+        thrust::device_vector< {{dtype}} > dev_data({{D}}u * ci.numCells());
         auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
-        WALBERLA_FOR_ALL_CELLS_XYZ(pdf_field, {
-            Cell cell(x, y, z);
-            CellInterval ci ( cell, cell );
-            auto kernel = gpu::make_kernel( kernel_sum );
-            kernel.addFieldIndexingParam( gpu::FieldIndexing< {{dtype}} >::interval( *pdf_field, ci ) );
-            kernel.addFieldIndexingParam( gpu::FieldIndexing< {{dtype}} >::interval( *force_field, ci ) );
-            kernel.addParam( dev_data_ptr );
-            kernel();
-        });
+        auto kernel = gpu::make_kernel( kernel_get );
+        kernel.addFieldIndexingParam( gpu::FieldIndexing< {{dtype}} >::interval( *pdf_field, ci) );
+        kernel.addFieldIndexingParam( gpu::FieldIndexing< {{dtype}} >::interval( *force_field, ci ) );
+        kernel.addParam( dev_data_ptr );
+        kernel();
+        std::vector< {{dtype}} > out(dev_data.size());
+        thrust::copy(dev_data.begin(), dev_data.end(), out.data());
         Vector{{D}}< {{dtype}} > mom({{dtype}} {0});
-        thrust::copy(dev_data.begin(), dev_data.begin() + {{D}}u, mom.data());
+        for (auto i = uint_t{ 0u }; i < {{D}}u * ci.numCells(); i += {{D}}u) {
+            {% for j in range(D) -%}
+                mom[{{j}}u] += out[i + {{j}}u];
+            {% endfor -%}
+        }
         return mom;
     }
 } // namespace MomentumDensity
@@ -977,7 +980,7 @@ namespace PressureTensor
         Matrix{{D}}< {{dtype}} > out;
         thrust::copy(dev_data.begin(), dev_data.end(), out.data());
         return out;
-   }
+    }
 
     std::vector< {{dtype}} > get(
         gpu::GPUField< {{dtype}} > const * pdf_field,
@@ -992,7 +995,30 @@ namespace PressureTensor
         std::vector< {{dtype}} > out(dev_data.size());
         thrust::copy(dev_data.begin(), dev_data.end(), out.data());
         return out;
-   }
+    }
+
+    Matrix{{D}}< {{dtype}} > reduce(
+        gpu::GPUField< {{dtype}} > const * pdf_field)
+    {
+        auto const ci = pdf_field->xyzSize();
+        thrust::device_vector< {{dtype}} > dev_data({{D**2}}u * ci.numCells());
+        auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
+        auto kernel = gpu::make_kernel( kernel_get );
+        kernel.addFieldIndexingParam( gpu::FieldIndexing< {{dtype}} >::interval( *pdf_field, ci ) );
+        kernel.addParam( dev_data_ptr );
+        kernel();
+        std::vector< {{dtype}} > out(dev_data.size());
+        thrust::copy(dev_data.begin(), dev_data.end(), out.data());
+        Matrix{{D}}< {{dtype}} > pressureTensor({{dtype}} {0});
+        for (auto i = uint_t{ 0u }; i < {{D**2}}u * ci.numCells(); i += {{D**2}}u) {
+            {% for i in range(D) -%}
+                {% for j in range(D) -%}
+                    pressureTensor[{{i*D+j}}u] += out[i + {{i*D+j}}u];
+                {% endfor %}
+            {% endfor %}
+        }
+        return pressureTensor;
+    }
 } // namespace PressureTensor
 
 
diff --git a/maintainer/walberla_kernels/templates/FieldAccessors.tmpl.cuh b/maintainer/walberla_kernels/templates/FieldAccessors.tmpl.cuh
index 65f776abee..ef29b2b508 100644
--- a/maintainer/walberla_kernels/templates/FieldAccessors.tmpl.cuh
+++ b/maintainer/walberla_kernels/templates/FieldAccessors.tmpl.cuh
@@ -213,6 +213,8 @@ namespace PressureTensor {
     std::vector< {{dtype}} >
     get( gpu::GPUField< {{dtype}} > const * pdf_field,
          CellInterval const & ci );
+    Matrix{{D}}< {{dtype}} >
+    reduce( gpu::GPUField< {{dtype}} > const * pdf_field );
 } // namespace PressureTensor
 
 } // namespace accessor
diff --git a/maintainer/walberla_kernels/templates/FieldAccessors.tmpl.h b/maintainer/walberla_kernels/templates/FieldAccessors.tmpl.h
index d443243bba..bcdbb4fd22 100644
--- a/maintainer/walberla_kernels/templates/FieldAccessors.tmpl.h
+++ b/maintainer/walberla_kernels/templates/FieldAccessors.tmpl.h
@@ -641,6 +641,27 @@ namespace PressureTensor
         }
         return out;
     }
+
+    inline auto
+    reduce( GhostLayerField< {{dtype}}, uint_t{ {{Q}}u } > const * pdf_field)
+    {
+        Matrix{{D}}< {{dtype}} > pressureTensor({{dtype}} {0});
+        WALBERLA_FOR_ALL_CELLS_XYZ(pdf_field, {
+            const {{dtype}} & xyz0 = pdf_field->get(x, y, z, uint_t{ 0u });
+            {% for i in range(Q) -%}
+                const {{dtype}} f_{{i}} = pdf_field->getF( &xyz0, uint_t{ {{i}}u });
+            {% endfor -%}
+
+            {{second_momentum_getter | indent(8) }}
+
+            {% for i in range(D) -%}
+                {% for j in range(D) -%}
+                    pressureTensor[{{i*D+j}}u] += p_{{i*D+j}};
+                {% endfor %}
+            {% endfor %}
+        });
+        return pressureTensor;
+    }
 } // namespace PressureTensor
 
 } // namespace accessor
diff --git a/src/walberla_bridge/src/lattice_boltzmann/LBWalberlaImpl.hpp b/src/walberla_bridge/src/lattice_boltzmann/LBWalberlaImpl.hpp
index 8986fed7b4..414433006f 100644
--- a/src/walberla_bridge/src/lattice_boltzmann/LBWalberlaImpl.hpp
+++ b/src/walberla_bridge/src/lattice_boltzmann/LBWalberlaImpl.hpp
@@ -1508,9 +1508,7 @@ class LBWalberlaImpl : public LBWalberlaBase {
     Matrix3<FloatType> tensor(FloatType{0});
     for (auto block = blocks->begin(); block != blocks->end(); ++block) {
       auto pdf_field = block->template getData<PdfField>(m_pdf_field_id);
-      WALBERLA_FOR_ALL_CELLS_XYZ(pdf_field, {
-        tensor += lbm::accessor::PressureTensor::get(pdf_field, Cell{x, y, z});
-      });
+      tensor += lbm::accessor::PressureTensor::reduce(pdf_field);
     }
     auto const grid_size = get_lattice().get_grid_dimensions();
     auto const number_of_nodes = Utils::product(grid_size);
diff --git a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/FieldAccessorsDoublePrecision.h b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/FieldAccessorsDoublePrecision.h
index c73cb58c14..bff4efa0fc 100644
--- a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/FieldAccessorsDoublePrecision.h
+++ b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/FieldAccessorsDoublePrecision.h
@@ -1242,6 +1242,57 @@ inline auto get(GhostLayerField<double, uint_t{19u}> const *pdf_field,
   }
   return out;
 }
+
+inline auto reduce(GhostLayerField<double, uint_t{19u}> const *pdf_field) {
+  Matrix3<double> pressureTensor(double{0});
+  WALBERLA_FOR_ALL_CELLS_XYZ(pdf_field, {
+    const double &xyz0 = pdf_field->get(x, y, z, uint_t{0u});
+    const double f_0 = pdf_field->getF(&xyz0, uint_t{0u});
+    const double f_1 = pdf_field->getF(&xyz0, uint_t{1u});
+    const double f_2 = pdf_field->getF(&xyz0, uint_t{2u});
+    const double f_3 = pdf_field->getF(&xyz0, uint_t{3u});
+    const double f_4 = pdf_field->getF(&xyz0, uint_t{4u});
+    const double f_5 = pdf_field->getF(&xyz0, uint_t{5u});
+    const double f_6 = pdf_field->getF(&xyz0, uint_t{6u});
+    const double f_7 = pdf_field->getF(&xyz0, uint_t{7u});
+    const double f_8 = pdf_field->getF(&xyz0, uint_t{8u});
+    const double f_9 = pdf_field->getF(&xyz0, uint_t{9u});
+    const double f_10 = pdf_field->getF(&xyz0, uint_t{10u});
+    const double f_11 = pdf_field->getF(&xyz0, uint_t{11u});
+    const double f_12 = pdf_field->getF(&xyz0, uint_t{12u});
+    const double f_13 = pdf_field->getF(&xyz0, uint_t{13u});
+    const double f_14 = pdf_field->getF(&xyz0, uint_t{14u});
+    const double f_15 = pdf_field->getF(&xyz0, uint_t{15u});
+    const double f_16 = pdf_field->getF(&xyz0, uint_t{16u});
+    const double f_17 = pdf_field->getF(&xyz0, uint_t{17u});
+    const double f_18 = pdf_field->getF(&xyz0, uint_t{18u});
+    const double p_0 =
+        f_10 + f_13 + f_14 + f_17 + f_18 + f_3 + f_4 + f_7 + f_8 + f_9;
+    const double p_1 = -f_10 - f_7 + f_8 + f_9;
+    const double p_2 = -f_13 + f_14 + f_17 - f_18;
+    const double p_3 = -f_10 - f_7 + f_8 + f_9;
+    const double p_4 =
+        f_1 + f_10 + f_11 + f_12 + f_15 + f_16 + f_2 + f_7 + f_8 + f_9;
+    const double p_5 = f_11 - f_12 - f_15 + f_16;
+    const double p_6 = -f_13 + f_14 + f_17 - f_18;
+    const double p_7 = f_11 - f_12 - f_15 + f_16;
+    const double p_8 =
+        f_11 + f_12 + f_13 + f_14 + f_15 + f_16 + f_17 + f_18 + f_5 + f_6;
+
+    pressureTensor[0u] += p_0;
+    pressureTensor[1u] += p_1;
+    pressureTensor[2u] += p_2;
+
+    pressureTensor[3u] += p_3;
+    pressureTensor[4u] += p_4;
+    pressureTensor[5u] += p_5;
+
+    pressureTensor[6u] += p_6;
+    pressureTensor[7u] += p_7;
+    pressureTensor[8u] += p_8;
+  });
+  return pressureTensor;
+}
 } // namespace PressureTensor
 
 } // namespace accessor
diff --git a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/FieldAccessorsDoublePrecisionCUDA.cu b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/FieldAccessorsDoublePrecisionCUDA.cu
index 62dda87100..ee087416d9 100644
--- a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/FieldAccessorsDoublePrecisionCUDA.cu
+++ b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/FieldAccessorsDoublePrecisionCUDA.cu
@@ -756,19 +756,6 @@ double get(
   return rho;
 }
 
-void set(
-    gpu::GPUField<double> *pdf_field,
-    const double rho,
-    Cell const &cell) {
-  CellInterval ci(cell, cell);
-  thrust::device_vector<double> dev_data(1u, rho);
-  auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
-  auto kernel = gpu::make_kernel(kernel_set);
-  kernel.addFieldIndexingParam(gpu::FieldIndexing<double>::interval(*pdf_field, ci));
-  kernel.addParam(const_cast<const double *>(dev_data_ptr));
-  kernel();
-}
-
 std::vector<double> get(
     gpu::GPUField<double> const *pdf_field,
     CellInterval const &ci) {
@@ -783,6 +770,19 @@ std::vector<double> get(
   return out;
 }
 
+void set(
+    gpu::GPUField<double> *pdf_field,
+    const double rho,
+    Cell const &cell) {
+  CellInterval ci(cell, cell);
+  thrust::device_vector<double> dev_data(1u, rho);
+  auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
+  auto kernel = gpu::make_kernel(kernel_set);
+  kernel.addFieldIndexingParam(gpu::FieldIndexing<double>::interval(*pdf_field, ci));
+  kernel.addParam(const_cast<const double *>(dev_data_ptr));
+  kernel();
+}
+
 void set(
     gpu::GPUField<double> *pdf_field,
     std::vector<double> const &values,
@@ -1048,7 +1048,7 @@ void set(gpu::GPUField<double> const *pdf_field,
 
 namespace MomentumDensity {
 // LCOV_EXCL_START
-__global__ void kernel_sum(
+__global__ void kernel_get(
     gpu::FieldAccessor<double> pdf,
     gpu::FieldAccessor<double> force,
     double *RESTRICT out) {
@@ -1086,9 +1086,9 @@ __global__ void kernel_sum(
     const double md_0 = force.get(0) * 0.50000000000000000 + momdensity_0;
     const double md_1 = force.get(1) * 0.50000000000000000 + momdensity_1;
     const double md_2 = force.get(2) * 0.50000000000000000 + momdensity_2;
-    out[0u] += md_0;
-    out[1u] += md_1;
-    out[2u] += md_2;
+    out[0u] = md_0;
+    out[1u] = md_1;
+    out[2u] = md_2;
   }
 }
 // LCOV_EXCL_STOP
@@ -1096,19 +1096,22 @@ __global__ void kernel_sum(
 Vector3<double> reduce(
     gpu::GPUField<double> const *pdf_field,
     gpu::GPUField<double> const *force_field) {
-  thrust::device_vector<double> dev_data(3u, double{0});
+  auto const ci = pdf_field->xyzSize();
+  thrust::device_vector<double> dev_data(3u * ci.numCells());
   auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
-  WALBERLA_FOR_ALL_CELLS_XYZ(pdf_field, {
-    Cell cell(x, y, z);
-    CellInterval ci(cell, cell);
-    auto kernel = gpu::make_kernel(kernel_sum);
-    kernel.addFieldIndexingParam(gpu::FieldIndexing<double>::interval(*pdf_field, ci));
-    kernel.addFieldIndexingParam(gpu::FieldIndexing<double>::interval(*force_field, ci));
-    kernel.addParam(dev_data_ptr);
-    kernel();
-  });
+  auto kernel = gpu::make_kernel(kernel_get);
+  kernel.addFieldIndexingParam(gpu::FieldIndexing<double>::interval(*pdf_field, ci));
+  kernel.addFieldIndexingParam(gpu::FieldIndexing<double>::interval(*force_field, ci));
+  kernel.addParam(dev_data_ptr);
+  kernel();
+  std::vector<double> out(dev_data.size());
+  thrust::copy(dev_data.begin(), dev_data.end(), out.data());
   Vector3<double> mom(double{0});
-  thrust::copy(dev_data.begin(), dev_data.begin() + 3u, mom.data());
+  for (auto i = uint_t{0u}; i < 3u * ci.numCells(); i += 3u) {
+    mom[0u] += out[i + 0u];
+    mom[1u] += out[i + 1u];
+    mom[2u] += out[i + 2u];
+  }
   return mom;
 }
 } // namespace MomentumDensity
@@ -1191,6 +1194,34 @@ std::vector<double> get(
   thrust::copy(dev_data.begin(), dev_data.end(), out.data());
   return out;
 }
+
+Matrix3<double> reduce(
+    gpu::GPUField<double> const *pdf_field) {
+  auto const ci = pdf_field->xyzSize();
+  thrust::device_vector<double> dev_data(9u * ci.numCells());
+  auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
+  auto kernel = gpu::make_kernel(kernel_get);
+  kernel.addFieldIndexingParam(gpu::FieldIndexing<double>::interval(*pdf_field, ci));
+  kernel.addParam(dev_data_ptr);
+  kernel();
+  std::vector<double> out(dev_data.size());
+  thrust::copy(dev_data.begin(), dev_data.end(), out.data());
+  Matrix3<double> pressureTensor(double{0});
+  for (auto i = uint_t{0u}; i < 9u * ci.numCells(); i += 9u) {
+    pressureTensor[0u] += out[i + 0u];
+    pressureTensor[1u] += out[i + 1u];
+    pressureTensor[2u] += out[i + 2u];
+
+    pressureTensor[3u] += out[i + 3u];
+    pressureTensor[4u] += out[i + 4u];
+    pressureTensor[5u] += out[i + 5u];
+
+    pressureTensor[6u] += out[i + 6u];
+    pressureTensor[7u] += out[i + 7u];
+    pressureTensor[8u] += out[i + 8u];
+  }
+  return pressureTensor;
+}
 } // namespace PressureTensor
 
 } // namespace accessor
diff --git a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/FieldAccessorsDoublePrecisionCUDA.cuh b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/FieldAccessorsDoublePrecisionCUDA.cuh
index 6ac754f263..7435c52b71 100644
--- a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/FieldAccessorsDoublePrecisionCUDA.cuh
+++ b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/FieldAccessorsDoublePrecisionCUDA.cuh
@@ -163,6 +163,7 @@ namespace PressureTensor {
 Matrix3<double> get(gpu::GPUField<double> const *pdf_field, Cell const &cell);
 std::vector<double> get(gpu::GPUField<double> const *pdf_field,
                         CellInterval const &ci);
+Matrix3<double> reduce(gpu::GPUField<double> const *pdf_field);
 } // namespace PressureTensor
 
 } // namespace accessor
diff --git a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/FieldAccessorsSinglePrecision.h b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/FieldAccessorsSinglePrecision.h
index a7711c4f78..11043cefb8 100644
--- a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/FieldAccessorsSinglePrecision.h
+++ b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/FieldAccessorsSinglePrecision.h
@@ -1244,6 +1244,57 @@ inline auto get(GhostLayerField<float, uint_t{19u}> const *pdf_field,
   }
   return out;
 }
+
+inline auto reduce(GhostLayerField<float, uint_t{19u}> const *pdf_field) {
+  Matrix3<float> pressureTensor(float{0});
+  WALBERLA_FOR_ALL_CELLS_XYZ(pdf_field, {
+    const float &xyz0 = pdf_field->get(x, y, z, uint_t{0u});
+    const float f_0 = pdf_field->getF(&xyz0, uint_t{0u});
+    const float f_1 = pdf_field->getF(&xyz0, uint_t{1u});
+    const float f_2 = pdf_field->getF(&xyz0, uint_t{2u});
+    const float f_3 = pdf_field->getF(&xyz0, uint_t{3u});
+    const float f_4 = pdf_field->getF(&xyz0, uint_t{4u});
+    const float f_5 = pdf_field->getF(&xyz0, uint_t{5u});
+    const float f_6 = pdf_field->getF(&xyz0, uint_t{6u});
+    const float f_7 = pdf_field->getF(&xyz0, uint_t{7u});
+    const float f_8 = pdf_field->getF(&xyz0, uint_t{8u});
+    const float f_9 = pdf_field->getF(&xyz0, uint_t{9u});
+    const float f_10 = pdf_field->getF(&xyz0, uint_t{10u});
+    const float f_11 = pdf_field->getF(&xyz0, uint_t{11u});
+    const float f_12 = pdf_field->getF(&xyz0, uint_t{12u});
+    const float f_13 = pdf_field->getF(&xyz0, uint_t{13u});
+    const float f_14 = pdf_field->getF(&xyz0, uint_t{14u});
+    const float f_15 = pdf_field->getF(&xyz0, uint_t{15u});
+    const float f_16 = pdf_field->getF(&xyz0, uint_t{16u});
+    const float f_17 = pdf_field->getF(&xyz0, uint_t{17u});
+    const float f_18 = pdf_field->getF(&xyz0, uint_t{18u});
+    const float p_0 =
+        f_10 + f_13 + f_14 + f_17 + f_18 + f_3 + f_4 + f_7 + f_8 + f_9;
+    const float p_1 = -f_10 - f_7 + f_8 + f_9;
+    const float p_2 = -f_13 + f_14 + f_17 - f_18;
+    const float p_3 = -f_10 - f_7 + f_8 + f_9;
+    const float p_4 =
+        f_1 + f_10 + f_11 + f_12 + f_15 + f_16 + f_2 + f_7 + f_8 + f_9;
+    const float p_5 = f_11 - f_12 - f_15 + f_16;
+    const float p_6 = -f_13 + f_14 + f_17 - f_18;
+    const float p_7 = f_11 - f_12 - f_15 + f_16;
+    const float p_8 =
+        f_11 + f_12 + f_13 + f_14 + f_15 + f_16 + f_17 + f_18 + f_5 + f_6;
+
+    pressureTensor[0u] += p_0;
+    pressureTensor[1u] += p_1;
+    pressureTensor[2u] += p_2;
+
+    pressureTensor[3u] += p_3;
+    pressureTensor[4u] += p_4;
+    pressureTensor[5u] += p_5;
+
+    pressureTensor[6u] += p_6;
+    pressureTensor[7u] += p_7;
+    pressureTensor[8u] += p_8;
+  });
+  return pressureTensor;
+}
 } // namespace PressureTensor
 
 } // namespace accessor
diff --git a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/FieldAccessorsSinglePrecisionCUDA.cu b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/FieldAccessorsSinglePrecisionCUDA.cu
index e0c8dd8102..9631c3e844 100644
--- a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/FieldAccessorsSinglePrecisionCUDA.cu
+++ b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/FieldAccessorsSinglePrecisionCUDA.cu
@@ -756,19 +756,6 @@ float get(
   return rho;
 }
 
-void set(
-    gpu::GPUField<float> *pdf_field,
-    const float rho,
-    Cell const &cell) {
-  CellInterval ci(cell, cell);
-  thrust::device_vector<float> dev_data(1u, rho);
-  auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
-  auto kernel = gpu::make_kernel(kernel_set);
-  kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*pdf_field, ci));
-  kernel.addParam(const_cast<const float *>(dev_data_ptr));
-  kernel();
-}
-
 std::vector<float> get(
     gpu::GPUField<float> const *pdf_field,
     CellInterval const &ci) {
@@ -783,6 +770,19 @@ std::vector<float> get(
   return out;
 }
 
+void set(
+    gpu::GPUField<float> *pdf_field,
+    const float rho,
+    Cell const &cell) {
+  CellInterval ci(cell, cell);
+  thrust::device_vector<float> dev_data(1u, rho);
+  auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
+  auto kernel = gpu::make_kernel(kernel_set);
+  kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*pdf_field, ci));
+  kernel.addParam(const_cast<const float *>(dev_data_ptr));
+  kernel();
+}
+
 void set(
     gpu::GPUField<float> *pdf_field,
     std::vector<float> const &values,
@@ -1048,7 +1048,7 @@ void set(gpu::GPUField<float> const *pdf_field,
 
 namespace MomentumDensity {
 // LCOV_EXCL_START
-__global__ void kernel_sum(
+__global__ void kernel_get(
     gpu::FieldAccessor<float> pdf,
     gpu::FieldAccessor<float> force,
     float *RESTRICT out) {
@@ -1086,9 +1086,9 @@ __global__ void kernel_sum(
     const float md_0 = force.get(0) * 0.50000000000000000f + momdensity_0;
     const float md_1 = force.get(1) * 0.50000000000000000f + momdensity_1;
     const float md_2 = force.get(2) * 0.50000000000000000f + momdensity_2;
-    out[0u] += md_0;
-    out[1u] += md_1;
-    out[2u] += md_2;
+    out[0u] = md_0;
+    out[1u] = md_1;
+    out[2u] = md_2;
   }
 }
 // LCOV_EXCL_STOP
@@ -1096,19 +1096,22 @@ __global__ void kernel_sum(
 Vector3<float> reduce(
     gpu::GPUField<float> const *pdf_field,
     gpu::GPUField<float> const *force_field) {
-  thrust::device_vector<float> dev_data(3u, float{0});
+  auto const ci = pdf_field->xyzSize();
+  thrust::device_vector<float> dev_data(3u * ci.numCells());
   auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
-  WALBERLA_FOR_ALL_CELLS_XYZ(pdf_field, {
-    Cell cell(x, y, z);
-    CellInterval ci(cell, cell);
-    auto kernel = gpu::make_kernel(kernel_sum);
-    kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*pdf_field, ci));
-    kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*force_field, ci));
-    kernel.addParam(dev_data_ptr);
-    kernel();
-  });
+  auto kernel = gpu::make_kernel(kernel_get);
+  kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*pdf_field, ci));
+  kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*force_field, ci));
+  kernel.addParam(dev_data_ptr);
+  kernel();
+  std::vector<float> out(dev_data.size());
+  thrust::copy(dev_data.begin(), dev_data.end(), out.data());
   Vector3<float> mom(float{0});
-  thrust::copy(dev_data.begin(), dev_data.begin() + 3u, mom.data());
+  for (auto i = uint_t{0u}; i < 3u * ci.numCells(); i += 3u) {
+    mom[0u] += out[i + 0u];
+    mom[1u] += out[i + 1u];
+    mom[2u] += out[i + 2u];
+  }
   return mom;
 }
 } // namespace MomentumDensity
@@ -1191,6 +1194,34 @@ std::vector<float> get(
   thrust::copy(dev_data.begin(), dev_data.end(), out.data());
   return out;
 }
+
+Matrix3<float> reduce(
+    gpu::GPUField<float> const *pdf_field) {
+  auto const ci = pdf_field->xyzSize();
+  thrust::device_vector<float> dev_data(9u * ci.numCells());
+  auto const dev_data_ptr = thrust::raw_pointer_cast(dev_data.data());
+  auto kernel = gpu::make_kernel(kernel_get);
+  kernel.addFieldIndexingParam(gpu::FieldIndexing<float>::interval(*pdf_field, ci));
+  kernel.addParam(dev_data_ptr);
+  kernel();
+  std::vector<float> out(dev_data.size());
+  thrust::copy(dev_data.begin(), dev_data.end(), out.data());
+  Matrix3<float> pressureTensor(float{0});
+  for (auto i = uint_t{0u}; i < 9u * ci.numCells(); i += 9u) {
+    pressureTensor[0u] += out[i + 0u];
+    pressureTensor[1u] += out[i + 1u];
+    pressureTensor[2u] += out[i + 2u];
+
+    pressureTensor[3u] += out[i + 3u];
+    pressureTensor[4u] += out[i + 4u];
+    pressureTensor[5u] += out[i + 5u];
+
+    pressureTensor[6u] += out[i + 6u];
+    pressureTensor[7u] += out[i + 7u];
+    pressureTensor[8u] += out[i + 8u];
+  }
+  return pressureTensor;
+}
 } // namespace PressureTensor
 
 } // namespace accessor
diff --git a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/FieldAccessorsSinglePrecisionCUDA.cuh b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/FieldAccessorsSinglePrecisionCUDA.cuh
index 9d25a37a85..fd12e2e019 100644
--- a/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/FieldAccessorsSinglePrecisionCUDA.cuh
+++ b/src/walberla_bridge/src/lattice_boltzmann/generated_kernels/FieldAccessorsSinglePrecisionCUDA.cuh
@@ -159,6 +159,7 @@ namespace PressureTensor {
 Matrix3<float> get(gpu::GPUField<float> const *pdf_field, Cell const &cell);
 std::vector<float> get(gpu::GPUField<float> const *pdf_field,
                        CellInterval const &ci);
+Matrix3<float> reduce(gpu::GPUField<float> const *pdf_field);
 } // namespace PressureTensor
 
 } // namespace accessor
diff --git a/testsuite/python/lb_pressure_tensor.py b/testsuite/python/lb_pressure_tensor.py
index 59ff0f2b5d..d4fb48b231 100644
--- a/testsuite/python/lb_pressure_tensor.py
+++ b/testsuite/python/lb_pressure_tensor.py
@@ -43,19 +43,22 @@ class TestLBPressureTensor:
     system.time_step = params["tau"]
     system.cell_system.skin = 0
 
-    def tearDown(self):
-        self.system.lb = None
-        self.system.thermostat.turn_off()
+    @classmethod
+    def tearDownClass(cls):
+        cls.system.lb = None
+        cls.system.thermostat.turn_off()
 
-    def setUp(self):
+    @classmethod
+    def setUpClass(cls):
         # Setup
-        self.lbf = self.lb_class(**self.params, **self.lb_params)
-        self.system.lb = self.lbf
-        self.system.thermostat.set_lb(LB_fluid=self.lbf, seed=42)
+        cls.lbf = cls.lb_class(**cls.params, **cls.lb_params)
+        cls.system.lb = cls.lbf
 
         # Warmup
-        self.system.integrator.run(500)
+        cls.system.integrator.run(500)
+        cls.sample_pressure(cls)
 
+    def sample_pressure(self):
         # Sampling
         self.p_global = np.zeros((self.steps, 3, 3))
         self.p_node0 = np.zeros((self.steps, 3, 3))
@@ -188,10 +191,10 @@ def test_gk_viscosity(self):
                 numeric_integral = np.trapz(acf[:len(ts)], dx=2 * self.params["tau"])
 
                 # fit tail
-                def f(x, a, b): return a * np.exp(-b * x)
+                def fit(x, a, b): return a * np.exp(-b * x)
 
-                (a, b), _ = scipy.optimize.curve_fit(f, acf[:len(ts)], ts)
-                tail = f(ts[-1], a, b) / b
+                (a, b), _ = scipy.optimize.curve_fit(fit, acf[:len(ts)], ts)
+                tail = fit(ts[-1], a, b) / b
 
                 integral = numeric_integral + tail
 

From dceb1d00a2c4793f1a229a2e8d636eac1d02fd0d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jean-No=C3=ABl=20Grad?= <jgrad@icp.uni-stuttgart.de>
Date: Thu, 14 Nov 2024 23:24:28 +0100
Subject: [PATCH 2/3] Implement LB GPU VTK kernels

---
 .../src/lattice_boltzmann/LBWalberlaImpl.hpp  | 105 +++++++++++++-----
 testsuite/python/CMakeLists.txt               |   4 +-
 testsuite/python/lattice_vtk.py               |  48 +++++---
 testsuite/python/save_checkpoint.py           |  29 +++--
 testsuite/python/test_checkpoint.py           |   1 -
 5 files changed, 127 insertions(+), 60 deletions(-)

diff --git a/src/walberla_bridge/src/lattice_boltzmann/LBWalberlaImpl.hpp b/src/walberla_bridge/src/lattice_boltzmann/LBWalberlaImpl.hpp
index 414433006f..3c1b11219f 100644
--- a/src/walberla_bridge/src/lattice_boltzmann/LBWalberlaImpl.hpp
+++ b/src/walberla_bridge/src/lattice_boltzmann/LBWalberlaImpl.hpp
@@ -40,6 +40,7 @@
 #include <stencil/D3Q27.h>
 #if defined(__CUDACC__)
 #include <gpu/AddGPUFieldToStorage.h>
+#include <gpu/HostFieldAllocator.h>
 #include <gpu/communication/MemcpyPackInfo.h>
 #include <gpu/communication/UniformGPUScheme.h>
 #endif
@@ -173,6 +174,10 @@ class LBWalberlaImpl : public LBWalberlaBase {
   using FlagField = typename BoundaryModel::FlagField;
 #if defined(__CUDACC__)
   using GPUField = gpu::GPUField<FloatType>;
+  using PdfFieldCpu =
+      typename FieldTrait<FloatType, lbmpy::Arch::CPU>::PdfField;
+  using VectorFieldCpu =
+      typename FieldTrait<FloatType, lbmpy::Arch::CPU>::VectorField;
 #endif
 
   struct GhostComm {
@@ -302,7 +307,12 @@ class LBWalberlaImpl : public LBWalberlaBase {
   BlockDataID m_force_to_be_applied_id;
 
   BlockDataID m_velocity_field_id;
-  BlockDataID m_vec_tmp_field_id;
+  BlockDataID m_vel_tmp_field_id;
+
+#if defined(__CUDACC__)
+  std::optional<BlockDataID> m_pdf_cpu_field_id;
+  std::optional<BlockDataID> m_vel_cpu_field_id;
+#endif
 
   /** Flag for boundary cells. */
   FlagUID const Boundary_flag{"boundary"};
@@ -364,6 +374,10 @@ class LBWalberlaImpl : public LBWalberlaBase {
   // lattice
   std::shared_ptr<LatticeWalberla> m_lattice;
 
+#if defined(__CUDACC__)
+  std::shared_ptr<gpu::HostFieldAllocator<FloatType>> m_host_field_allocator;
+#endif
+
   [[nodiscard]] std::optional<CellInterval>
   get_interval(Utils::Vector3i const &lower_corner,
                Utils::Vector3i const &upper_corner) const {
@@ -474,7 +488,11 @@ class LBWalberlaImpl : public LBWalberlaBase {
     m_last_applied_force_field_id = add_to_storage<_VectorField>("force last");
     m_force_to_be_applied_id = add_to_storage<_VectorField>("force next");
     m_velocity_field_id = add_to_storage<_VectorField>("velocity");
-    m_vec_tmp_field_id = add_to_storage<_VectorField>("velocity_tmp");
+    m_vel_tmp_field_id = add_to_storage<_VectorField>("velocity_tmp");
+#if defined(__CUDACC__)
+    m_host_field_allocator =
+        std::make_shared<gpu::HostFieldAllocator<FloatType>>();
+#endif
 
     // Initialize and register pdf field
     auto pdf_setter =
@@ -768,13 +786,13 @@ class LBWalberlaImpl : public LBWalberlaBase {
             m_lees_edwards_callbacks->get_pos_offset);
     m_lees_edwards_vel_interpol_sweep = std::make_shared<
         InterpolateAndShiftAtBoundary<_VectorField, FloatType>>(
-        blocks, m_velocity_field_id, m_vec_tmp_field_id, n_ghost_layers,
+        blocks, m_velocity_field_id, m_vel_tmp_field_id, n_ghost_layers,
         shear_direction, shear_plane_normal,
         m_lees_edwards_callbacks->get_pos_offset,
         m_lees_edwards_callbacks->get_shear_velocity);
     m_lees_edwards_last_applied_force_interpol_sweep = std::make_shared<
         InterpolateAndShiftAtBoundary<_VectorField, FloatType>>(
-        blocks, m_last_applied_force_field_id, m_vec_tmp_field_id,
+        blocks, m_last_applied_force_field_id, m_vel_tmp_field_id,
         n_ghost_layers, shear_direction, shear_plane_normal,
         m_lees_edwards_callbacks->get_pos_offset);
     setup_streaming_communicator();
@@ -1577,13 +1595,9 @@ class LBWalberlaImpl : public LBWalberlaBase {
   }
 
   void register_vtk_field_filters(walberla::vtk::VTKOutput &vtk_obj) override {
-    if constexpr (Architecture == lbmpy::Arch::GPU) {
-      throw std::runtime_error("VTK output not supported for GPU");
-    } else {
-      field::FlagFieldCellFilter<FlagField> fluid_filter(m_flag_field_id);
-      fluid_filter.addFlag(Boundary_flag);
-      vtk_obj.addCellExclusionFilter(fluid_filter);
-    }
+    field::FlagFieldCellFilter<FlagField> fluid_filter(m_flag_field_id);
+    fluid_filter.addFlag(Boundary_flag);
+    vtk_obj.addCellExclusionFilter(fluid_filter);
   }
 
 protected:
@@ -1672,26 +1686,61 @@ class LBWalberlaImpl : public LBWalberlaBase {
   void register_vtk_field_writers(walberla::vtk::VTKOutput &vtk_obj,
                                   LatticeModel::units_map const &units,
                                   int flag_observables) override {
-    if constexpr (Architecture == lbmpy::Arch::GPU) {
-      throw std::runtime_error("VTK output not supported for GPU");
-    } else {
-      if (flag_observables & static_cast<int>(OutputVTK::density)) {
-        auto const unit_conversion = FloatType_c(units.at("density"));
-        vtk_obj.addCellDataWriter(std::make_shared<DensityVTKWriter<float>>(
-            m_pdf_field_id, "density", unit_conversion));
+#if defined(__CUDACC__)
+    auto const allocate_cpu_field_if_empty =
+        [&]<typename Field>(auto const &blocks, std::string name,
+                            std::optional<BlockDataID> &cpu_field) {
+          if (not cpu_field) {
+            cpu_field = field::addToStorage<Field>(
+                blocks, name, FloatType{0}, field::fzyx,
+                m_lattice->get_ghost_layers(), m_host_field_allocator);
+          }
+        };
+#endif
+    if (flag_observables & static_cast<int>(OutputVTK::density)) {
+      auto const unit_conversion = FloatType_c(units.at("density"));
+#if defined(__CUDACC__)
+      if constexpr (Architecture == lbmpy::Arch::GPU) {
+        auto const &blocks = m_lattice->get_blocks();
+        allocate_cpu_field_if_empty.template operator()<PdfFieldCpu>(
+            blocks, "pdfs_cpu", m_pdf_cpu_field_id);
+        vtk_obj.addBeforeFunction(gpu::fieldCpyFunctor<PdfFieldCpu, PdfField>(
+            blocks, *m_pdf_cpu_field_id, m_pdf_field_id));
       }
-      if (flag_observables & static_cast<int>(OutputVTK::velocity_vector)) {
-        auto const unit_conversion = FloatType_c(units.at("velocity"));
-        vtk_obj.addCellDataWriter(std::make_shared<VelocityVTKWriter<float>>(
-            m_velocity_field_id, "velocity_vector", unit_conversion));
+#endif
+      vtk_obj.addCellDataWriter(std::make_shared<DensityVTKWriter<float>>(
+          m_pdf_field_id, "density", unit_conversion));
+    }
+    if (flag_observables & static_cast<int>(OutputVTK::velocity_vector)) {
+      auto const unit_conversion = FloatType_c(units.at("velocity"));
+#if defined(__CUDACC__)
+      if constexpr (Architecture == lbmpy::Arch::GPU) {
+        auto const &blocks = m_lattice->get_blocks();
+        allocate_cpu_field_if_empty.template operator()<VectorFieldCpu>(
+            blocks, "vel_cpu", m_vel_cpu_field_id);
+        vtk_obj.addBeforeFunction(
+            gpu::fieldCpyFunctor<VectorFieldCpu, VectorField>(
+                blocks, *m_vel_cpu_field_id, m_velocity_field_id));
       }
-      if (flag_observables & static_cast<int>(OutputVTK::pressure_tensor)) {
-        auto const unit_conversion = FloatType_c(units.at("pressure"));
-        vtk_obj.addCellDataWriter(
-            std::make_shared<PressureTensorVTKWriter<float>>(
-                m_pdf_field_id, "pressure_tensor", unit_conversion,
-                pressure_tensor_correction_factor()));
+#endif
+      vtk_obj.addCellDataWriter(std::make_shared<VelocityVTKWriter<float>>(
+          m_velocity_field_id, "velocity_vector", unit_conversion));
+    }
+    if (flag_observables & static_cast<int>(OutputVTK::pressure_tensor)) {
+      auto const unit_conversion = FloatType_c(units.at("pressure"));
+#if defined(__CUDACC__)
+      if constexpr (Architecture == lbmpy::Arch::GPU) {
+        auto const &blocks = m_lattice->get_blocks();
+        allocate_cpu_field_if_empty.template operator()<PdfFieldCpu>(
+            blocks, "pdfs_cpu", m_pdf_cpu_field_id);
+        vtk_obj.addBeforeFunction(gpu::fieldCpyFunctor<PdfFieldCpu, PdfField>(
+            blocks, *m_pdf_cpu_field_id, m_pdf_field_id));
       }
+#endif
+      vtk_obj.addCellDataWriter(
+          std::make_shared<PressureTensorVTKWriter<float>>(
+              m_pdf_field_id, "pressure_tensor", unit_conversion,
+              pressure_tensor_correction_factor()));
     }
   }
 
diff --git a/testsuite/python/CMakeLists.txt b/testsuite/python/CMakeLists.txt
index ee90256b07..12b21f8914 100644
--- a/testsuite/python/CMakeLists.txt
+++ b/testsuite/python/CMakeLists.txt
@@ -388,9 +388,9 @@ python_test(FILE utils.py MAX_NUM_PROC 1)
 python_test(FILE npt_thermostat.py MAX_NUM_PROC 4)
 python_test(FILE box_geometry.py MAX_NUM_PROC 1)
 python_test(FILE lattice.py MAX_NUM_PROC 4)
-python_test(FILE lattice_vtk.py MAX_NUM_PROC 4)
+python_test(FILE lattice_vtk.py MAX_NUM_PROC 4 GPU_SLOTS 1)
 if(${ESPRESSO_TEST_NP} GREATER_EQUAL 6)
-  python_test(FILE lattice_vtk.py MAX_NUM_PROC 6 SUFFIX 6_cores)
+  python_test(FILE lattice_vtk.py MAX_NUM_PROC 6 SUFFIX 6_cores GPU_SLOTS 1)
 endif()
 python_test(FILE ek_interface.py MAX_NUM_PROC 2)
 python_test(FILE ek_diffusion.py MAX_NUM_PROC 1)
diff --git a/testsuite/python/lattice_vtk.py b/testsuite/python/lattice_vtk.py
index 9b4e1981a1..d4420d8cbf 100644
--- a/testsuite/python/lattice_vtk.py
+++ b/testsuite/python/lattice_vtk.py
@@ -36,7 +36,7 @@
 
 
 class TestVTK:
-    system = espressomd.System(box_l=[6, 7, 5])
+    system = espressomd.System(box_l=[6, 7, 3])
     system.time_step = 0.1
     system.cell_system.skin = 0.4
 
@@ -135,7 +135,7 @@ def test_vtk(self):
         actor.add_boundary_from_shape(
             espressomd.shapes.Wall(normal=[-1, 0, 0], dist=-(self.system.box_l[0] - dist)))
 
-        n_steps = 10
+        n_steps = 4 if self.lb_class is espressomd.lb.LBFluidWalberlaGPU else 10
         shape = tuple(actor.shape)
         shape = (shape[0] - 4, *shape[1:])
         vtk_reader = espressomd.io.vtk.VTKReader()
@@ -329,42 +329,62 @@ def test_vtk(self):
         self.assertEqual(len(actor.vtk_writers), 0)
 
 
-@utx.skipIfMissingFeatures("WALBERLA")
-class LBWalberlaWrite(TestLBVTK, ut.TestCase):
+@utx.skipIfMissingFeatures(["WALBERLA"])
+class LBWalberlaVTKDoublePrecisionCPU(TestLBVTK, ut.TestCase):
     vtk_class = espressomd.lb.VTKOutput
     lattice_class = espressomd.lb.LatticeWalberla
     lb_class = espressomd.lb.LBFluidWalberla
     lb_params = {"single_precision": False}
-    vtk_id = "lb_double_precision"
+    vtk_id = "lb_double_precision_cpu"
 
 
-@utx.skipIfMissingFeatures("WALBERLA")
-class LBWalberlaWriteSinglePrecision(TestLBVTK, ut.TestCase):
+@utx.skipIfMissingGPU()
+@utx.skipIfMissingFeatures(["WALBERLA", "CUDA"])
+class LBWalberlaVTKDoublePrecisionGPU(TestLBVTK, ut.TestCase):
+    vtk_class = espressomd.lb.VTKOutput
+    lattice_class = espressomd.lb.LatticeWalberla
+    lb_class = espressomd.lb.LBFluidWalberlaGPU
+    lb_params = {"single_precision": False}
+    vtk_id = "lb_double_precision_gpu"
+
+
+@utx.skipIfMissingFeatures(["WALBERLA"])
+class LBWalberlaVTKSinglePrecisionCPU(TestLBVTK, ut.TestCase):
     vtk_class = espressomd.lb.VTKOutput
     lattice_class = espressomd.lb.LatticeWalberla
     lb_class = espressomd.lb.LBFluidWalberla
     lb_params = {"single_precision": True}
-    vtk_id = "lb_single_precision"
+    vtk_id = "lb_single_precision_cpu"
+
+
+@utx.skipIfMissingGPU()
+@utx.skipIfMissingFeatures(["WALBERLA", "CUDA"])
+class LBWalberlaVTKSinglePrecisionGPU(TestLBVTK, ut.TestCase):
+    vtk_class = espressomd.lb.VTKOutput
+    lattice_class = espressomd.lb.LatticeWalberla
+    lb_class = espressomd.lb.LBFluidWalberlaGPU
+    lb_params = {"single_precision": True}
+    vtk_id = "lb_single_precision_gpu"
 
 
-@utx.skipIfMissingFeatures("WALBERLA")
-class EKWalberlaWrite(TestEKVTK, ut.TestCase):
+@utx.skipIfMissingFeatures(["WALBERLA"])
+class EKWalberlaVTKDoublePrecisionCPU(TestEKVTK, ut.TestCase):
     vtk_class = espressomd.electrokinetics.VTKOutput
     lattice_class = espressomd.electrokinetics.LatticeWalberla
     ek_class = espressomd.electrokinetics.EKSpecies
     ek_solver = espressomd.electrokinetics.EKNone
     ek_params = {"single_precision": False}
-    vtk_id = "ek_double_precision"
+    vtk_id = "ek_double_precision_cpu"
 
 
-@utx.skipIfMissingFeatures("WALBERLA")
-class EKWalberlaWriteSinglePrecision(TestEKVTK, ut.TestCase):
+@utx.skipIfMissingFeatures(["WALBERLA"])
+class EKWalberlaVTKSinglePrecisionCPU(TestEKVTK, ut.TestCase):
     vtk_class = espressomd.electrokinetics.VTKOutput
     lattice_class = espressomd.electrokinetics.LatticeWalberla
     ek_class = espressomd.electrokinetics.EKSpecies
     ek_solver = espressomd.electrokinetics.EKNone
     ek_params = {"single_precision": True}
-    vtk_id = "ek_single_precision"
+    vtk_id = "ek_single_precision_cpu"
 
 
 if __name__ == "__main__":
diff --git a/testsuite/python/save_checkpoint.py b/testsuite/python/save_checkpoint.py
index 29e8d387de..28c5950a3c 100644
--- a/testsuite/python/save_checkpoint.py
+++ b/testsuite/python/save_checkpoint.py
@@ -405,21 +405,20 @@
     vtk_suffix = config.test_name
     vtk_root = pathlib.Path("vtk_out")
     # create LB VTK callbacks
-    if 'LB.GPU' not in modes:  # TODO WALBERLA
-        lb_vtk_auto_id = f"auto_lb_{vtk_suffix}"
-        lb_vtk_manual_id = f"manual_lb_{vtk_suffix}"
-        config.recursive_unlink(vtk_root / lb_vtk_auto_id)
-        config.recursive_unlink(vtk_root / lb_vtk_manual_id)
-        lb_vtk_auto = espressomd.lb.VTKOutput(
-            identifier=lb_vtk_auto_id, delta_N=1,
-            observables=('density', 'velocity_vector'), base_folder=str(vtk_root))
-        lbf.add_vtk_writer(vtk=lb_vtk_auto)
-        lb_vtk_auto.disable()
-        lb_vtk_manual = espressomd.lb.VTKOutput(
-            identifier=lb_vtk_manual_id, delta_N=0,
-            observables=('density',), base_folder=str(vtk_root))
-        lbf.add_vtk_writer(vtk=lb_vtk_manual)
-        lb_vtk_manual.write()
+    lb_vtk_auto_id = f"auto_lb_{vtk_suffix}"
+    lb_vtk_manual_id = f"manual_lb_{vtk_suffix}"
+    config.recursive_unlink(vtk_root / lb_vtk_auto_id)
+    config.recursive_unlink(vtk_root / lb_vtk_manual_id)
+    lb_vtk_auto = espressomd.lb.VTKOutput(
+        identifier=lb_vtk_auto_id, delta_N=1,
+        observables=('density', 'velocity_vector'), base_folder=str(vtk_root))
+    lbf.add_vtk_writer(vtk=lb_vtk_auto)
+    lb_vtk_auto.disable()
+    lb_vtk_manual = espressomd.lb.VTKOutput(
+        identifier=lb_vtk_manual_id, delta_N=0,
+        observables=('density',), base_folder=str(vtk_root))
+    lbf.add_vtk_writer(vtk=lb_vtk_manual)
+    lb_vtk_manual.write()
     # create EK VTK callbacks
     ek_vtk_auto_id = f"auto_ek_{vtk_suffix}"
     ek_vtk_manual_id = f"manual_ek_{vtk_suffix}"
diff --git a/testsuite/python/test_checkpoint.py b/testsuite/python/test_checkpoint.py
index 5f4b5f63f0..ed31ebc8d6 100644
--- a/testsuite/python/test_checkpoint.py
+++ b/testsuite/python/test_checkpoint.py
@@ -269,7 +269,6 @@ def generator(value, shape):
             np.copy(ek_species[:, :, :].is_boundary), False)
 
     @utx.skipIfMissingFeatures(["WALBERLA"])
-    @ut.skipIf('LB.GPU' in modes, 'VTK not implemented for LB GPU')
     @ut.skipIf(not has_lb_mode, "Skipping test due to missing LB mode.")
     def test_lb_vtk(self):
         lbf = system.lb

From 0a6d1bdec7bde5bfee0e775535883a667fd4ae62 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jean-No=C3=ABl=20Grad?= <jgrad@icp.uni-stuttgart.de>
Date: Fri, 15 Nov 2024 18:53:37 +0100
Subject: [PATCH 3/3] Fix thermostats

Properly handle null values for kT and friction coefficients.
Make friction coefficients required parameters.
---
 .../electrokinetics/electrokinetics.ipynb     |  2 +-
 samples/lb_profile.py                         |  1 -
 src/core/lb/particle_coupling.cpp             | 27 ++++---
 src/core/thermostat.hpp                       |  3 +
 .../thermostat/thermostat.hpp                 | 70 +++++++++++++++----
 testsuite/python/lb.py                        |  5 +-
 testsuite/python/lb_interpolation.py          | 39 ++++++-----
 testsuite/python/lb_pressure_tensor.py        |  1 -
 testsuite/python/lb_thermostat.py             |  2 +
 9 files changed, 104 insertions(+), 46 deletions(-)

diff --git a/doc/tutorials/electrokinetics/electrokinetics.ipynb b/doc/tutorials/electrokinetics/electrokinetics.ipynb
index 32e5f952b8..3fe674686e 100644
--- a/doc/tutorials/electrokinetics/electrokinetics.ipynb
+++ b/doc/tutorials/electrokinetics/electrokinetics.ipynb
@@ -1042,7 +1042,7 @@
     "                lattice=lattice, density=DENSITY_FLUID, kinematic_viscosity=VISCOSITY_KINEMATIC,\n",
     "                tau=TAU, ext_force_density=EXT_FORCE_DENSITY, kT=KT, seed=42)\n",
     "system.lb = lbf\n",
-    "system.thermostat.set_lb(LB_fluid=lbf, seed=42)\n",
+    "system.thermostat.set_lb(LB_fluid=lbf, seed=42, gamma=0.)\n",
     "eksolver = espressomd.electrokinetics.EKNone(lattice=lattice)\n",
     "system.ekcontainer = espressomd.electrokinetics.EKContainer(tau=TAU, solver=eksolver)"
    ]
diff --git a/samples/lb_profile.py b/samples/lb_profile.py
index 763bd7f95a..6b47778f29 100644
--- a/samples/lb_profile.py
+++ b/samples/lb_profile.py
@@ -45,7 +45,6 @@
     agrid=1.0, density=1.0, kinematic_viscosity=1.0, tau=0.01,
     ext_force_density=[0, 0, 0.15], kT=0.0)
 system.lb = lb_fluid
-system.thermostat.set_lb(LB_fluid=lb_fluid, seed=23)
 ctp = espressomd.math.CylindricalTransformationParameters(
     center=[5.0, 5.0, 0.0],
     axis=[0, 0, 1],
diff --git a/src/core/lb/particle_coupling.cpp b/src/core/lb/particle_coupling.cpp
index 648384f1f6..fad6bb64b8 100644
--- a/src/core/lb/particle_coupling.cpp
+++ b/src/core/lb/particle_coupling.cpp
@@ -291,18 +291,23 @@ void ParticleCoupling::kernel(std::vector<Particle *> const &particles) {
 #endif
     Utils::Vector3d force_on_particle = {};
     if (coupling_mode == particle_force) {
-      auto &v_fluid = *it_interpolated_velocities;
-      if (m_box_geo.type() == BoxType::LEES_EDWARDS) {
-        // Account for the case where the interpolated velocity has been read
-        // from a ghost of the particle across the LE boundary (or vice verssa)
-        // Then the particle velocity is shifted by +,- the LE shear velocity
-        auto const vel_correction = lees_edwards_vel_shift(
-            *it_positions_velocity_coupling, p.pos(), m_box_geo);
-        v_fluid += vel_correction;
+#ifndef THERMOSTAT_PER_PARTICLE
+      if (m_thermostat.gamma > 0.)
+#endif
+      {
+        auto &v_fluid = *it_interpolated_velocities;
+        if (m_box_geo.type() == BoxType::LEES_EDWARDS) {
+          // Account for the case where the interpolated velocity has been read
+          // from a ghost of the particle across the LE boundary (or vice versa)
+          // Then the particle velocity is shifted by +,- the LE shear velocity
+          auto const vel_correction = lees_edwards_vel_shift(
+              *it_positions_velocity_coupling, p.pos(), m_box_geo);
+          v_fluid += vel_correction;
+        }
+        auto const drag_force = lb_drag_force(p, m_thermostat.gamma, v_fluid);
+        auto const random_force = get_noise_term(p);
+        force_on_particle = drag_force + random_force;
       }
-      auto const drag_force = lb_drag_force(p, m_thermostat.gamma, v_fluid);
-      auto const random_force = get_noise_term(p);
-      force_on_particle = drag_force + random_force;
       ++it_interpolated_velocities;
       ++it_positions_velocity_coupling;
     }
diff --git a/src/core/thermostat.hpp b/src/core/thermostat.hpp
index 121c6a4ebd..8d5781257c 100644
--- a/src/core/thermostat.hpp
+++ b/src/core/thermostat.hpp
@@ -97,6 +97,9 @@ inline bool are_kT_equal(double old_kT, double new_kT) {
   }
   auto const large_kT = (old_kT > new_kT) ? old_kT : new_kT;
   auto const small_kT = (old_kT > new_kT) ? new_kT : old_kT;
+  if (small_kT == 0.) {
+    return false;
+  }
   return (large_kT / small_kT - 1. < relative_tolerance);
 }
 } // namespace Thermostat
diff --git a/src/script_interface/thermostat/thermostat.hpp b/src/script_interface/thermostat/thermostat.hpp
index 57318778f1..bb63ad38a3 100644
--- a/src/script_interface/thermostat/thermostat.hpp
+++ b/src/script_interface/thermostat/thermostat.hpp
@@ -33,6 +33,7 @@
 #include <cassert>
 #include <limits>
 #include <memory>
+#include <span>
 #include <stdexcept>
 #include <string>
 
@@ -92,7 +93,7 @@ class Interface : public AutoParameters<Interface<CoreClass>, System::Leaf> {
   }
 
   virtual bool invalid_rng_state(VariantMap const &params) const {
-    return (not params.count("seed") or is_none(params.at("seed"))) and
+    return (not params.contains("seed") or is_none(params.at("seed"))) and
            is_seed_required();
   }
 
@@ -101,14 +102,15 @@ class Interface : public AutoParameters<Interface<CoreClass>, System::Leaf> {
   get_member_handle(::Thermostat::Thermostat &thermostat) = 0;
 
   void set_new_parameters(VariantMap const &params) {
-    if (params.count("__check_rng_state") and invalid_rng_state(params)) {
-      context()->parallel_try_catch([]() {
+    context()->parallel_try_catch([&]() {
+      if (params.contains("__check_rng_state") and invalid_rng_state(params)) {
         throw std::invalid_argument("Parameter 'seed' is needed on first "
                                     "activation of the thermostat");
-      });
-    }
+      }
+      check_required_parameters(params);
+    });
     for (auto const &key : get_parameter_insertion_order()) {
-      if (params.count(key)) {
+      if (params.contains(key)) {
         auto const &v = params.at(key);
         if (key == "is_active") {
           is_active = get_value<bool>(v);
@@ -119,6 +121,17 @@ class Interface : public AutoParameters<Interface<CoreClass>, System::Leaf> {
     }
   }
 
+  virtual std::span<std::string_view const> get_required_parameters() const = 0;
+
+  void check_required_parameters(VariantMap const &params) const {
+    for (auto const &required : get_required_parameters()) {
+      auto name = std::string(required);
+      if (not params.contains(name)) {
+        throw std::runtime_error("Parameter '" + name + "' is missing");
+      }
+    }
+  }
+
 protected:
   template <typename T>
   auto make_autoparameter(T CoreThermostat::*member, char const *name) {
@@ -169,7 +182,7 @@ class Interface : public AutoParameters<Interface<CoreClass>, System::Leaf> {
     auto params = parameters;
     if (not is_seed_required()) {
       for (auto key : {std::string("seed"), std::string("philox_counter")}) {
-        if (params.count(key) == 0ul) {
+        if (not params.contains(key)) {
           params[key] = get_parameter(key);
         }
       }
@@ -216,7 +229,7 @@ class Interface : public AutoParameters<Interface<CoreClass>, System::Leaf> {
   }
 
   virtual std::optional<double> extract_kT(VariantMap const &params) const {
-    if (params.count("kT")) {
+    if (params.contains("kT")) {
       auto const value = get_value<double>(params, "kT");
       sanity_checks_positive(value, "kT");
       return value;
@@ -301,6 +314,11 @@ class Langevin : public Interface<::LangevinThermostat> {
     return thermostat.langevin;
   }
 
+  std::span<std::string_view const> get_required_parameters() const override {
+    static constexpr std::array names{std::string_view("gamma")};
+    return names;
+  }
+
 public:
   Langevin() {
     add_parameters({
@@ -319,7 +337,7 @@ class Langevin : public Interface<::LangevinThermostat> {
         Interface<::LangevinThermostat>::extend_parameters(parameters);
 #ifdef ROTATION
     // If gamma_rotation is not set explicitly, use the translational one.
-    if (params.count("gamma_rotation") == 0ul and params.count("gamma")) {
+    if (not params.contains("gamma_rotation") and params.contains("gamma")) {
       params["gamma_rotation"] = params.at("gamma");
     }
 #endif // ROTATION
@@ -333,6 +351,11 @@ class Brownian : public Interface<::BrownianThermostat> {
     return thermostat.brownian;
   }
 
+  std::span<std::string_view const> get_required_parameters() const override {
+    static constexpr std::array names{std::string_view("gamma")};
+    return names;
+  }
+
 public:
   Brownian() {
     add_parameters({
@@ -351,7 +374,7 @@ class Brownian : public Interface<::BrownianThermostat> {
         Interface<::BrownianThermostat>::extend_parameters(parameters);
 #ifdef ROTATION
     // If gamma_rotation is not set explicitly, use the translational one.
-    if (params.count("gamma_rotation") == 0ul and params.count("gamma")) {
+    if (not params.contains("gamma_rotation") and params.contains("gamma")) {
       params["gamma_rotation"] = params.at("gamma");
     }
 #endif // ROTATION
@@ -366,6 +389,12 @@ class IsotropicNpt : public Interface<::IsotropicNptThermostat> {
     return thermostat.npt_iso;
   }
 
+  std::span<std::string_view const> get_required_parameters() const override {
+    static constexpr std::array names{std::string_view("gamma0"),
+                                      std::string_view("gammav")};
+    return names;
+  }
+
 public:
   IsotropicNpt() {
     add_parameters({
@@ -418,10 +447,15 @@ class LBThermostat : public Interface<::LBThermostat> {
 
 protected:
   bool invalid_rng_state(VariantMap const &params) const override {
-    return (not params.count("seed") or is_none(params.at("seed"))) and
-           params.count("__global_kT") and is_seed_required() and
+    return (not params.contains("seed") or is_none(params.at("seed"))) and
+           params.contains("__global_kT") and is_seed_required() and
            get_value<double>(params, "__global_kT") != 0.;
   }
+
+  std::span<std::string_view const> get_required_parameters() const override {
+    static constexpr std::array names{std::string_view("gamma")};
+    return names;
+  }
 };
 #endif // WALBERLA
 
@@ -432,6 +466,10 @@ class DPDThermostat : public Interface<::DPDThermostat> {
     return thermostat.dpd;
   }
 
+  std::span<std::string_view const> get_required_parameters() const override {
+    return {};
+  }
+
 public:
   ::ThermostatFlags get_thermo_flag() const final { return THERMO_DPD; }
 };
@@ -444,6 +482,10 @@ class Stokesian : public Interface<::StokesianThermostat> {
     return thermostat.stokesian;
   }
 
+  std::span<std::string_view const> get_required_parameters() const override {
+    return {};
+  }
+
 public:
   ::ThermostatFlags get_thermo_flag() const final { return THERMO_SD; }
 };
@@ -455,6 +497,10 @@ class ThermalizedBond : public Interface<::ThermalizedBondThermostat> {
     return thermostat.thermalized_bond;
   }
 
+  std::span<std::string_view const> get_required_parameters() const override {
+    return {};
+  }
+
 public:
   ::ThermostatFlags get_thermo_flag() const final { return THERMO_BOND; }
 };
diff --git a/testsuite/python/lb.py b/testsuite/python/lb.py
index 4e585d5f08..a16ff6c5a3 100644
--- a/testsuite/python/lb.py
+++ b/testsuite/python/lb.py
@@ -54,7 +54,6 @@ class LBTest:
     system.cell_system.skin = 1.0
     if espressomd.gpu_available():
         system.cuda_init_handle.call_method("set_device_id_per_rank")
-    interpolation = False
     n_nodes = system.cell_system.get_state()["n_nodes"]
 
     def setUp(self):
@@ -75,7 +74,6 @@ def test_properties(self):
         # activated actor
         lbf = self.lb_class(kT=1.0, seed=42, **self.params, **self.lb_params)
         self.system.lb = lbf
-        self.system.thermostat.set_lb(LB_fluid=lbf, seed=1)
         self.assertTrue(lbf.is_active)
         self.check_properties(lbf)
         self.system.lb = None
@@ -83,7 +81,6 @@ def test_properties(self):
         # deactivated actor
         lbf = self.lb_class(kT=1.0, seed=42, **self.params, **self.lb_params)
         self.system.lb = lbf
-        self.system.thermostat.set_lb(LB_fluid=lbf, seed=1)
         self.system.lb = None
         self.assertFalse(lbf.is_active)
         self.check_properties(lbf)
@@ -383,7 +380,7 @@ def test_pressure_tensor_observable(self):
         lbf = self.lb_class(kT=1., seed=1, ext_force_density=[0, 0, 0],
                             **self.params, **self.lb_params)
         system.lb = lbf
-        system.thermostat.set_lb(LB_fluid=lbf, seed=1)
+        system.thermostat.set_lb(LB_fluid=lbf, seed=1, gamma=1.)
         system.integrator.run(10)
         pressure_tensor = np.copy(
             np.mean(lbf[:, :, :].pressure_tensor, axis=(0, 1, 2)))
diff --git a/testsuite/python/lb_interpolation.py b/testsuite/python/lb_interpolation.py
index 96e93523b5..3a2bd9a491 100644
--- a/testsuite/python/lb_interpolation.py
+++ b/testsuite/python/lb_interpolation.py
@@ -128,26 +128,33 @@ def test_mach_limit_check(self):
         self.assertIsNone(self.lbf[0, 0, 0].boundary)
 
     def test_interpolated_force(self):
+        def sample(kernel):
+            for i, j, k in itertools.product(range(3), range(3), range(3)):
+                system.lb[:, :, :].velocity = lb_vel
+                p.pos = (
+                    (i + 0.5) * AGRID,
+                    (j + 0.5) * AGRID,
+                    (k + 0.5) * AGRID)
+                p.v = [0., 0., 0.]
+                system.integrator.run(1)
+                np.testing.assert_allclose(np.copy(p.f), kernel(i, j, k))
         system = self.system
-        system.thermostat.set_lb(LB_fluid=system.lb, seed=42, gamma=1.)
         system.integrator.run(1)
         lb_vel = np.zeros([12, 12, 12, 3], dtype=float)
-        for i in range(12):
-            for j in range(12):
-                for k in range(12):
-                    lb_vel[i, j, k] = 1e-3 * np.array([i + 1, j + 1, k + 1])
+        for i, j, k in itertools.product(range(12), range(12), range(12)):
+            lb_vel[i, j, k] = 1e-3 * np.array([i + 1, j + 1, k + 1])
         p = system.part.add(pos=3 * [1.5])
-        for i in range(3):
-            for j in range(3):
-                for k in range(3):
-                    system.lb[:, :, :].velocity = lb_vel
-                    p.pos = (
-                        (i + 0.5) * AGRID,
-                        (j + 0.5) * AGRID,
-                        (k + 0.5) * AGRID)
-                    p.v = [0., 0., 0.]
-                    system.integrator.run(1)
-                    np.testing.assert_allclose(np.copy(p.f), lb_vel[i, j, k])
+        # enable particle coupling
+        system.thermostat.set_lb(LB_fluid=system.lb, seed=42, gamma=1.)
+        sample(kernel=lambda i, j, k: lb_vel[i, j, k])
+        # disable particle coupling
+        system.thermostat.set_lb(LB_fluid=system.lb, seed=42, gamma=0.)
+        sample(kernel=lambda i, j, k: [0., 0., 0.])
+        if espressomd.has_features("THERMOSTAT_PER_PARTICLE"):
+            # disable particle coupling globally & enable per-particle coupling
+            system.thermostat.set_lb(LB_fluid=system.lb, seed=42, gamma=0.)
+            p.gamma = 1.5
+            sample(kernel=lambda i, j, k: 1.5 * lb_vel[i, j, k])
 
 
 @utx.skipIfMissingFeatures(["WALBERLA"])
diff --git a/testsuite/python/lb_pressure_tensor.py b/testsuite/python/lb_pressure_tensor.py
index d4fb48b231..0c0b4df3a6 100644
--- a/testsuite/python/lb_pressure_tensor.py
+++ b/testsuite/python/lb_pressure_tensor.py
@@ -46,7 +46,6 @@ class TestLBPressureTensor:
     @classmethod
     def tearDownClass(cls):
         cls.system.lb = None
-        cls.system.thermostat.turn_off()
 
     @classmethod
     def setUpClass(cls):
diff --git a/testsuite/python/lb_thermostat.py b/testsuite/python/lb_thermostat.py
index 77677bb96e..53de0ea8ca 100644
--- a/testsuite/python/lb_thermostat.py
+++ b/testsuite/python/lb_thermostat.py
@@ -208,6 +208,8 @@ def test_exceptions(self):
         with self.assertRaisesRegex(RuntimeError, r"set_lb\(\) got an unexpected keyword argument 'act_on_virtual'"):
             self.system.thermostat.set_lb(
                 LB_fluid=self.lbf, act_on_virtual=False)
+        with self.assertRaisesRegex(RuntimeError, "Parameter 'gamma' is missing"):
+            self.system.thermostat.set_lb(LB_fluid=self.lbf)
         self.system.part.add(pos=[0., 0., 0.], gamma=[1., 2., 3.], id=2)
         with self.assertRaisesRegex(Exception, r"ERROR: anisotropic particle \(id 2\) coupled to LB"):
             self.system.integrator.run(1)