From d55f91c69d77a3f3f3f759c6939552ab3417d7ab Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jean-No=C3=ABl=20Grad?= <jgrad@icp.uni-stuttgart.de>
Date: Fri, 25 Jun 2021 15:00:03 +0200
Subject: [PATCH] script_interface: Disable checkpointing of object containers

When an ObjectList container is deserialized, the contained
objects are deserialized on the head node only. Until this
can be fixed, serialization of non-empty object containers
in an MPI-parallel environment with world size > 1 is blocked.
---
 doc/sphinx/io.rst                             |  5 ++
 src/script_interface/CMakeLists.txt           |  6 +-
 src/script_interface/ObjectList.hpp           |  3 +
 .../object_container_mpi_guard.cpp            | 39 +++++++++++++
 .../object_container_mpi_guard.hpp            | 42 ++++++++++++++
 .../tests/ObjectList_test.cpp                 | 13 +++++
 testsuite/python/CMakeLists.txt               |  6 ++
 testsuite/python/save_checkpoint.py           | 57 +++++++++++--------
 testsuite/python/test_checkpoint.py           | 41 +++++++------
 9 files changed, 168 insertions(+), 44 deletions(-)
 create mode 100644 src/script_interface/object_container_mpi_guard.cpp
 create mode 100644 src/script_interface/object_container_mpi_guard.hpp

diff --git a/doc/sphinx/io.rst b/doc/sphinx/io.rst
index dccb32fd0d2..bea55ddfbb8 100644
--- a/doc/sphinx/io.rst
+++ b/doc/sphinx/io.rst
@@ -100,6 +100,11 @@ Be aware of the following limitations:
 
   * Pickling support of the :class:`espressomd.system.System` instance and contained objects such as bonded and non-bonded interactions and electrostatics methods. However, there are many more combinations of active interactions and algorithms than can be tested.
 
+  * Checkpointing only supports recursion on the head node. It is therefore
+    impossible to checkpoint a :class:`espressomd.system.System` instance that
+    contains LB boundaries, constraints or auto-update accumulators, when the
+    simulation is running with 2 or more MPI nodes.
+
   * The active actors, i.e., the content of ``system.actors``, are checkpointed. For lattice-Boltzmann fluids, this only includes the parameters such as the lattice constant (``agrid``). The actual flow field has to be saved separately with the lattice-Boltzmann specific methods
     :meth:`espressomd.lb.HydrodynamicInteraction.save_checkpoint`
     and loaded via :meth:`espressomd.lb.HydrodynamicInteraction.load_checkpoint` after restoring the checkpoint.
diff --git a/src/script_interface/CMakeLists.txt b/src/script_interface/CMakeLists.txt
index 01e1d345744..a029499dbac 100644
--- a/src/script_interface/CMakeLists.txt
+++ b/src/script_interface/CMakeLists.txt
@@ -1,5 +1,7 @@
-add_library(ScriptInterface SHARED initialize.cpp ObjectHandle.cpp
-                                   GlobalContext.cpp ContextManager.cpp)
+add_library(
+  ScriptInterface SHARED
+  initialize.cpp ObjectHandle.cpp object_container_mpi_guard.cpp
+  GlobalContext.cpp ContextManager.cpp)
 
 add_subdirectory(accumulators)
 add_subdirectory(collision_detection)
diff --git a/src/script_interface/ObjectList.hpp b/src/script_interface/ObjectList.hpp
index 0dd8aab6122..19b57e892fe 100644
--- a/src/script_interface/ObjectList.hpp
+++ b/src/script_interface/ObjectList.hpp
@@ -24,6 +24,7 @@
 
 #include "script_interface/ScriptInterface.hpp"
 #include "script_interface/get_value.hpp"
+#include "script_interface/object_container_mpi_guard.hpp"
 
 #include <utils/serialization/pack.hpp>
 
@@ -132,6 +133,8 @@ class ObjectList : public BaseType {
 
 private:
   std::string get_internal_state() const override {
+    object_container_mpi_guard(BaseType::name(), m_elements.size());
+
     std::vector<std::string> object_states(m_elements.size());
 
     boost::transform(m_elements, object_states.begin(),
diff --git a/src/script_interface/object_container_mpi_guard.cpp b/src/script_interface/object_container_mpi_guard.cpp
new file mode 100644
index 00000000000..8d5f973871c
--- /dev/null
+++ b/src/script_interface/object_container_mpi_guard.cpp
@@ -0,0 +1,39 @@
+/*
+ * Copyright (C) 2021 The ESPResSo project
+ *
+ * This file is part of ESPResSo.
+ *
+ * ESPResSo is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * ESPResSo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "script_interface/object_container_mpi_guard.hpp"
+
+#include "core/communication.hpp"
+
+#include <boost/utility/string_ref.hpp>
+
+#include <cstddef>
+#include <sstream>
+#include <stdexcept>
+
+void object_container_mpi_guard(boost::string_ref const &name,
+                                std::size_t n_elements) {
+  if (comm_cart.size() <= 1 or n_elements == 0)
+    return; // nothing to guard: world size of 1 or empty container
+  std::ostringstream message;
+  message << "Non-empty object containers do not support checkpointing in "
+          << "MPI environments. Container " << name << " contains "
+          << n_elements << " elements.";
+  throw std::runtime_error(message.str());
+}
diff --git a/src/script_interface/object_container_mpi_guard.hpp b/src/script_interface/object_container_mpi_guard.hpp
new file mode 100644
index 00000000000..98d15a05042
--- /dev/null
+++ b/src/script_interface/object_container_mpi_guard.hpp
@@ -0,0 +1,42 @@
+/*
+ * Copyright (C) 2021 The ESPResSo project
+ *
+ * This file is part of ESPResSo.
+ *
+ * ESPResSo is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * ESPResSo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <boost/utility/string_ref.hpp>
+
+#include <cstddef>
+
+/**
+ * @brief Block serialization of non-empty object containers under MPI.
+ *
+ * The @ref ScriptInterface::ObjectHandle framework doesn't support
+ * recursive deserialization. When an object container such as
+ * @ref ScriptInterface::ObjectList is deserialized, the contained
+ * objects are deserialized on the head node only, which leads to
+ * silent bugs in simulations.
+ *
+ * This function needs to be called from an object container's
+ * <tt>get_internal_state()</tt> method. It throws a
+ * <tt>std::runtime_error</tt> when the container is not empty
+ * and the MPI world size is greater than 1.
+ *
+ * @param name        Name of the object container
+ * @param n_elements  Number of elements in the container
+ */
+void object_container_mpi_guard(boost::string_ref const &name,
+                                std::size_t n_elements);
diff --git a/src/script_interface/tests/ObjectList_test.cpp b/src/script_interface/tests/ObjectList_test.cpp
index bd58c0e8bfc..f2e2fd92d21 100644
--- a/src/script_interface/tests/ObjectList_test.cpp
+++ b/src/script_interface/tests/ObjectList_test.cpp
@@ -17,7 +17,9 @@
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
 
+#define BOOST_TEST_NO_MAIN
 #define BOOST_TEST_MODULE ObjectList test
+#define BOOST_TEST_ALTERNATIVE_INIT_API
 #define BOOST_TEST_DYN_LINK
 #include <boost/test/unit_test.hpp>
 
@@ -26,6 +28,10 @@
 #include "script_interface/LocalContext.hpp"
 #include "script_interface/ObjectList.hpp"
 
+#include "core/communication.hpp"
+
+#include <boost/mpi.hpp>
+
 #include <algorithm>
 #include <memory>
 #include <vector>
@@ -108,3 +114,10 @@ BOOST_AUTO_TEST_CASE(serialization) {
   BOOST_CHECK(list2->mock_core.front()->name() == "ObjectHandle");
   BOOST_CHECK(list2->mock_core.back()->name() == "ObjectHandle");
 }
+
+int main(int argc, char **argv) {
+  auto mpi_env = std::make_shared<boost::mpi::environment>(argc, argv);
+  Communication::init(mpi_env);
+  // BOOST_TEST_NO_MAIN is set, so the Boost.Test runner is invoked manually
+  return boost::unit_test::unit_test_main(init_unit_test, argc, argv);
+}
diff --git a/testsuite/python/CMakeLists.txt b/testsuite/python/CMakeLists.txt
index a20c73ee4bb..72e4baa2b1d 100644
--- a/testsuite/python/CMakeLists.txt
+++ b/testsuite/python/CMakeLists.txt
@@ -78,6 +78,12 @@ foreach(
       save_checkpoint_${TEST_COMBINATION}_${TEST_BINARY})
   endforeach(TEST_BINARY)
 endforeach(TEST_COMBINATION)
+python_test(FILE save_checkpoint.py MAX_NUM_PROC 1 SUFFIX
+            lb.cpu-p3m.cpu-lj-therm.lb.1-core)
+python_test(
+  FILE test_checkpoint.py MAX_NUM_PROC 1 SUFFIX
+  lb.cpu-p3m.cpu-lj-therm.lb.1-core DEPENDS
+  save_checkpoint_lb.cpu-p3m.cpu-lj-therm.lb.1-core)
 python_test(FILE cellsystem.py MAX_NUM_PROC 4)
 python_test(FILE tune_skin.py MAX_NUM_PROC 1)
 python_test(FILE constraint_homogeneous_magnetic_field.py MAX_NUM_PROC 4)
diff --git a/testsuite/python/save_checkpoint.py b/testsuite/python/save_checkpoint.py
index a91ebe6431d..4e99856757c 100644
--- a/testsuite/python/save_checkpoint.py
+++ b/testsuite/python/save_checkpoint.py
@@ -59,6 +59,8 @@
         if filepath.endswith((".checkpoint", ".cpt")):
             os.remove(os.path.join(checkpoint.checkpoint_dir, filepath))
 
+n_nodes = system.cell_system.get_state()["n_nodes"]
+
 LB_implementation = None
 if 'LB.CPU' in modes:
     LB_implementation = espressomd.lb.LBFluid
@@ -70,7 +72,8 @@
     system.actors.add(lbf)
     if 'THERM.LB' in modes:
         system.thermostat.set_lb(LB_fluid=lbf, seed=23, gamma=2.0)
-    if any(has_features(i) for i in ["LB_BOUNDARIES", "LB_BOUNDARIES_GPU"]):
+    if any(has_features(i)
+           for i in ["LB_BOUNDARIES", "LB_BOUNDARIES_GPU"]) and n_nodes == 1:
         if 'EK.GPU' not in modes:
             system.lbboundaries.add(
                 LBBoundary(shape=Wall(normal=(1, 0, 0), dist=0.5), velocity=(1e-4, 1e-4, 0)))
@@ -136,6 +139,7 @@
             delta_mid_bot=0.1)
         system.actors.add(elc)
 
+# accumulators
 obs = espressomd.observables.ParticlePositions(ids=[0, 1])
 acc_mean_variance = espressomd.accumulators.MeanVarianceCalculator(obs=obs)
 acc_time_series = espressomd.accumulators.TimeSeries(obs=obs)
@@ -150,32 +154,35 @@
 acc_time_series.update()
 acc_correlator.update()
 
-system.auto_update_accumulators.add(acc_mean_variance)
-system.auto_update_accumulators.add(acc_time_series)
-system.auto_update_accumulators.add(acc_correlator)
+if n_nodes == 1:
+    system.auto_update_accumulators.add(acc_mean_variance)
+    system.auto_update_accumulators.add(acc_time_series)
+    system.auto_update_accumulators.add(acc_correlator)
 
 # constraints
-system.constraints.add(shape=Sphere(center=system.box_l / 2, radius=0.1),
-                       particle_type=17)
-system.constraints.add(shape=Wall(normal=[1. / np.sqrt(3)] * 3, dist=0.5))
-system.constraints.add(constraints.Gravity(g=[1., 2., 3.]))
-system.constraints.add(constraints.HomogeneousMagneticField(H=[1., 2., 3.]))
-system.constraints.add(
-    constraints.HomogeneousFlowField(u=[1., 2., 3.], gamma=2.3))
-pot_field_data = constraints.ElectricPotential.field_from_fn(
-    system.box_l, np.ones(3), lambda x: np.linalg.norm(10 * np.ones(3) - x))
-checkpoint.register("pot_field_data")
-system.constraints.add(constraints.PotentialField(
-    field=pot_field_data, grid_spacing=np.ones(3), default_scale=1.6,
-    particle_scales={5: 6.0}))
-vec_field_data = constraints.ForceField.field_from_fn(
-    system.box_l, np.ones(3), lambda x: 10 * np.ones(3) - x)
-checkpoint.register("vec_field_data")
-system.constraints.add(constraints.ForceField(
-    field=vec_field_data, grid_spacing=np.ones(3), default_scale=1.4))
-if espressomd.has_features("ELECTROSTATICS"):
-    system.constraints.add(constraints.ElectricPlaneWave(
-        E0=[1., -2., 3.], k=[-.1, .2, .3], omega=5., phi=1.4))
+if n_nodes == 1:
+    system.constraints.add(shape=Sphere(center=system.box_l / 2, radius=0.1),
+                           particle_type=17)
+    system.constraints.add(shape=Wall(normal=[1. / np.sqrt(3)] * 3, dist=0.5))
+    system.constraints.add(constraints.Gravity(g=[1., 2., 3.]))
+    system.constraints.add(
+        constraints.HomogeneousMagneticField(H=[1., 2., 3.]))
+    system.constraints.add(
+        constraints.HomogeneousFlowField(u=[1., 2., 3.], gamma=2.3))
+    pot_field_data = constraints.ElectricPotential.field_from_fn(
+        system.box_l, np.ones(3), lambda x: np.linalg.norm(10 * np.ones(3) - x))
+    checkpoint.register("pot_field_data")
+    system.constraints.add(constraints.PotentialField(
+        field=pot_field_data, grid_spacing=np.ones(3), default_scale=1.6,
+        particle_scales={5: 6.0}))
+    vec_field_data = constraints.ForceField.field_from_fn(
+        system.box_l, np.ones(3), lambda x: 10 * np.ones(3) - x)
+    checkpoint.register("vec_field_data")
+    system.constraints.add(constraints.ForceField(
+        field=vec_field_data, grid_spacing=np.ones(3), default_scale=1.4))
+    if espressomd.has_features("ELECTROSTATICS"):
+        system.constraints.add(constraints.ElectricPlaneWave(
+            E0=[1., -2., 3.], k=[-.1, .2, .3], omega=5., phi=1.4))
 
 if 'LB.OFF' in modes:
     # set thermostat
diff --git a/testsuite/python/test_checkpoint.py b/testsuite/python/test_checkpoint.py
index ecd6a82375f..00005be7b4f 100644
--- a/testsuite/python/test_checkpoint.py
+++ b/testsuite/python/test_checkpoint.py
@@ -40,13 +40,15 @@
 
 class CheckpointTest(ut.TestCase):
 
+    checkpoint = espressomd.checkpointing.Checkpoint(
+        checkpoint_id="mycheckpoint_@TEST_COMBINATION@_@TEST_BINARY@".replace(
+            '.', '__'),
+        checkpoint_path="@CMAKE_CURRENT_BINARY_DIR@")
+    checkpoint.load(0)
+    n_nodes = system.cell_system.get_state()["n_nodes"]
+
     @classmethod
     def setUpClass(cls):
-        cls.checkpoint = espressomd.checkpointing.Checkpoint(
-            checkpoint_id="mycheckpoint_@TEST_COMBINATION@_@TEST_BINARY@".replace(
-                '.', '__'),
-            checkpoint_path="@CMAKE_CURRENT_BINARY_DIR@")
-        cls.checkpoint.load(0)
         cls.ref_box_l = np.array([12.0, 14.0, 16.0])
         if 'DP3M' in modes:
             cls.ref_box_l = np.array([16.0, 16.0, 16.0])
@@ -160,6 +162,7 @@ def test_part(self):
         np.testing.assert_allclose(np.copy(p1.f), particle_force0)
         np.testing.assert_allclose(np.copy(p2.f), particle_force1)
 
+    @ut.skipIf(n_nodes > 1, "only runs for 1 MPI rank")
     def test_object_containers_serialization(self):
         '''
         Check that particles at the interface between two MPI nodes still
@@ -225,9 +228,8 @@ def test_thermostat_DPD(self):
         self.assertEqual(thmst['type'], 'DPD')
         self.assertEqual(thmst['kT'], 1.0)
         self.assertEqual(thmst['seed'], 42)
-        n_nodes = system.cell_system.get_state()["n_nodes"]
-        if n_nodes in {1, 2, 3, 4, 8}:
-            ref_counter = (n_nodes == 3) and 6 or 8
+        if self.n_nodes in {1, 2, 3, 4, 8}:
+            ref_counter = (self.n_nodes == 1) and 10 or 0
             self.assertEqual(thmst['counter'], ref_counter)
 
     @utx.skipIfMissingFeatures('NPT')
@@ -358,24 +360,27 @@ def test_mean_variance_calculator(self):
         np.testing.assert_array_equal(
             acc_mean_variance.variance(),
             np.array([[0., 0.5, 2.], [0., 0., 0.]]))
-        np.testing.assert_array_equal(
-            system.auto_update_accumulators[0].variance(),
-            np.array([[0., 0.5, 2.], [0., 0., 0.]]))
+        if self.n_nodes == 1:
+            np.testing.assert_array_equal(
+                system.auto_update_accumulators[0].variance(),
+                np.array([[0., 0.5, 2.], [0., 0., 0.]]))
 
     def test_time_series(self):
         expected = [[[1, 1, 1], [1, 1, 2]], [[1, 2, 3], [1, 1, 2]]]
         np.testing.assert_array_equal(acc_time_series.time_series(), expected)
-        np.testing.assert_array_equal(
-            system.auto_update_accumulators[1].time_series(),
-            expected)
+        if self.n_nodes == 1:
+            np.testing.assert_array_equal(
+                system.auto_update_accumulators[1].time_series(),
+                expected)
 
     def test_correlator(self):
         expected = np.zeros((36, 2, 3))
         expected[0:2] = [[[1, 2.5, 5], [1, 1, 4]], [[1, 2, 3], [1, 1, 4]]]
         np.testing.assert_array_equal(acc_correlator.result(), expected)
-        np.testing.assert_array_equal(
-            system.auto_update_accumulators[2].result(),
-            expected)
+        if self.n_nodes == 1:
+            np.testing.assert_array_equal(
+                system.auto_update_accumulators[2].result(),
+                expected)
 
     @utx.skipIfMissingFeatures('DP3M')
     @ut.skipIf('DP3M.CPU' not in modes,
@@ -482,6 +487,7 @@ def test_exclusions(self):
 
     @ut.skipIf(not LB or EK or not (espressomd.has_features("LB_BOUNDARIES")
                                     or espressomd.has_features("LB_BOUNDARIES_GPU")), "Missing features")
+    @ut.skipIf(n_nodes > 1, "only runs for 1 MPI rank")
     def test_lb_boundaries(self):
         # check boundaries agree on all MPI nodes
         self.assertEqual(len(system.lbboundaries), 2)
@@ -503,6 +509,7 @@ def test_lb_boundaries(self):
         np.testing.assert_equal(
             system.actors[0][:, :, :].boundary.astype(int), 0)
 
+    @ut.skipIf(n_nodes > 1, "only runs for 1 MPI rank")
     def test_constraints(self):
         from espressomd import constraints
         self.assertEqual(len(system.constraints),