From d55f91c69d77a3f3f3f759c6939552ab3417d7ab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jean-No=C3=ABl=20Grad?= Date: Fri, 25 Jun 2021 15:00:03 +0200 Subject: [PATCH] script_interface: Disable checkpointing of object containers When an ObjectList container is deserialized, the contained objects are deserialized on the head node only. Until this can be fixed, serialization of non-empty object containers in a MPI-parallel environment with world size > 1 is blocked. --- doc/sphinx/io.rst | 5 ++ src/script_interface/CMakeLists.txt | 6 +- src/script_interface/ObjectList.hpp | 3 + .../object_container_mpi_guard.cpp | 39 +++++++++++++ .../object_container_mpi_guard.hpp | 42 ++++++++++++++ .../tests/ObjectList_test.cpp | 13 +++++ testsuite/python/CMakeLists.txt | 6 ++ testsuite/python/save_checkpoint.py | 57 +++++++++++-------- testsuite/python/test_checkpoint.py | 41 +++++++------ 9 files changed, 168 insertions(+), 44 deletions(-) create mode 100644 src/script_interface/object_container_mpi_guard.cpp create mode 100644 src/script_interface/object_container_mpi_guard.hpp diff --git a/doc/sphinx/io.rst b/doc/sphinx/io.rst index dccb32fd0d2..bea55ddfbb8 100644 --- a/doc/sphinx/io.rst +++ b/doc/sphinx/io.rst @@ -100,6 +100,11 @@ Be aware of the following limitations: * Pickling support of the :class:`espressomd.system.System` instance and contained objects such as bonded and non-bonded interactions and electrostatics methods. However, there are many more combinations of active interactions and algorithms than can be tested. + * Checkpointing only supports recursion on the head node. It is therefore + impossible to checkpoint a :class:`espressomd.system.System` instance that + contains LB boundaries, constraints or auto-update accumulators, when the + simulation is running with 2 or more MPI nodes. + * The active actors, i.e., the content of ``system.actors``, are checkpointed. 
For lattice-Boltzmann fluids, this only includes the parameters such as the lattice constant (``agrid``). The actual flow field has to be saved separately with the lattice-Boltzmann specific methods :meth:`espressomd.lb.HydrodynamicInteraction.save_checkpoint` and loaded via :meth:`espressomd.lb.HydrodynamicInteraction.load_checkpoint` after restoring the checkpoint. diff --git a/src/script_interface/CMakeLists.txt b/src/script_interface/CMakeLists.txt index 01e1d345744..a029499dbac 100644 --- a/src/script_interface/CMakeLists.txt +++ b/src/script_interface/CMakeLists.txt @@ -1,5 +1,7 @@ -add_library(ScriptInterface SHARED initialize.cpp ObjectHandle.cpp - GlobalContext.cpp ContextManager.cpp) +add_library( + ScriptInterface SHARED + initialize.cpp ObjectHandle.cpp object_container_mpi_guard.cpp + GlobalContext.cpp ContextManager.cpp) add_subdirectory(accumulators) add_subdirectory(collision_detection) diff --git a/src/script_interface/ObjectList.hpp b/src/script_interface/ObjectList.hpp index 0dd8aab6122..19b57e892fe 100644 --- a/src/script_interface/ObjectList.hpp +++ b/src/script_interface/ObjectList.hpp @@ -24,6 +24,7 @@ #include "script_interface/ScriptInterface.hpp" #include "script_interface/get_value.hpp" +#include "script_interface/object_container_mpi_guard.hpp" #include @@ -132,6 +133,8 @@ class ObjectList : public BaseType { private: std::string get_internal_state() const override { + object_container_mpi_guard(BaseType::name(), m_elements.size()); + std::vector object_states(m_elements.size()); boost::transform(m_elements, object_states.begin(), diff --git a/src/script_interface/object_container_mpi_guard.cpp b/src/script_interface/object_container_mpi_guard.cpp new file mode 100644 index 00000000000..8d5f973871c --- /dev/null +++ b/src/script_interface/object_container_mpi_guard.cpp @@ -0,0 +1,39 @@ +/* + * Copyright (C) 2021 The ESPResSo project + * + * This file is part of ESPResSo. 
+ * + * ESPResSo is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * ESPResSo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#include "script_interface/object_container_mpi_guard.hpp" + +#include "core/communication.hpp" + +#include + +#include +#include +#include + +void object_container_mpi_guard(boost::string_ref const &name, + std::size_t n_elements) { + if (comm_cart.size() > 1 and n_elements) { + std::stringstream error_msg; + error_msg << "Non-empty object containers do not support checkpointing in " + << "MPI environments. Container " << name << " contains " + << n_elements << " elements."; + throw std::runtime_error(error_msg.str()); + } +} diff --git a/src/script_interface/object_container_mpi_guard.hpp b/src/script_interface/object_container_mpi_guard.hpp new file mode 100644 index 00000000000..98d15a05042 --- /dev/null +++ b/src/script_interface/object_container_mpi_guard.hpp @@ -0,0 +1,42 @@ +/* + * Copyright (C) 2021 The ESPResSo project + * + * This file is part of ESPResSo. + * + * ESPResSo is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * ESPResSo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#include + +#include + +/** + * @brief Prevent object container serialization. + * + * The @ref ScriptInterface::ObjectHandle framework doesn't support + * recursive deserialization. When an object container such as + * @ref ScriptInterface::ObjectList is deserialized, the contained + * objects are deserialized on the head node only, which leads to + * silent bugs in simulations. + * + * This function needs to be called from an object container + * get_internal_state() method to throw a runtime error + * when the container is not empty and the MPI world size is + * greater than 1. + * + * @param name Name of the object container + * @param n_elements Number of elements in the container + */ +void object_container_mpi_guard(boost::string_ref const &name, + std::size_t n_elements); diff --git a/src/script_interface/tests/ObjectList_test.cpp b/src/script_interface/tests/ObjectList_test.cpp index bd58c0e8bfc..f2e2fd92d21 100644 --- a/src/script_interface/tests/ObjectList_test.cpp +++ b/src/script_interface/tests/ObjectList_test.cpp @@ -17,7 +17,9 @@ * along with this program. If not, see <http://www.gnu.org/licenses/>.
*/ +#define BOOST_TEST_NO_MAIN #define BOOST_TEST_MODULE ObjectList test +#define BOOST_TEST_ALTERNATIVE_INIT_API #define BOOST_TEST_DYN_LINK #include @@ -26,6 +28,10 @@ #include "script_interface/LocalContext.hpp" #include "script_interface/ObjectList.hpp" +#include "core/communication.hpp" + +#include + #include #include #include @@ -108,3 +114,10 @@ BOOST_AUTO_TEST_CASE(serialization) { BOOST_CHECK(list2->mock_core.front()->name() == "ObjectHandle"); BOOST_CHECK(list2->mock_core.back()->name() == "ObjectHandle"); } + +int main(int argc, char **argv) { + auto mpi_env = std::make_shared(argc, argv); + Communication::init(mpi_env); + + return boost::unit_test::unit_test_main(init_unit_test, argc, argv); +} diff --git a/testsuite/python/CMakeLists.txt b/testsuite/python/CMakeLists.txt index a20c73ee4bb..72e4baa2b1d 100644 --- a/testsuite/python/CMakeLists.txt +++ b/testsuite/python/CMakeLists.txt @@ -78,6 +78,12 @@ foreach( save_checkpoint_${TEST_COMBINATION}_${TEST_BINARY}) endforeach(TEST_BINARY) endforeach(TEST_COMBINATION) +python_test(FILE save_checkpoint.py MAX_NUM_PROC 1 SUFFIX + lb.cpu-p3m.cpu-lj-therm.lb.1-core) +python_test( + FILE test_checkpoint.py MAX_NUM_PROC 1 SUFFIX + lb.cpu-p3m.cpu-lj-therm.lb.1-core DEPENDS + save_checkpoint_lb.cpu-p3m.cpu-lj-therm.lb.1-core) python_test(FILE cellsystem.py MAX_NUM_PROC 4) python_test(FILE tune_skin.py MAX_NUM_PROC 1) python_test(FILE constraint_homogeneous_magnetic_field.py MAX_NUM_PROC 4) diff --git a/testsuite/python/save_checkpoint.py b/testsuite/python/save_checkpoint.py index a91ebe6431d..4e99856757c 100644 --- a/testsuite/python/save_checkpoint.py +++ b/testsuite/python/save_checkpoint.py @@ -59,6 +59,8 @@ if filepath.endswith((".checkpoint", ".cpt")): os.remove(os.path.join(checkpoint.checkpoint_dir, filepath)) +n_nodes = system.cell_system.get_state()["n_nodes"] + LB_implementation = None if 'LB.CPU' in modes: LB_implementation = espressomd.lb.LBFluid @@ -70,7 +72,8 @@ system.actors.add(lbf) if 'THERM.LB' 
in modes: system.thermostat.set_lb(LB_fluid=lbf, seed=23, gamma=2.0) - if any(has_features(i) for i in ["LB_BOUNDARIES", "LB_BOUNDARIES_GPU"]): + if any(has_features(i) + for i in ["LB_BOUNDARIES", "LB_BOUNDARIES_GPU"]) and n_nodes == 1: if 'EK.GPU' not in modes: system.lbboundaries.add( LBBoundary(shape=Wall(normal=(1, 0, 0), dist=0.5), velocity=(1e-4, 1e-4, 0))) @@ -136,6 +139,7 @@ delta_mid_bot=0.1) system.actors.add(elc) +# accumulators obs = espressomd.observables.ParticlePositions(ids=[0, 1]) acc_mean_variance = espressomd.accumulators.MeanVarianceCalculator(obs=obs) acc_time_series = espressomd.accumulators.TimeSeries(obs=obs) @@ -150,32 +154,35 @@ acc_time_series.update() acc_correlator.update() -system.auto_update_accumulators.add(acc_mean_variance) -system.auto_update_accumulators.add(acc_time_series) -system.auto_update_accumulators.add(acc_correlator) +if n_nodes == 1: + system.auto_update_accumulators.add(acc_mean_variance) + system.auto_update_accumulators.add(acc_time_series) + system.auto_update_accumulators.add(acc_correlator) # constraints -system.constraints.add(shape=Sphere(center=system.box_l / 2, radius=0.1), - particle_type=17) -system.constraints.add(shape=Wall(normal=[1. 
/ np.sqrt(3)] * 3, dist=0.5)) -system.constraints.add(constraints.Gravity(g=[1., 2., 3.])) -system.constraints.add(constraints.HomogeneousMagneticField(H=[1., 2., 3.])) -system.constraints.add( - constraints.HomogeneousFlowField(u=[1., 2., 3.], gamma=2.3)) -pot_field_data = constraints.ElectricPotential.field_from_fn( - system.box_l, np.ones(3), lambda x: np.linalg.norm(10 * np.ones(3) - x)) -checkpoint.register("pot_field_data") -system.constraints.add(constraints.PotentialField( - field=pot_field_data, grid_spacing=np.ones(3), default_scale=1.6, - particle_scales={5: 6.0})) -vec_field_data = constraints.ForceField.field_from_fn( - system.box_l, np.ones(3), lambda x: 10 * np.ones(3) - x) -checkpoint.register("vec_field_data") -system.constraints.add(constraints.ForceField( - field=vec_field_data, grid_spacing=np.ones(3), default_scale=1.4)) -if espressomd.has_features("ELECTROSTATICS"): - system.constraints.add(constraints.ElectricPlaneWave( - E0=[1., -2., 3.], k=[-.1, .2, .3], omega=5., phi=1.4)) +if n_nodes == 1: + system.constraints.add(shape=Sphere(center=system.box_l / 2, radius=0.1), + particle_type=17) + system.constraints.add(shape=Wall(normal=[1. 
/ np.sqrt(3)] * 3, dist=0.5)) + system.constraints.add(constraints.Gravity(g=[1., 2., 3.])) + system.constraints.add( + constraints.HomogeneousMagneticField(H=[1., 2., 3.])) + system.constraints.add( + constraints.HomogeneousFlowField(u=[1., 2., 3.], gamma=2.3)) + pot_field_data = constraints.ElectricPotential.field_from_fn( + system.box_l, np.ones(3), lambda x: np.linalg.norm(10 * np.ones(3) - x)) + checkpoint.register("pot_field_data") + system.constraints.add(constraints.PotentialField( + field=pot_field_data, grid_spacing=np.ones(3), default_scale=1.6, + particle_scales={5: 6.0})) + vec_field_data = constraints.ForceField.field_from_fn( + system.box_l, np.ones(3), lambda x: 10 * np.ones(3) - x) + checkpoint.register("vec_field_data") + system.constraints.add(constraints.ForceField( + field=vec_field_data, grid_spacing=np.ones(3), default_scale=1.4)) + if espressomd.has_features("ELECTROSTATICS"): + system.constraints.add(constraints.ElectricPlaneWave( + E0=[1., -2., 3.], k=[-.1, .2, .3], omega=5., phi=1.4)) if 'LB.OFF' in modes: # set thermostat diff --git a/testsuite/python/test_checkpoint.py b/testsuite/python/test_checkpoint.py index ecd6a82375f..00005be7b4f 100644 --- a/testsuite/python/test_checkpoint.py +++ b/testsuite/python/test_checkpoint.py @@ -40,13 +40,15 @@ class CheckpointTest(ut.TestCase): + checkpoint = espressomd.checkpointing.Checkpoint( + checkpoint_id="mycheckpoint_@TEST_COMBINATION@_@TEST_BINARY@".replace( + '.', '__'), + checkpoint_path="@CMAKE_CURRENT_BINARY_DIR@") + checkpoint.load(0) + n_nodes = system.cell_system.get_state()["n_nodes"] + @classmethod def setUpClass(cls): - cls.checkpoint = espressomd.checkpointing.Checkpoint( - checkpoint_id="mycheckpoint_@TEST_COMBINATION@_@TEST_BINARY@".replace( - '.', '__'), - checkpoint_path="@CMAKE_CURRENT_BINARY_DIR@") - cls.checkpoint.load(0) cls.ref_box_l = np.array([12.0, 14.0, 16.0]) if 'DP3M' in modes: cls.ref_box_l = np.array([16.0, 16.0, 16.0]) @@ -160,6 +162,7 @@ def test_part(self): 
np.testing.assert_allclose(np.copy(p1.f), particle_force0) np.testing.assert_allclose(np.copy(p2.f), particle_force1) + @ut.skipIf(n_nodes > 1, "only runs for 1 MPI rank") def test_object_containers_serialization(self): ''' Check that particles at the interface between two MPI nodes still @@ -225,9 +228,8 @@ def test_thermostat_DPD(self): self.assertEqual(thmst['type'], 'DPD') self.assertEqual(thmst['kT'], 1.0) self.assertEqual(thmst['seed'], 42) - n_nodes = system.cell_system.get_state()["n_nodes"] - if n_nodes in {1, 2, 3, 4, 8}: - ref_counter = (n_nodes == 3) and 6 or 8 + if self.n_nodes in {1, 2, 3, 4, 8}: + ref_counter = (self.n_nodes == 1) and 10 or 0 self.assertEqual(thmst['counter'], ref_counter) @utx.skipIfMissingFeatures('NPT') @@ -358,24 +360,27 @@ def test_mean_variance_calculator(self): np.testing.assert_array_equal( acc_mean_variance.variance(), np.array([[0., 0.5, 2.], [0., 0., 0.]])) - np.testing.assert_array_equal( - system.auto_update_accumulators[0].variance(), - np.array([[0., 0.5, 2.], [0., 0., 0.]])) + if self.n_nodes == 1: + np.testing.assert_array_equal( + system.auto_update_accumulators[0].variance(), + np.array([[0., 0.5, 2.], [0., 0., 0.]])) def test_time_series(self): expected = [[[1, 1, 1], [1, 1, 2]], [[1, 2, 3], [1, 1, 2]]] np.testing.assert_array_equal(acc_time_series.time_series(), expected) - np.testing.assert_array_equal( - system.auto_update_accumulators[1].time_series(), - expected) + if self.n_nodes == 1: + np.testing.assert_array_equal( + system.auto_update_accumulators[1].time_series(), + expected) def test_correlator(self): expected = np.zeros((36, 2, 3)) expected[0:2] = [[[1, 2.5, 5], [1, 1, 4]], [[1, 2, 3], [1, 1, 4]]] np.testing.assert_array_equal(acc_correlator.result(), expected) - np.testing.assert_array_equal( - system.auto_update_accumulators[2].result(), - expected) + if self.n_nodes == 1: + np.testing.assert_array_equal( + system.auto_update_accumulators[2].result(), + expected) @utx.skipIfMissingFeatures('DP3M') 
@ut.skipIf('DP3M.CPU' not in modes, @@ -482,6 +487,7 @@ def test_exclusions(self): @ut.skipIf(not LB or EK or not (espressomd.has_features("LB_BOUNDARIES") or espressomd.has_features("LB_BOUNDARIES_GPU")), "Missing features") + @ut.skipIf(n_nodes > 1, "only runs for 1 MPI rank") def test_lb_boundaries(self): # check boundaries agree on all MPI nodes self.assertEqual(len(system.lbboundaries), 2) @@ -503,6 +509,7 @@ def test_lb_boundaries(self): np.testing.assert_equal( system.actors[0][:, :, :].boundary.astype(int), 0) + @ut.skipIf(n_nodes > 1, "only runs for 1 MPI rank") def test_constraints(self): from espressomd import constraints self.assertEqual(len(system.constraints),