DAOS-16100 test: Fix stopping daos_test during timeout (#15275) (#15603)

Fix stopping timed-out processes run by a JobManager class by searching for and killing only the command executable being run by clush, orterun, mpirun, etc.

Add a new harness/cmocka.py test to verify that the processes are stopped when a test times out.

Signed-off-by: Phil Henderson <[email protected]>
daos-stack · Dec 12, 2024 · 7f1eab0 · 7f1eab0
1 parent d238ceb
commit 7f1eab0
Show file tree

Hide file tree

Showing 11 changed files with 273 additions and 143 deletions.
diff --git a/src/tests/ftest/daos_test/dfs.py b/src/tests/ftest/daos_test/dfs.py
@@ -1,5 +1,5 @@
 """
-  (C) Copyright 2020-2023 Intel Corporation.
+  (C) Copyright 2020-2024 Intel Corporation.
 
   SPDX-License-Identifier: BSD-2-Clause-Patent
 """
@@ -34,8 +34,7 @@ def test_daos_dfs_unit(self):
         :avocado: tags=daos_test,dfs_test,dfs
         :avocado: tags=DaosCoreTestDfs,test_daos_dfs_unit
         """
-        self.daos_test = os.path.join(self.bin, 'dfs_test')
-        self.run_subtest()
+        self.run_subtest(os.path.join(self.bin, "dfs_test"))
 
     def test_daos_dfs_parallel(self):
         """Jira ID: DAOS-5409.
@@ -51,8 +50,7 @@ def test_daos_dfs_parallel(self):
         :avocado: tags=daos_test,dfs_test,dfs
         :avocado: tags=DaosCoreTestDfs,test_daos_dfs_parallel
         """
-        self.daos_test = os.path.join(self.bin, 'dfs_test')
-        self.run_subtest()
+        self.run_subtest(os.path.join(self.bin, "dfs_test"))
 
     def test_daos_dfs_sys(self):
         """Jira ID: DAOS-7759.
@@ -68,5 +66,4 @@ def test_daos_dfs_sys(self):
         :avocado: tags=daos_test,dfs_test,dfs
         :avocado: tags=DaosCoreTestDfs,test_daos_dfs_sys
         """
-        self.daos_test = os.path.join(self.bin, 'dfs_test')
-        self.run_subtest()
+        self.run_subtest(os.path.join(self.bin, "dfs_test"))
diff --git a/src/tests/ftest/daos_test/dfuse.py b/src/tests/ftest/daos_test/dfuse.py
@@ -8,7 +8,7 @@
 from collections import OrderedDict
 
 from apricot import TestWithServers
-from cmocka_utils import CmockaUtils
+from cmocka_utils import CmockaUtils, get_cmocka_command
 from dfuse_utils import get_dfuse, start_dfuse
 from file_utils import create_directory
 from general_utils import get_log_file
@@ -31,8 +31,6 @@ def run_test(self, il_lib=None):
         if il_lib is None:
             self.fail('il_lib is not defined.')
 
-        daos_test = os.path.join(self.bin, 'dfuse_test')
-
         # Create a pool, container and start dfuse.
         pool = self.get_pool(connect=False)
         container = self.get_container(pool)
@@ -105,8 +103,8 @@ def run_test(self, il_lib=None):
                 daos_test_env['D_IL_MAX_EQ'] = '2'
                 daos_test_env['D_IL_ENFORCE_EXEC_ENV'] = '1'
 
-        command = [
-            daos_test,
+        command = os.path.join(self.bin, 'dfuse_test')
+        parameters = [
             '--test-dir',
             mount_dir,
             '--io',
@@ -117,7 +115,7 @@ def run_test(self, il_lib=None):
             '--cache'
         ]
         if use_dfuse:
-            command.append('--lowfd')
+            parameters.append('--lowfd')
         else:
             # make D_IL_MOUNT_POINT different from mount_dir so it tests a non-DAOS filesystem
             dummy_dir = '/tmp/dummy'
@@ -126,9 +124,9 @@ def run_test(self, il_lib=None):
                 self.fail(f"Error creating {dummy_dir} on {result.failed_hosts}")
             daos_test_env['D_IL_MOUNT_POINT'] = dummy_dir
         if cache_mode != 'writeback':
-            command.append('--metadata')
+            parameters.append('--metadata')
 
-        job = get_job_manager(self, "Clush", cmocka_utils.get_cmocka_command(" ".join(command)))
+        job = get_job_manager(self, "Clush", get_cmocka_command(command, ' '.join(parameters)))
         job.assign_hosts(cmocka_utils.hosts)
         job.assign_environment(daos_test_env)
 

diff --git a/src/tests/ftest/harness/basic.py b/src/tests/ftest/harness/basic.py
@@ -6,7 +6,6 @@
 import os
 
 from apricot import TestWithoutServers
-from cmocka_utils import CmockaUtils
 from command_utils import SubProcessCommand
 from exception_utils import CommandFailure
 from job_manager_utils import Mpirun, Orterun
@@ -79,12 +78,14 @@ def test_load_mpi(self):
         try:
             Orterun(None)
         except CommandFailure as error:
-            self.fail("Orterun initialization failed: {}".format(error))
+            self.log.error("Orterun initialization failed: %s", error)
+            self.fail("Orterun initialization failed")
 
         try:
             Mpirun(None, mpi_type="mpich")
         except CommandFailure as error:
-            self.fail("Mpirun initialization failed: {}".format(error))
+            self.log.error("Mpirun initialization failed: %s", error)
+            self.fail("Mpirun initialization failed")
 
     def test_load_mpi_hw(self):
         """Simple test of apricot test code to load the openmpi module.
@@ -125,46 +126,3 @@ def test_sub_process_command(self):
         if failed:
             self.fail("The '{}' command failed".format(command))
         self.log.info("Test passed")
-
-    def test_no_cmocka_xml(self):
-        """Test to verify CmockaUtils detects lack of cmocka file generation.
-
-        If working correctly this test should fail due to a missing cmocka file.
-
-        :avocado: tags=all
-        :avocado: tags=vm
-        :avocado: tags=harness,harness_cmocka,failure_expected
-        :avocado: tags=HarnessBasicTest,test_no_cmocka_xml
-        """
-        self.log.info("=" * 80)
-        self.log.info("Running the 'hostname' command via CmockaUtils")
-        self.log.info("  This should generate a cmocka xml file with a 'Missing file' error")
-        name = "no_cmocka_xml_file_test"
-        cmocka_utils = CmockaUtils(None, name, self.outputdir, self.test_dir, self.log)
-        command = cmocka_utils.get_cmocka_command("hostname")
-        cmocka_utils.run_cmocka_test(self, command)
-
-        # Verify a generated cmocka xml file exists
-        expected = os.path.join(self.outputdir, "{}_cmocka_results.xml".format(name))
-        self.log.info("Verifying the existence of the generated cmocka file: %s", expected)
-        if not os.path.isfile(expected):
-            self.fail("No {} file found".format(expected))
-
-        # Verify the generated cmocka xml file contains the expected error
-        self.log.info("Verifying contents of the generated cmocka file: %s", expected)
-        with open(expected, "r", encoding="utf-8") as file_handle:
-            actual_contents = file_handle.readlines()
-        error_message = "Missing cmocka results for hostname in {}".format(self.outputdir)
-        expected_lines = [
-            "<testsuite errors=\"1\" failures=\"0\" name=\"{}\" skipped=\"0\" tests=\"1\"".format(
-                name),
-            "<testcase classname=\"{}\" name=\"{}\"".format(name, self.name),
-            "<error message=\"{}\" type=\"Missing file\">".format(error_message)
-        ]
-        for index, actual_line in enumerate(actual_contents[1:4]):
-            self.log.debug("  expecting: %s", expected_lines[index])
-            self.log.debug("  in actual: %s", actual_line[:-1].strip())
-            if expected_lines[index] not in actual_line:
-                self.fail("Badly formed {} file".format(expected))
-
-        self.log.info("Test passed")
diff --git a/src/tests/ftest/harness/cmocka.py b/src/tests/ftest/harness/cmocka.py
@@ -0,0 +1,138 @@
+"""
+  (C) Copyright 2022-2024 Intel Corporation.
+
+  SPDX-License-Identifier: BSD-2-Clause-Patent
+"""
+import os
+
+from apricot import TestWithoutServers
+from cmocka_utils import CmockaUtils, get_cmocka_command
+from host_utils import get_local_host
+from job_manager_utils import get_job_manager
+
+
+class HarnessCmockaTest(TestWithoutServers):
+    """Cmocka harness test cases.
+
+    :avocado: recursive
+    """
+
+    def test_no_cmocka_xml(self):
+        """Test to verify CmockaUtils detects lack of cmocka file generation.
+
+        If working correctly this test should fail due to a missing cmocka file.
+
+        :avocado: tags=all
+        :avocado: tags=vm
+        :avocado: tags=harness,failure_expected
+        :avocado: tags=HarnessCmockaTest,test_no_cmocka_xml
+        """
+        self._run_cmocka_test(get_cmocka_command("hostname"), False, True)
+        self.log.info("Test passed")
+
+    def test_clush_manager_timeout(self):
+        """Test to verify CmockaUtils handles timed out process correctly.
+
+        If working correctly this test should fail due to a test timeout and a missing cmocka file.
+
+        :avocado: tags=all
+        :avocado: tags=vm
+        :avocado: tags=harness,failure_expected
+        :avocado: tags=HarnessCmockaTest,test_clush_manager_timeout
+        """
+        self._run_cmocka_test(self._get_manager_command("Clush", "sleep", "60"), True, True)
+        self.fail("Test did not timeout")
+
+    def test_orterun_manager_timeout(self):
+        """Test to verify CmockaUtils handles timed out process correctly.
+
+        If working correctly this test should fail due to a test timeout and a missing cmocka file.
+
+        :avocado: tags=all
+        :avocado: tags=vm
+        :avocado: tags=harness,failure_expected
+        :avocado: tags=HarnessCmockaTest,test_orterun_manager_timeout
+        """
+        self._run_cmocka_test(self._get_manager_command("Orterun", "sleep", "60"), True, True)
+        self.fail("Test did not timeout")
+
+    def test_mpirun_manager_timeout(self):
+        """Test to verify CmockaUtils handles timed out process correctly.
+
+        If working correctly this test should fail due to a test timeout and a missing cmocka file.
+
+        :avocado: tags=all
+        :avocado: tags=vm
+        :avocado: tags=harness,failure_expected
+        :avocado: tags=HarnessCmockaTest,test_mpirun_manager_timeout
+        """
+        self._run_cmocka_test(self._get_manager_command("Mpirun", "sleep", "60"), True, True)
+        self.fail("Test did not timeout")
+
+    def _run_cmocka_test(self, command, timeout, missing):
+        """Run the cmocka test case.
+
+        Args:
+            command (ExecutableCommand): the command to run
+            timeout (bool): is the test expected to timeout
+            missing (bool): is the test expected to be missing a cmocka result
+        """
+        self.log.info("Running the '%s' command via CmockaUtils", str(command))
+        if timeout:
+            self.log.info("  This should generate a test timeout failure")
+        if missing:
+            self.log.info("  This should generate a cmocka xml file with a 'Missing file' error")
+
+        cmocka_utils = CmockaUtils(None, self.test_id, self.outputdir, self.test_dir, self.log)
+        try:
+            cmocka_utils.run_cmocka_test(self, command)
+        finally:
+            self._verify_no_cmocka_xml(self.test_id, command)
+
+    def _get_manager_command(self, class_name, executable, parameters):
+        """Get a JobManager command object.
+
+        Args:
+            class_name (str): JobManager class name
+            executable (str): executable to be managed
+            parameters (str): parameters for the executable to be managed
+
+        Returns:
+            JobManager: the requested JobManager class
+        """
+        command = get_cmocka_command(executable, parameters)
+        manager = get_job_manager(self, class_name, command)
+        manager.assign_hosts(get_local_host())
+        return manager
+
+    def _verify_no_cmocka_xml(self, name, command):
+        """Verify a cmocka xml file was generated with the expected error.
+
+        Args:
+            name (str): name of the cmocka test
+            command (ExecutableCommand): command for the cmocka test
+        """
+        # Verify a generated cmocka xml file exists
+        expected = os.path.join(self.outputdir, f"{name}_cmocka_results.xml")
+        self.log.info("Verifying the existence of the generated cmocka file: %s", expected)
+        if not os.path.isfile(expected):
+            self.fail(f"No {expected} file found")
+
+        # Verify the generated cmocka xml file contains the expected error
+        self.log.info("Verifying contents of the generated cmocka file: %s", expected)
+        with open(expected, "r", encoding="utf-8") as file_handle:
+            actual_contents = file_handle.readlines()
+        if hasattr(command, "job"):
+            error_message = f"Missing cmocka results for {str(command.job)} in {self.outputdir}"
+        else:
+            error_message = f"Missing cmocka results for {str(command)} in {self.outputdir}"
+        expected_lines = [
+            f"<testsuite errors=\"1\" failures=\"0\" name=\"{name}\" skipped=\"0\" tests=\"1\"",
+            f"<testcase classname=\"{name}\" name=\"{self.name}\"",
+            f"<error message=\"{error_message}\" type=\"Missing file\">"
+        ]
+        for index, actual_line in enumerate(actual_contents[1:4]):
+            self.log.debug("  expecting: %s", expected_lines[index])
+            self.log.debug("  in actual: %s", actual_line[:-1].strip())
+            if expected_lines[index] not in actual_line:
+                self.fail(f"Badly formed {expected} file")
diff --git a/src/tests/ftest/harness/cmocka.yaml b/src/tests/ftest/harness/cmocka.yaml
@@ -0,0 +1 @@
+timeout: 10
diff --git a/src/tests/ftest/util/apricot/apricot/test.py b/src/tests/ftest/util/apricot/apricot/test.py
@@ -504,7 +504,6 @@ def __init__(self, *args, **kwargs):
 
         self.client_mca = None
         self.bin = None
-        self.daos_test = None
         self.cart_prefix = None
         self.cart_bin = None
         self.tmp = None
@@ -522,7 +521,6 @@ def setUp(self):
         """Set up run before each test."""
         super().setUp()
         self.bin = os.path.join(self.prefix, 'bin')
-        self.daos_test = os.path.join(self.prefix, 'bin', 'daos_test')
 
         # set the shared directory for daos tests
         self.tmp = self.test_env.shared_dir