From be335b88fb451092a52390170ff22725702be45c Mon Sep 17 00:00:00 2001 From: Phil Henderson Date: Tue, 26 Nov 2024 16:35:47 -0500 Subject: [PATCH] DAOS-16100 test: Fix stopping daos_test during timeout (#15275) Fix stopping timed out processes run by a JobManager class by only searching for and killing the command executable being run by clush, orterun, mpirun, etc. Add a new harness/cmocka.py test to verify the stopping of the processes with a test timeout. Skip-unit-tests: true Skip-fault-injection-test: true Test-tag: pr daos_test dfuse_test test_load_mpi HarnessCmockaTest Allow-unstable-test: true Signed-off-by: Phil Henderson --- src/tests/ftest/daos_test/dfs.py | 11 +- src/tests/ftest/daos_test/dfuse.py | 14 +- src/tests/ftest/harness/basic.py | 50 +------ src/tests/ftest/harness/cmocka.py | 138 +++++++++++++++++++ src/tests/ftest/harness/cmocka.yaml | 1 + src/tests/ftest/util/apricot/apricot/test.py | 2 - src/tests/ftest/util/cmocka_utils.py | 72 ++++++---- src/tests/ftest/util/command_utils.py | 3 + src/tests/ftest/util/daos_core_base.py | 16 ++- src/tests/ftest/util/job_manager_utils.py | 84 +++++------ src/tests/ftest/util/run_utils.py | 25 +++- 11 files changed, 273 insertions(+), 143 deletions(-) create mode 100644 src/tests/ftest/harness/cmocka.py create mode 100644 src/tests/ftest/harness/cmocka.yaml diff --git a/src/tests/ftest/daos_test/dfs.py b/src/tests/ftest/daos_test/dfs.py index 6b43757a8d7..90b5c0799d6 100644 --- a/src/tests/ftest/daos_test/dfs.py +++ b/src/tests/ftest/daos_test/dfs.py @@ -1,5 +1,5 @@ """ - (C) Copyright 2020-2023 Intel Corporation. + (C) Copyright 2020-2024 Intel Corporation. 
SPDX-License-Identifier: BSD-2-Clause-Patent """ @@ -34,8 +34,7 @@ def test_daos_dfs_unit(self): :avocado: tags=daos_test,dfs_test,dfs :avocado: tags=DaosCoreTestDfs,test_daos_dfs_unit """ - self.daos_test = os.path.join(self.bin, 'dfs_test') - self.run_subtest() + self.run_subtest(os.path.join(self.bin, "dfs_test")) def test_daos_dfs_parallel(self): """Jira ID: DAOS-5409. @@ -51,8 +50,7 @@ def test_daos_dfs_parallel(self): :avocado: tags=daos_test,dfs_test,dfs :avocado: tags=DaosCoreTestDfs,test_daos_dfs_parallel """ - self.daos_test = os.path.join(self.bin, 'dfs_test') - self.run_subtest() + self.run_subtest(os.path.join(self.bin, "dfs_test")) def test_daos_dfs_sys(self): """Jira ID: DAOS-7759. @@ -68,5 +66,4 @@ def test_daos_dfs_sys(self): :avocado: tags=daos_test,dfs_test,dfs :avocado: tags=DaosCoreTestDfs,test_daos_dfs_sys """ - self.daos_test = os.path.join(self.bin, 'dfs_test') - self.run_subtest() + self.run_subtest(os.path.join(self.bin, "dfs_test")) diff --git a/src/tests/ftest/daos_test/dfuse.py b/src/tests/ftest/daos_test/dfuse.py index e735e745cab..4ef4a88fd02 100644 --- a/src/tests/ftest/daos_test/dfuse.py +++ b/src/tests/ftest/daos_test/dfuse.py @@ -8,7 +8,7 @@ from collections import OrderedDict from apricot import TestWithServers -from cmocka_utils import CmockaUtils +from cmocka_utils import CmockaUtils, get_cmocka_command from dfuse_utils import get_dfuse, start_dfuse from file_utils import create_directory from general_utils import get_log_file @@ -31,8 +31,6 @@ def run_test(self, il_lib=None): if il_lib is None: self.fail('il_lib is not defined.') - daos_test = os.path.join(self.bin, 'dfuse_test') - # Create a pool, container and start dfuse. 
pool = self.get_pool(connect=False) container = self.get_container(pool) @@ -105,8 +103,8 @@ def run_test(self, il_lib=None): daos_test_env['D_IL_MAX_EQ'] = '2' daos_test_env['D_IL_ENFORCE_EXEC_ENV'] = '1' - command = [ - daos_test, + command = os.path.join(self.bin, 'dfuse_test') + parameters = [ '--test-dir', mount_dir, '--io', @@ -117,7 +115,7 @@ def run_test(self, il_lib=None): '--cache' ] if use_dfuse: - command.append('--lowfd') + parameters.append('--lowfd') else: # make D_IL_MOUNT_POINT different from mount_dir so it tests a non-DAOS filesystem dummy_dir = '/tmp/dummy' @@ -126,9 +124,9 @@ def run_test(self, il_lib=None): self.fail(f"Error creating {dummy_dir} on {result.failed_hosts}") daos_test_env['D_IL_MOUNT_POINT'] = dummy_dir if cache_mode != 'writeback': - command.append('--metadata') + parameters.append('--metadata') - job = get_job_manager(self, "Clush", cmocka_utils.get_cmocka_command(" ".join(command))) + job = get_job_manager(self, "Clush", get_cmocka_command(command, ' '.join(parameters))) job.assign_hosts(cmocka_utils.hosts) job.assign_environment(daos_test_env) diff --git a/src/tests/ftest/harness/basic.py b/src/tests/ftest/harness/basic.py index 49759f4be09..0ff102e2074 100644 --- a/src/tests/ftest/harness/basic.py +++ b/src/tests/ftest/harness/basic.py @@ -6,7 +6,6 @@ import os from apricot import TestWithoutServers -from cmocka_utils import CmockaUtils from command_utils import SubProcessCommand from exception_utils import CommandFailure from job_manager_utils import Mpirun, Orterun @@ -79,12 +78,14 @@ def test_load_mpi(self): try: Orterun(None) except CommandFailure as error: - self.fail("Orterun initialization failed: {}".format(error)) + self.log.error("Orterun initialization failed: %s", error) + self.fail("Orterun initialization failed") try: Mpirun(None, mpi_type="mpich") except CommandFailure as error: - self.fail("Mpirun initialization failed: {}".format(error)) + self.log.error("Mpirun initialization failed: %s", error) + 
self.fail("Mpirun initialization failed") def test_load_mpi_hw(self): """Simple test of apricot test code to load the openmpi module. @@ -125,46 +126,3 @@ def test_sub_process_command(self): if failed: self.fail("The '{}' command failed".format(command)) self.log.info("Test passed") - - def test_no_cmocka_xml(self): - """Test to verify CmockaUtils detects lack of cmocka file generation. - - If working correctly this test should fail due to a missing cmocka file. - - :avocado: tags=all - :avocado: tags=vm - :avocado: tags=harness,harness_cmocka,failure_expected - :avocado: tags=HarnessBasicTest,test_no_cmocka_xml - """ - self.log.info("=" * 80) - self.log.info("Running the 'hostname' command via CmockaUtils") - self.log.info(" This should generate a cmocka xml file with a 'Missing file' error") - name = "no_cmocka_xml_file_test" - cmocka_utils = CmockaUtils(None, name, self.outputdir, self.test_dir, self.log) - command = cmocka_utils.get_cmocka_command("hostname") - cmocka_utils.run_cmocka_test(self, command) - - # Verify a generated cmocka xml file exists - expected = os.path.join(self.outputdir, "{}_cmocka_results.xml".format(name)) - self.log.info("Verifying the existence of the generated cmocka file: %s", expected) - if not os.path.isfile(expected): - self.fail("No {} file found".format(expected)) - - # Verify the generated cmocka xml file contains the expected error - self.log.info("Verifying contents of the generated cmocka file: %s", expected) - with open(expected, "r", encoding="utf-8") as file_handle: - actual_contents = file_handle.readlines() - error_message = "Missing cmocka results for hostname in {}".format(self.outputdir) - expected_lines = [ - "".format(error_message) - ] - for index, actual_line in enumerate(actual_contents[1:4]): - self.log.debug(" expecting: %s", expected_lines[index]) - self.log.debug(" in actual: %s", actual_line[:-1].strip()) - if expected_lines[index] not in actual_line: - self.fail("Badly formed {} file".format(expected)) - - 
self.log.info("Test passed") diff --git a/src/tests/ftest/harness/cmocka.py b/src/tests/ftest/harness/cmocka.py new file mode 100644 index 00000000000..adc482132f2 --- /dev/null +++ b/src/tests/ftest/harness/cmocka.py @@ -0,0 +1,138 @@ +""" + (C) Copyright 2022-2024 Intel Corporation. + + SPDX-License-Identifier: BSD-2-Clause-Patent +""" +import os + +from apricot import TestWithoutServers +from cmocka_utils import CmockaUtils, get_cmocka_command +from host_utils import get_local_host +from job_manager_utils import get_job_manager + + +class HarnessCmockaTest(TestWithoutServers): + """Cmocka harness test cases. + + :avocado: recursive + """ + + def test_no_cmocka_xml(self): + """Test to verify CmockaUtils detects lack of cmocka file generation. + + If working correctly this test should fail due to a missing cmocka file. + + :avocado: tags=all + :avocado: tags=vm + :avocado: tags=harness,failure_expected + :avocado: tags=HarnessCmockaTest,test_no_cmocka_xml + """ + self._run_cmocka_test(get_cmocka_command("hostname"), False, True) + self.log.info("Test passed") + + def test_clush_manager_timeout(self): + """Test to verify CmockaUtils handles timed out process correctly. + + If working correctly this test should fail due to a test timeout and a missing cmocka file. + + :avocado: tags=all + :avocado: tags=vm + :avocado: tags=harness,failure_expected + :avocado: tags=HarnessCmockaTest,test_clush_manager_timeout + """ + self._run_cmocka_test(self._get_manager_command("Clush", "sleep", "60"), True, True) + self.fail("Test did not timeout") + + def test_orterun_manager_timeout(self): + """Test to verify CmockaUtils handles timed out process correctly. + + If working correctly this test should fail due to a test timeout and a missing cmocka file. 
+ + :avocado: tags=all + :avocado: tags=vm + :avocado: tags=harness,failure_expected + :avocado: tags=HarnessCmockaTest,test_orterun_manager_timeout + """ + self._run_cmocka_test(self._get_manager_command("Orterun", "sleep", "60"), True, True) + self.fail("Test did not timeout") + + def test_mpirun_manager_timeout(self): + """Test to verify CmockaUtils handles timed out process correctly. + + If working correctly this test should fail due to a test timeout and a missing cmocka file. + + :avocado: tags=all + :avocado: tags=vm + :avocado: tags=harness,failure_expected + :avocado: tags=HarnessCmockaTest,test_mpirun_manager_timeout + """ + self._run_cmocka_test(self._get_manager_command("Mpirun", "sleep", "60"), True, True) + self.fail("Test did not timeout") + + def _run_cmocka_test(self, command, timeout, missing): + """Run the cmocka test case. + + Args: + command (ExecutableCommand): the command to run + timeout (bool): is the test expected to timeout + missing (bool): is the test expected to be missing a cmocka result + """ + self.log.info("Running the '%s' command via CmockaUtils", str(command)) + if timeout: + self.log.info(" This should generate a test timeout failure") + if missing: + self.log.info(" This should generate a cmocka xml file with a 'Missing file' error") + + cmocka_utils = CmockaUtils(None, self.test_id, self.outputdir, self.test_dir, self.log) + try: + cmocka_utils.run_cmocka_test(self, command) + finally: + self._verify_no_cmocka_xml(self.test_id, command) + + def _get_manager_command(self, class_name, executable, parameters): + """Get a JobManager command object. 
+ + Args: + class_name (str): JobManager class name + executable (str): executable to be managed + parameters (str): parameters for the executable to be managed + + Returns: + JobManager: the requested JobManager class + """ + command = get_cmocka_command(executable, parameters) + manager = get_job_manager(self, class_name, command) + manager.assign_hosts(get_local_host()) + return manager + + def _verify_no_cmocka_xml(self, name, command): + """Verify a cmocka xml file was generated with the expected error. + + Args: + name (str): name of the cmocka test + command (ExecutableCommand): command for the cmocka test + """ + # Verify a generated cmocka xml file exists + expected = os.path.join(self.outputdir, f"{name}_cmocka_results.xml") + self.log.info("Verifying the existence of the generated cmocka file: %s", expected) + if not os.path.isfile(expected): + self.fail(f"No {expected} file found") + + # Verify the generated cmocka xml file contains the expected error + self.log.info("Verifying contents of the generated cmocka file: %s", expected) + with open(expected, "r", encoding="utf-8") as file_handle: + actual_contents = file_handle.readlines() + if hasattr(command, "job"): + error_message = f"Missing cmocka results for {str(command.job)} in {self.outputdir}" + else: + error_message = f"Missing cmocka results for {str(command)} in {self.outputdir}" + expected_lines = [ + f"" + ] + for index, actual_line in enumerate(actual_contents[1:4]): + self.log.debug(" expecting: %s", expected_lines[index]) + self.log.debug(" in actual: %s", actual_line[:-1].strip()) + if expected_lines[index] not in actual_line: + self.fail(f"Badly formed {expected} file") diff --git a/src/tests/ftest/harness/cmocka.yaml b/src/tests/ftest/harness/cmocka.yaml new file mode 100644 index 00000000000..be8aad8fd10 --- /dev/null +++ b/src/tests/ftest/harness/cmocka.yaml @@ -0,0 +1 @@ +timeout: 10 diff --git a/src/tests/ftest/util/apricot/apricot/test.py 
b/src/tests/ftest/util/apricot/apricot/test.py index a827b5df53e..76f9820aee5 100644 --- a/src/tests/ftest/util/apricot/apricot/test.py +++ b/src/tests/ftest/util/apricot/apricot/test.py @@ -504,7 +504,6 @@ def __init__(self, *args, **kwargs): self.client_mca = None self.bin = None - self.daos_test = None self.cart_prefix = None self.cart_bin = None self.tmp = None @@ -522,7 +521,6 @@ def setUp(self): """Set up run before each test.""" super().setUp() self.bin = os.path.join(self.prefix, 'bin') - self.daos_test = os.path.join(self.prefix, 'bin', 'daos_test') # set the shared directory for daos tests self.tmp = self.test_env.shared_dir diff --git a/src/tests/ftest/util/cmocka_utils.py b/src/tests/ftest/util/cmocka_utils.py index 69ffe767e35..6a9f0441938 100644 --- a/src/tests/ftest/util/cmocka_utils.py +++ b/src/tests/ftest/util/cmocka_utils.py @@ -1,5 +1,5 @@ """ - (C) Copyright 2022-2023 Intel Corporation. + (C) Copyright 2022-2024 Intel Corporation. SPDX-License-Identifier: BSD-2-Clause-Patent """ @@ -7,12 +7,52 @@ from agent_utils import include_local_host from command_utils import ExecutableCommand -from command_utils_base import EnvironmentVariables +from command_utils_base import BasicParameter, EnvironmentVariables from exception_utils import CommandFailure from results_utils import Job, Results, TestName, TestResult, create_xml from run_utils import get_clush_command, run_local, run_remote +def get_cmocka_command(command, parameters=None): + """Get an ExecutableCommand representing the provided command string. + + Adds detection of any bad keywords in the command output that, if found, will result in a + command failure. 
+ + Args: + command (str): the command path to use to create the CmockaCommand + parameters (str): parameters to use to create the CmockaCommand + + Returns: + ExecutableCommand: the object setup to run the command + """ + keywords = ["Process received signal", "stack smashing detected", "End of error message", + "buffer overflow detected"] + path, executable = os.path.split(command) + command = CmockaCommand(path, executable, keywords) + command.parameters.value = parameters + return command + + +class CmockaCommand(ExecutableCommand): + """Defines an object representing a cmocka test command.""" + + def __init__(self, path, executable, keywords): + """Create a CmockaCommand object. + + Args: + path (str): the command path. This is excluded from the search when the process is + killed. + executable (str): the command executable. Also the string used to search for the process + when it is killed. + keywords (list): list of words used to mark the command as failed if any are found in + the command output. + """ + super().__init__(None, executable, path, check_results=keywords) + self.full_command_regex = True + self.parameters = BasicParameter(None) + + class CmockaUtils(): """Utilities for running test that generate cmocka xml results.""" @@ -76,24 +116,6 @@ def get_cmocka_env(self): "CMOCKA_MESSAGE_OUTPUT": "xml", }) - @staticmethod - def get_cmocka_command(command): - """Get an ExecutableCommand representing the provided command string. - - Adds detection of any bad keywords in the command output that, if found, will result in a - command failure.
- - Args: - command (str): the command string to use to create the ExecutableCommand - - Returns: - ExecutableCommand: the object setup to run the command - - """ - keywords = ["Process received signal", "stack smashing detected", "End of error message", - "buffer overflow detected"] - return ExecutableCommand(namespace=None, command=command, check_results=keywords) - def run_cmocka_test(self, test, command): """Run the cmocka test command. @@ -111,17 +133,19 @@ def run_cmocka_test(self, test, command): command.run() except CommandFailure as error: - error_message = "Error detected running {}".format(job_command) + error_message = f"Error detected running {job_command}" error_exception = error test.log.exception(error_message) test.fail(error_message) finally: + if test.status is not None and test.status != 'PASS' and test.status != 'SKIP': + test.log.debug("Currently running processes for non-passing test:") + run_remote(test.log, self.hosts, "ps -jH") self._collect_cmocka_results(test) if not self._check_cmocka_files(): if error_message is None: - error_message = "Missing cmocka results for {} in {}".format( - job_command, self.cmocka_dir) + error_message = f"Missing cmocka results for {job_command} in {self.cmocka_dir}" self._generate_cmocka_files(test, error_message, error_exception) def _collect_cmocka_results(self, test): @@ -187,7 +211,7 @@ def _generate_cmocka_files(self, test, error_message, error_exception): test_result.traceback = error_exception test_result.time_elapsed = 0 - cmocka_xml = os.path.join(self.outputdir, "{}_cmocka_results.xml".format(self.test_name)) + cmocka_xml = os.path.join(self.outputdir, f"{self.test_name}_cmocka_results.xml") job = Job(self.test_name, xml_output=cmocka_xml) result = Results(test.logfile) result.tests.append(test_result) diff --git a/src/tests/ftest/util/command_utils.py b/src/tests/ftest/util/command_utils.py index 56ceb4c7fe3..dbed7ac3c44 100644 --- a/src/tests/ftest/util/command_utils.py +++ 
b/src/tests/ftest/util/command_utils.py @@ -72,6 +72,9 @@ def __init__(self, namespace, command, path="", subprocess=False, check_results= # used to check on the progress or terminate the command. self._exe_names = [self.command] + # If set use the full command string when returning the 'command_regex' property + self.full_command_regex = False + # Define an attribute to store the CmdResult from the last run() call. # A CmdResult object has the following properties: # command - command string diff --git a/src/tests/ftest/util/daos_core_base.py b/src/tests/ftest/util/daos_core_base.py index f5eee477822..1b0ad42fecc 100644 --- a/src/tests/ftest/util/daos_core_base.py +++ b/src/tests/ftest/util/daos_core_base.py @@ -8,7 +8,7 @@ import shutil from apricot import TestWithServers -from cmocka_utils import CmockaUtils +from cmocka_utils import CmockaUtils, get_cmocka_command from general_utils import get_log_file from job_manager_utils import get_job_manager from test_utils_pool import POOL_TIMEOUT_INCREMENT @@ -51,8 +51,15 @@ def get_test_param(self, name, default=None): path = "/".join(["/run/daos_tests", name, "*"]) return self.params.get(self.get_test_name(), path, default) - def run_subtest(self): - """Run daos_test with a subtest argument.""" + def run_subtest(self, command=None): + """Run the executable with a subtest argument. + + Args: + command (str, optional): command to run. Defaults to None which will yield daos_test. 
+ """ + if command is None: + command = os.path.join(self.bin, "daos_test") + subtest = self.get_test_param("daos_test") num_clients = self.get_test_param("num_clients") if num_clients is None: @@ -81,8 +88,7 @@ def run_subtest(self): daos_test_env["COVFILE"] = "/tmp/test.cov" daos_test_env["POOL_SCM_SIZE"] = str(scm_size) daos_test_env["POOL_NVME_SIZE"] = str(nvme_size) - daos_test_cmd = cmocka_utils.get_cmocka_command( - " ".join([self.daos_test, "-n", dmg_config_file, "".join(["-", subtest]), str(args)])) + daos_test_cmd = get_cmocka_command(command, f"-n {dmg_config_file} -{subtest} {str(args)}") job = get_job_manager(self, "Orterun", daos_test_cmd, mpi_type="openmpi") job.assign_hosts(cmocka_utils.hosts, self.workdir, None) job.assign_processes(num_clients) diff --git a/src/tests/ftest/util/job_manager_utils.py b/src/tests/ftest/util/job_manager_utils.py index 2b5f2cd6c26..932a9f7a306 100644 --- a/src/tests/ftest/util/job_manager_utils.py +++ b/src/tests/ftest/util/job_manager_utils.py @@ -179,6 +179,29 @@ def assign_hosts(self, hosts, path=None, slots=None, hostfile=True): hostfile (bool, optional): whether or not to also update any host related command parameters to keep them in sync with the hosts. Defaults to True. """ + # pylint: disable=unused-argument + self._hosts = hosts.copy() + + def _setup_hostfile(self, path=None, slots=None, hostfile=True): + """Set up the hostfile to use with the command. + + Args: + path (str, optional): path to use when specifying the hosts through + a hostfile. Defaults to None. + slots (int, optional): number of slots per host to specify in the + optional hostfile. Defaults to None. + hostfile (bool, optional): whether or not to also update any host related command + parameters to keep them in sync with the hosts. Defaults to True.
+ + Returns: + str: the full path of the written hostfile; None if one is not written + """ + if not hostfile: + return None + kwargs = {"hosts": self._hosts, "slots": slots} + if path is not None: + kwargs["path"] = path + return write_host_file(**kwargs) def assign_processes(self, processes): """Assign the number of processes. @@ -297,7 +320,10 @@ def kill(self): if not self.job: return regex = self.job.command_regex - detected, running = stop_processes(self.log, self._hosts, regex) + if self.job.full_command_regex: + regex = f"'{str(self.job)}'" + detected, running = stop_processes( + self.log, self._hosts, regex, full_command=self.job.full_command_regex) if not detected: self.log.info( "No remote %s processes killed on %s (none found), done.", regex, self._hosts) @@ -366,13 +392,8 @@ def assign_hosts(self, hosts, path=None, slots=None, hostfile=True): hostfile (bool, optional): whether or not to also update any host related command parameters to keep them in sync with the hosts. Defaults to True. """ - self._hosts = hosts.copy() - if not hostfile: - return - kwargs = {"hosts": self._hosts, "slots": slots} - if path is not None: - kwargs["path"] = path - self.hostfile.value = write_host_file(**kwargs) + super().assign_hosts(hosts, path, slots, hostfile) + self.hostfile.value = self._setup_hostfile(path, slots, hostfile) def assign_processes(self, processes): """Assign the number of processes (-np). @@ -485,13 +506,8 @@ def assign_hosts(self, hosts, path=None, slots=None, hostfile=True): hostfile (bool, optional): whether or not to also update any host related command parameters to keep them in sync with the hosts. Defaults to True. 
""" - self._hosts = hosts.copy() - if not hostfile: - return - kwargs = {"hosts": self._hosts, "slots": slots} - if path is not None: - kwargs["path"] = path - self.hostfile.value = write_host_file(**kwargs) + super().assign_hosts(hosts, path, slots, hostfile) + self.hostfile.value = self._setup_hostfile(path, slots, hostfile) def assign_processes(self, processes=None, ppn=None): """Assign the number of processes (-np) and processes per node (-ppn). @@ -589,13 +605,8 @@ def assign_hosts(self, hosts, path=None, slots=None, hostfile=True): hostfile (bool, optional): whether or not to also update any host related command parameters to keep them in sync with the hosts. Defaults to True. """ - self._hosts = hosts.copy() - if not hostfile: - return - kwargs = {"hosts": self._hosts, "slots": None} - if path is not None: - kwargs["path"] = path - self.nodefile.value = write_host_file(**kwargs) + super().assign_hosts(hosts, path, slots, hostfile) + self.nodefile.value = self._setup_hostfile(path, slots, hostfile) self.ntasks_per_node.value = slots def assign_processes(self, processes): @@ -756,19 +767,6 @@ def check_subprocess_status(self, sub_process): self.job.pattern, self.timestamps["start"], None, self.job.pattern_count, self.job.pattern_timeout.value) - def assign_hosts(self, hosts, path=None, slots=None, hostfile=True): - """Assign the hosts to use with the command. - - Set the appropriate command line parameter with the specified value. - - Args: - hosts (NodeSet): hosts to specify on the command line - path (str, optional): not used. Defaults to None. - slots (int, optional): not used. Defaults to None. - hostfile (bool, optional): not used. Defaults to True. - """ - self._hosts = hosts.copy() - def assign_environment(self, env_vars, append=False): """Assign or add environment variables to the command. 
@@ -1213,17 +1211,6 @@ def __str__(self): commands = [super().__str__(), "-w {}".format(self.hosts), str(self.job)] return " ".join(commands) - def assign_hosts(self, hosts, path=None, slots=None, hostfile=True): - """Assign the hosts to use with the command (--hostfile). - - Args: - hosts (NodeSet): hosts to specify in the hostfile - path (str, optional): not used. Defaults to None. - slots (int, optional): not used. Defaults to None. - hostfile (bool, optional): not used. Defaults to True. - """ - self._hosts = hosts.copy() - def assign_environment(self, env_vars, append=False): """Assign or add environment variables to the command. @@ -1253,6 +1240,11 @@ def run(self, raise_exception=None): if raise_exception is None: raise_exception = self.exit_status_exception + if callable(self.register_cleanup_method): + # Stop any running processes started by this job manager when the test completes + # pylint: disable=not-callable + self.register_cleanup_method(stop_job_manager, job_manager=self) + command = " ".join([self.env.to_export_str(), str(self.job)]).strip() self.result = run_remote(self.log, self._hosts, command, self.verbose, self.timeout) diff --git a/src/tests/ftest/util/run_utils.py b/src/tests/ftest/util/run_utils.py index f4893558fb0..8e96e2228f0 100644 --- a/src/tests/ftest/util/run_utils.py +++ b/src/tests/ftest/util/run_utils.py @@ -4,6 +4,7 @@ SPDX-License-Identifier: BSD-2-Clause-Patent """ import os +import re import subprocess # nosec import time from getpass import getuser @@ -529,7 +530,8 @@ def find_command(source, pattern, depth, other=None): return " ".join(command) -def stop_processes(log, hosts, pattern, verbose=True, timeout=60, exclude=None, force=False): +def stop_processes(log, hosts, pattern, verbose=True, timeout=60, exclude=None, force=False, + full_command=False): """Stop the processes on each hosts that match the pattern. 
Args: @@ -542,6 +544,11 @@ def stop_processes(log, hosts, pattern, verbose=True, timeout=60, exclude=None, force (bool, optional): if set use the KILL signal to immediately stop any running processes. Defaults to False which will attempt to kill w/o a signal, then with the ABRT signal, and finally with the KILL signal. + full_command (bool, optional): if set match the pattern using the full command with + pgrep/pkill. Defaults to False. + + Raises: + ValueError: if the pattern ends up matching process 1. Returns: tuple: (NodeSet, NodeSet) where the first NodeSet indicates on which hosts processes @@ -551,19 +558,25 @@ def stop_processes(log, hosts, pattern, verbose=True, timeout=60, exclude=None, """ processes_detected = NodeSet() processes_running = NodeSet() - command = f"/usr/bin/pgrep --list-full {pattern}" + search_command = f"/usr/bin/pgrep --list-full {pattern}" pattern_match = str(pattern) if exclude: - command = f"/usr/bin/ps xa | grep -E {pattern} | grep -vE {exclude}" + search_command = f"/usr/bin/ps xa | grep -E {pattern} | grep -vE {exclude}" pattern_match += " and doesn't match " + str(exclude) + elif full_command: + search_command = f"/usr/bin/pgrep --list-full --full -x {pattern}" # Search for any active processes log.debug("Searching for any processes on %s that match %s", hosts, pattern_match) - result = run_remote(log, hosts, command, verbose, timeout) + result = run_remote(log, hosts, search_command, verbose, timeout) if not result.passed_hosts: log.debug("No processes found on %s that match %s", result.failed_hosts, pattern_match) return processes_detected, processes_running + # Catch any attempt to kill process 1. 
+ if "1" in re.findall(r"^(\d+)\s+", result.joined_stdout, re.MULTILINE): + raise ValueError(f"Attempting to kill process 1 as a match for {pattern}!") + # Indicate on which hosts processes matching the pattern were found running in the return status processes_detected.add(result.passed_hosts) @@ -580,9 +593,11 @@ def stop_processes(log, hosts, pattern, verbose=True, timeout=60, exclude=None, "Killing%s any processes on %s that match %s and then waiting %s seconds", step[0], result.passed_hosts, pattern_match, step[1]) kill_command = f"sudo /usr/bin/pkill{step[0]} {pattern}" + if full_command: + kill_command = f"sudo /usr/bin/pkill{step[0]} --full -x {pattern}" run_remote(log, result.passed_hosts, kill_command, verbose, timeout) time.sleep(step[1]) - result = run_remote(log, result.passed_hosts, command, verbose, timeout) + result = run_remote(log, result.passed_hosts, search_command, verbose, timeout) if not result.passed_hosts: # Indicate all running processes matching the pattern were stopped in the return status log.debug(