From ac9ab3142a02ef8841988947c30db44c1457b134 Mon Sep 17 00:00:00 2001 From: Phil Henderson Date: Tue, 8 Oct 2024 13:42:04 -0400 Subject: [PATCH 01/22] DAOS-16100 test: Fix stopping daos_test during timeout Properly stop the daos_test process if the test encounters a timeout while running. Skip-unit-tests: true Skip-fault-injection-test: true Test-tag: test_daos_management Required-githooks: true Signed-off-by: Phil Henderson --- src/tests/ftest/daos_test/suite.yaml | 2 +- src/tests/ftest/util/cmocka_utils.py | 3 ++- src/tests/ftest/util/job_manager_utils.py | 12 ++++++++++++ 3 files changed, 15 insertions(+), 2 deletions(-) diff --git a/src/tests/ftest/daos_test/suite.yaml b/src/tests/ftest/daos_test/suite.yaml index 3f8572f2bcf..8ea3980b9c2 100644 --- a/src/tests/ftest/daos_test/suite.yaml +++ b/src/tests/ftest/daos_test/suite.yaml @@ -8,7 +8,7 @@ hosts: timeout: 600 timeouts: test_daos_degraded_mode: 450 - test_daos_management: 110 + test_daos_management: 30 test_daos_pool: 180 test_daos_container: 700 test_daos_epoch: 125 diff --git a/src/tests/ftest/util/cmocka_utils.py b/src/tests/ftest/util/cmocka_utils.py index 69ffe767e35..4e26ae3228a 100644 --- a/src/tests/ftest/util/cmocka_utils.py +++ b/src/tests/ftest/util/cmocka_utils.py @@ -1,5 +1,5 @@ """ - (C) Copyright 2022-2023 Intel Corporation. + (C) Copyright 2022-2024 Intel Corporation. 
SPDX-License-Identifier: BSD-2-Clause-Patent """ @@ -117,6 +117,7 @@ def run_cmocka_test(self, test, command): test.fail(error_message) finally: + run_remote(test.log, self.hosts, "ps -ejH") self._collect_cmocka_results(test) if not self._check_cmocka_files(): if error_message is None: diff --git a/src/tests/ftest/util/job_manager_utils.py b/src/tests/ftest/util/job_manager_utils.py index 2b5f2cd6c26..a3e7d671e9d 100644 --- a/src/tests/ftest/util/job_manager_utils.py +++ b/src/tests/ftest/util/job_manager_utils.py @@ -143,6 +143,18 @@ def job(self, value): and self._job.check_results_list): self.check_results_list.extend(self._job.check_results_list) + @property + def command_regex(self): + """Get the regular expression to use to search for the command. + + Typical use would include combining with pgrep to verify a subprocess is running. + + Returns: + str: regular expression to use to search for the command + """ + # pylint: disable=protected-access + return "'({})'".format("|".join(self._exe_names + self.job._exe_names)) + def __str__(self): """Return the command with all of its defined parameters as a string. From 71a414242434013c0e51377c08ea5d30c2fe692b Mon Sep 17 00:00:00 2001 From: Phil Henderson Date: Wed, 23 Oct 2024 12:19:44 -0400 Subject: [PATCH 02/22] Force a timeout for debug. 
Skip-unit-tests: true Skip-fault-injection-test: true Test-tag: test_daos_management Required-githooks: true Signed-off-by: Phil Henderson --- src/tests/ftest/util/cmocka_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/tests/ftest/util/cmocka_utils.py b/src/tests/ftest/util/cmocka_utils.py index 4e26ae3228a..0c3cb9e9c22 100644 --- a/src/tests/ftest/util/cmocka_utils.py +++ b/src/tests/ftest/util/cmocka_utils.py @@ -108,6 +108,7 @@ def run_cmocka_test(self, test, command): error_message = None error_exception = None try: + command.timeout = 10 command.run() except CommandFailure as error: From cb91989410f478908f3dd4f039127746f13b7b41 Mon Sep 17 00:00:00 2001 From: Phil Henderson Date: Thu, 24 Oct 2024 08:43:11 -0400 Subject: [PATCH 03/22] Force test timeout Skip-unit-tests: true Skip-fault-injection-test: true Test-tag: test_daos_management Required-githooks: true Signed-off-by: Phil Henderson --- src/tests/ftest/util/cmocka_utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/tests/ftest/util/cmocka_utils.py b/src/tests/ftest/util/cmocka_utils.py index 0c3cb9e9c22..a4183b7b95f 100644 --- a/src/tests/ftest/util/cmocka_utils.py +++ b/src/tests/ftest/util/cmocka_utils.py @@ -4,6 +4,7 @@ SPDX-License-Identifier: BSD-2-Clause-Patent """ import os +import time from agent_utils import include_local_host from command_utils import ExecutableCommand @@ -108,7 +109,7 @@ def run_cmocka_test(self, test, command): error_message = None error_exception = None try: - command.timeout = 10 + time.sleep(30) # Debug command.run() except CommandFailure as error: From 6c05d568164f98cf965f6fc00982af8180eb463b Mon Sep 17 00:00:00 2001 From: Phil Henderson Date: Fri, 25 Oct 2024 17:10:05 -0400 Subject: [PATCH 04/22] Force timeout Skip-unit-tests: true Skip-fault-injection-test: true Test-tag: test_daos_management Required-githooks: true Signed-off-by: Phil Henderson --- src/tests/ftest/util/cmocka_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/src/tests/ftest/util/cmocka_utils.py b/src/tests/ftest/util/cmocka_utils.py index a4183b7b95f..4efa3c2177f 100644 --- a/src/tests/ftest/util/cmocka_utils.py +++ b/src/tests/ftest/util/cmocka_utils.py @@ -109,7 +109,7 @@ def run_cmocka_test(self, test, command): error_message = None error_exception = None try: - time.sleep(30) # Debug + time.sleep(50) # Debug command.run() except CommandFailure as error: From bd8b1dd965791daac97458844d6d6091f5a668b2 Mon Sep 17 00:00:00 2001 From: Phil Henderson Date: Mon, 28 Oct 2024 18:43:30 -0400 Subject: [PATCH 05/22] Updates. When stopping cmocka commands only use the executable name to find a pkill match. Skip-unit-tests: true Skip-fault-injection-test: true Test-tag: DaosCoreTestDfs DaosCoreTestDfuse harness_cmocka test_daos_management Required-githooks: true Signed-off-by: Phil Henderson --- src/tests/ftest/daos_test/dfs.py | 13 ++--- src/tests/ftest/daos_test/dfuse.py | 14 +++-- src/tests/ftest/daos_test/suite.yaml | 2 +- src/tests/ftest/harness/basic.py | 52 +++++++++++++----- src/tests/ftest/util/cmocka_utils.py | 73 ++++++++++++++++---------- src/tests/ftest/util/daos_core_base.py | 19 +++++-- 6 files changed, 111 insertions(+), 62 deletions(-) diff --git a/src/tests/ftest/daos_test/dfs.py b/src/tests/ftest/daos_test/dfs.py index 6b43757a8d7..721012d8026 100644 --- a/src/tests/ftest/daos_test/dfs.py +++ b/src/tests/ftest/daos_test/dfs.py @@ -1,11 +1,9 @@ """ - (C) Copyright 2020-2023 Intel Corporation. + (C) Copyright 2020-2024 Intel Corporation. SPDX-License-Identifier: BSD-2-Clause-Patent """ -import os - from daos_core_base import DaosCoreBase @@ -34,8 +32,7 @@ def test_daos_dfs_unit(self): :avocado: tags=daos_test,dfs_test,dfs :avocado: tags=DaosCoreTestDfs,test_daos_dfs_unit """ - self.daos_test = os.path.join(self.bin, 'dfs_test') - self.run_subtest() + self.run_subtest('dfs_test') def test_daos_dfs_parallel(self): """Jira ID: DAOS-5409. 
@@ -51,8 +48,7 @@ def test_daos_dfs_parallel(self): :avocado: tags=daos_test,dfs_test,dfs :avocado: tags=DaosCoreTestDfs,test_daos_dfs_parallel """ - self.daos_test = os.path.join(self.bin, 'dfs_test') - self.run_subtest() + self.run_subtest('dfs_test') def test_daos_dfs_sys(self): """Jira ID: DAOS-7759. @@ -68,5 +64,4 @@ def test_daos_dfs_sys(self): :avocado: tags=daos_test,dfs_test,dfs :avocado: tags=DaosCoreTestDfs,test_daos_dfs_sys """ - self.daos_test = os.path.join(self.bin, 'dfs_test') - self.run_subtest() + self.run_subtest('dfs_test') diff --git a/src/tests/ftest/daos_test/dfuse.py b/src/tests/ftest/daos_test/dfuse.py index 322ff407fe3..20d0e160a3c 100644 --- a/src/tests/ftest/daos_test/dfuse.py +++ b/src/tests/ftest/daos_test/dfuse.py @@ -8,7 +8,7 @@ from collections import OrderedDict from apricot import TestWithServers -from cmocka_utils import CmockaUtils +from cmocka_utils import CmockaUtils, get_cmocka_command from dfuse_utils import get_dfuse, start_dfuse from file_utils import create_directory from general_utils import get_log_file @@ -31,8 +31,6 @@ def run_test(self, il_lib=None): if il_lib is None: self.fail('il_lib is not defined.') - daos_test = os.path.join(self.bin, 'dfuse_test') - # Create a pool, container and start dfuse. 
pool = self.get_pool(connect=False) container = self.get_container(pool) @@ -105,8 +103,7 @@ def run_test(self, il_lib=None): daos_test_env['D_IL_MAX_EQ'] = '2' daos_test_env['D_IL_NO_BYPASS'] = '1' - command = [ - daos_test, + parameters = [ '--test-dir', mount_dir, '--io', @@ -117,7 +114,7 @@ def run_test(self, il_lib=None): '--cache' ] if use_dfuse: - command.append('--lowfd') + parameters.append('--lowfd') else: # make D_IL_MOUNT_POINT different from mount_dir so it tests a non-DAOS filesystem dummy_dir = '/tmp/dummy' @@ -126,9 +123,10 @@ def run_test(self, il_lib=None): self.fail(f"Error creating {dummy_dir} on {result.failed_hosts}") daos_test_env['D_IL_MOUNT_POINT'] = dummy_dir if cache_mode != 'writeback': - command.append('--metadata') + parameters.append('--metadata') - job = get_job_manager(self, "Clush", cmocka_utils.get_cmocka_command(" ".join(command))) + job = get_job_manager( + self, "Clush", get_cmocka_command(self.bin, 'dfuse_test', " ".join(parameters))) job.assign_hosts(cmocka_utils.hosts) job.assign_environment(daos_test_env) diff --git a/src/tests/ftest/daos_test/suite.yaml b/src/tests/ftest/daos_test/suite.yaml index 8ea3980b9c2..3f8572f2bcf 100644 --- a/src/tests/ftest/daos_test/suite.yaml +++ b/src/tests/ftest/daos_test/suite.yaml @@ -8,7 +8,7 @@ hosts: timeout: 600 timeouts: test_daos_degraded_mode: 450 - test_daos_management: 30 + test_daos_management: 110 test_daos_pool: 180 test_daos_container: 700 test_daos_epoch: 125 diff --git a/src/tests/ftest/harness/basic.py b/src/tests/ftest/harness/basic.py index 49759f4be09..cdc583119f2 100644 --- a/src/tests/ftest/harness/basic.py +++ b/src/tests/ftest/harness/basic.py @@ -6,7 +6,7 @@ import os from apricot import TestWithoutServers -from cmocka_utils import CmockaUtils +from cmocka_utils import CmockaUtils, get_cmocka_command from command_utils import SubProcessCommand from exception_utils import CommandFailure from job_manager_utils import Mpirun, Orterun @@ -141,30 +141,58 @@ def 
test_no_cmocka_xml(self): self.log.info(" This should generate a cmocka xml file with a 'Missing file' error") name = "no_cmocka_xml_file_test" cmocka_utils = CmockaUtils(None, name, self.outputdir, self.test_dir, self.log) - command = cmocka_utils.get_cmocka_command("hostname") + command = get_cmocka_command("", "hostname") cmocka_utils.run_cmocka_test(self, command) + self._verify_no_cmocka_xml(name) + self.log.info("Test passed") + + def test_no_cmocka_xml_timeout(self): + """Test to verify CmockaUtils handles timed out process correctly. + + If working correctly this test should fail due to a test timeout and a missing cmocka file. + + :avocado: tags=all + :avocado: tags=vm + :avocado: tags=harness,harness_cmocka,failure_expected + :avocado: tags=HarnessBasicTest,test_no_cmocka_xml_timeout + """ + self.log.info("=" * 80) + self.log.info("Running the 'sleep 30' command via CmockaUtils") + self.log.info(" This should generate a test timeout failure") + self.log.info(" This should generate a cmocka xml file with a 'Missing file' error") + name = "no_cmocka_xml_file_timeout_test" + cmocka_utils = CmockaUtils(None, name, self.outputdir, self.test_dir, self.log) + command = get_cmocka_command("", "sleep", "60") + try: + cmocka_utils.run_cmocka_test(self, command) + finally: + self._verify_no_cmocka_xml(name) + self.fail("Test did not timeout") + def _verify_no_cmocka_xml(self, name): + """Verify a cmocka xml file was generated with the expected error. 
+ + Args: + name (str): name of the cmocka test + """ # Verify a generated cmocka xml file exists - expected = os.path.join(self.outputdir, "{}_cmocka_results.xml".format(name)) + expected = os.path.join(self.outputdir, f"{name}_cmocka_results.xml") self.log.info("Verifying the existence of the generated cmocka file: %s", expected) if not os.path.isfile(expected): - self.fail("No {} file found".format(expected)) + self.fail(f"No {expected} file found") # Verify the generated cmocka xml file contains the expected error self.log.info("Verifying contents of the generated cmocka file: %s", expected) with open(expected, "r", encoding="utf-8") as file_handle: actual_contents = file_handle.readlines() - error_message = "Missing cmocka results for hostname in {}".format(self.outputdir) + error_message = f"Missing cmocka results for hostname in {self.outputdir}" expected_lines = [ - "".format(error_message) + f"" ] for index, actual_line in enumerate(actual_contents[1:4]): self.log.debug(" expecting: %s", expected_lines[index]) self.log.debug(" in actual: %s", actual_line[:-1].strip()) if expected_lines[index] not in actual_line: - self.fail("Badly formed {} file".format(expected)) - - self.log.info("Test passed") + self.fail(f"Badly formed {expected} file") diff --git a/src/tests/ftest/util/cmocka_utils.py b/src/tests/ftest/util/cmocka_utils.py index 4efa3c2177f..8c3a14e99c3 100644 --- a/src/tests/ftest/util/cmocka_utils.py +++ b/src/tests/ftest/util/cmocka_utils.py @@ -4,16 +4,55 @@ SPDX-License-Identifier: BSD-2-Clause-Patent """ import os -import time from agent_utils import include_local_host from command_utils import ExecutableCommand -from command_utils_base import EnvironmentVariables +from command_utils_base import BasicParameter, EnvironmentVariables from exception_utils import CommandFailure from results_utils import Job, Results, TestName, TestResult, create_xml from run_utils import get_clush_command, run_local, run_remote +def get_cmocka_command(path, 
executable, parameters=None): + """Get an ExecutableCommand representing the provided command string. + + Adds detection of any bad keywords in the command output that, if found, will result in a + command failure. + + Args: + path (str): the command path to use to create the CmockaCommand + executable (str): the command name to use to create the CmockaCommand + parameters (str): parameters to use to create the CmockaCommand + + Returns: + ExecutableCommand: the object setup to run the command + + """ + keywords = ["Process received signal", "stack smashing detected", "End of error message", + "buffer overflow detected"] + command = CmockaCommand(path, executable, keywords) + command.parameters.value = parameters + return command + + +class CmockaCommand(ExecutableCommand): + """Defines a object representing a daos command.""" + + def __init__(self, path, executable, keywords): + """Create a CmockaCommand object. + + Args: + path (str): the command path. This is excluded from the search when the process is + killed. + executable (str): the command executable. Also the string used to search for the process + when it is killed. + keywords (list): list of words used to mark the command as failed if any are found in + the command output. Defaults to None. + """ + super().__init__(None, executable, path, check_results=keywords) + self.parameters = BasicParameter(None) + + class CmockaUtils(): """Utilities for running test that generate cmocka xml results.""" @@ -77,25 +116,7 @@ def get_cmocka_env(self): "CMOCKA_MESSAGE_OUTPUT": "xml", }) - @staticmethod - def get_cmocka_command(command): - """Get an ExecutableCommand representing the provided command string. - - Adds detection of any bad keywords in the command output that, if found, will result in a - command failure. 
- - Args: - command (str): the command string to use to create the ExecutableCommand - - Returns: - ExecutableCommand: the object setup to run the command - - """ - keywords = ["Process received signal", "stack smashing detected", "End of error message", - "buffer overflow detected"] - return ExecutableCommand(namespace=None, command=command, check_results=keywords) - - def run_cmocka_test(self, test, command): + def run_cmocka_test(self, test, command='daos_test'): """Run the cmocka test command. After the command completes, copy any remote cmocka results that may exist back to this host @@ -109,22 +130,20 @@ def run_cmocka_test(self, test, command): error_message = None error_exception = None try: - time.sleep(50) # Debug command.run() except CommandFailure as error: - error_message = "Error detected running {}".format(job_command) + error_message = f"Error detected running {job_command}" error_exception = error test.log.exception(error_message) test.fail(error_message) finally: - run_remote(test.log, self.hosts, "ps -ejH") + run_remote(test.log, self.hosts, "ps -ejH") # TODO: remove debug self._collect_cmocka_results(test) if not self._check_cmocka_files(): if error_message is None: - error_message = "Missing cmocka results for {} in {}".format( - job_command, self.cmocka_dir) + error_message = f"Missing cmocka results for {job_command} in {self.cmocka_dir}" self._generate_cmocka_files(test, error_message, error_exception) def _collect_cmocka_results(self, test): @@ -190,7 +209,7 @@ def _generate_cmocka_files(self, test, error_message, error_exception): test_result.traceback = error_exception test_result.time_elapsed = 0 - cmocka_xml = os.path.join(self.outputdir, "{}_cmocka_results.xml".format(self.test_name)) + cmocka_xml = os.path.join(self.outputdir, f"{self.test_name}_cmocka_results.xml") job = Job(self.test_name, xml_output=cmocka_xml) result = Results(test.logfile) result.tests.append(test_result) diff --git a/src/tests/ftest/util/daos_core_base.py 
b/src/tests/ftest/util/daos_core_base.py index f5eee477822..a1a6f6e2554 100644 --- a/src/tests/ftest/util/daos_core_base.py +++ b/src/tests/ftest/util/daos_core_base.py @@ -8,7 +8,7 @@ import shutil from apricot import TestWithServers -from cmocka_utils import CmockaUtils +from cmocka_utils import CmockaUtils, get_cmocka_command from general_utils import get_log_file from job_manager_utils import get_job_manager from test_utils_pool import POOL_TIMEOUT_INCREMENT @@ -51,8 +51,17 @@ def get_test_param(self, name, default=None): path = "/".join(["/run/daos_tests", name, "*"]) return self.params.get(self.get_test_name(), path, default) - def run_subtest(self): - """Run daos_test with a subtest argument.""" + def run_subtest(self, executable='daos_test', path=None): + """Run the executable with a subtest argument. + + Args: + executable (str, optional): name of the executable. Defaults to 'daos_test'. + path (str, optional): path for the executable. Defaults to self.bin. + """ + if path is None: + # path=None yields the default self.bin path; path="" yields no path + path = self.bin + subtest = self.get_test_param("daos_test") num_clients = self.get_test_param("num_clients") if num_clients is None: @@ -81,8 +90,8 @@ def run_subtest(self): daos_test_env["COVFILE"] = "/tmp/test.cov" daos_test_env["POOL_SCM_SIZE"] = str(scm_size) daos_test_env["POOL_NVME_SIZE"] = str(nvme_size) - daos_test_cmd = cmocka_utils.get_cmocka_command( - " ".join([self.daos_test, "-n", dmg_config_file, "".join(["-", subtest]), str(args)])) + daos_test_cmd = get_cmocka_command( + path, executable, f"-n {dmg_config_file} -{subtest} {str(args)}") job = get_job_manager(self, "Orterun", daos_test_cmd, mpi_type="openmpi") job.assign_hosts(cmocka_utils.hosts, self.workdir, None) job.assign_processes(num_clients) From 456f2561984e3b9800c84f27dceb8c3e71ccb3c6 Mon Sep 17 00:00:00 2001 From: Phil Henderson Date: Wed, 30 Oct 2024 14:34:00 -0400 Subject: [PATCH 06/22] Updates. 
Skip-unit-tests: true Skip-fault-injection-test: true Test-tag: DaosCoreTestDfs DaosCoreTestDfuse harness_cmocka test_daos_management Allow-unstable-test: true Required-githooks: true Signed-off-by: Phil Henderson --- src/tests/ftest/harness/basic.py | 15 +++++++++------ src/tests/ftest/util/apricot/apricot/test.py | 2 -- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/src/tests/ftest/harness/basic.py b/src/tests/ftest/harness/basic.py index cdc583119f2..3c8a45f1549 100644 --- a/src/tests/ftest/harness/basic.py +++ b/src/tests/ftest/harness/basic.py @@ -9,7 +9,7 @@ from cmocka_utils import CmockaUtils, get_cmocka_command from command_utils import SubProcessCommand from exception_utils import CommandFailure -from job_manager_utils import Mpirun, Orterun +from job_manager_utils import JobManager, Mpirun, Orterun class HarnessBasicTest(TestWithoutServers): @@ -143,7 +143,7 @@ def test_no_cmocka_xml(self): cmocka_utils = CmockaUtils(None, name, self.outputdir, self.test_dir, self.log) command = get_cmocka_command("", "hostname") cmocka_utils.run_cmocka_test(self, command) - self._verify_no_cmocka_xml(name) + self._verify_no_cmocka_xml(name, str(command)) self.log.info("Test passed") def test_no_cmocka_xml_timeout(self): @@ -163,17 +163,20 @@ def test_no_cmocka_xml_timeout(self): name = "no_cmocka_xml_file_timeout_test" cmocka_utils = CmockaUtils(None, name, self.outputdir, self.test_dir, self.log) command = get_cmocka_command("", "sleep", "60") + job = JobManager("/run/job_manager/time/*", "time", command) + job.register_cleanup_method = self.register_cleanup try: - cmocka_utils.run_cmocka_test(self, command) + cmocka_utils.run_cmocka_test(self, job) finally: - self._verify_no_cmocka_xml(name) + self._verify_no_cmocka_xml(name, str(job)) self.fail("Test did not timeout") - def _verify_no_cmocka_xml(self, name): + def _verify_no_cmocka_xml(self, name, command): """Verify a cmocka xml file was generated with the expected error. 
Args: name (str): name of the cmocka test + command (str): command for the cmocka test """ # Verify a generated cmocka xml file exists expected = os.path.join(self.outputdir, f"{name}_cmocka_results.xml") @@ -185,7 +188,7 @@ def _verify_no_cmocka_xml(self, name): self.log.info("Verifying contents of the generated cmocka file: %s", expected) with open(expected, "r", encoding="utf-8") as file_handle: actual_contents = file_handle.readlines() - error_message = f"Missing cmocka results for hostname in {self.outputdir}" + error_message = f"Missing cmocka results for {command} in {self.outputdir}" expected_lines = [ f" Date: Wed, 30 Oct 2024 14:42:09 -0400 Subject: [PATCH 07/22] Cleanup. Skip-unit-tests: true Skip-fault-injection-test: true Test-tag: DaosCoreTestDfs DaosCoreTestDfuse harness_cmocka test_daos_management Allow-unstable-test: true Required-githooks: true Signed-off-by: Phil Henderson --- src/tests/ftest/util/cmocka_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/tests/ftest/util/cmocka_utils.py b/src/tests/ftest/util/cmocka_utils.py index 8c3a14e99c3..b0561eabeaa 100644 --- a/src/tests/ftest/util/cmocka_utils.py +++ b/src/tests/ftest/util/cmocka_utils.py @@ -36,7 +36,7 @@ def get_cmocka_command(path, executable, parameters=None): class CmockaCommand(ExecutableCommand): - """Defines a object representing a daos command.""" + """Defines a object representing a cmocka test command.""" def __init__(self, path, executable, keywords): """Create a CmockaCommand object. @@ -116,7 +116,7 @@ def get_cmocka_env(self): "CMOCKA_MESSAGE_OUTPUT": "xml", }) - def run_cmocka_test(self, test, command='daos_test'): + def run_cmocka_test(self, test, command): """Run the cmocka test command. 
After the command completes, copy any remote cmocka results that may exist back to this host From e496214879b6dfd3190b8b85946768c1888318f0 Mon Sep 17 00:00:00 2001 From: Phil Henderson Date: Wed, 30 Oct 2024 15:46:12 -0400 Subject: [PATCH 08/22] Ensure registered tearDown steps are run in TestWithoutServers Skip-unit-tests: true Skip-fault-injection-test: true Test-tag: DaosCoreTestDfs DaosCoreTestDfuse harness_cmocka test_daos_management MultiEnginesPerSocketTest FaultDomain Allow-unstable-test: true Required-githooks: true Signed-off-by: Phil Henderson --- src/tests/ftest/util/apricot/apricot/test.py | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/src/tests/ftest/util/apricot/apricot/test.py b/src/tests/ftest/util/apricot/apricot/test.py index f401b263ddd..614a0d99fbd 100644 --- a/src/tests/ftest/util/apricot/apricot/test.py +++ b/src/tests/ftest/util/apricot/apricot/test.py @@ -476,6 +476,9 @@ def tearDown(self): self.report_timeout() super().tearDown() + # Execute any tear down steps in the reverse order of which they were registered. + self._teardown_errors.extend(self._cleanup()) + # Clean up any temporary files self._teardown_errors.extend(self.remove_temp_test_dir()) @@ -892,6 +895,7 @@ def start_agents(self, agent_groups=None, force=False): self.setup_agents(agent_groups) if self.agent_managers: self.start_agent_managers(force) + self.register_cleanup(self.stop_agents) def start_servers(self, server_groups=None, force=False): """Start the daos_server processes. 
@@ -915,6 +919,7 @@ def start_servers(self, server_groups=None, force=False): self.setup_servers(server_groups) if self.server_managers: force_agent_start = self.start_server_managers(force) + self.register_cleanup(self.stop_servers) return force_agent_start def restart_servers(self): @@ -1402,16 +1407,6 @@ def tearDown(self): # Tear down any test-specific items self._teardown_errors = self.pre_tear_down() - # Destroy any job managers, containers, pools, and dfuse instances next - # Eventually this call will encompass all teardown steps - self._teardown_errors.extend(self._cleanup()) - - # Stop the agents - self._teardown_errors.extend(self.stop_agents()) - - # Stop the servers - self._teardown_errors.extend(self.stop_servers()) - super().tearDown() def pre_tear_down(self): From 9cada0e3464408c58cadb19357a3f5918e989199 Mon Sep 17 00:00:00 2001 From: Phil Henderson Date: Wed, 30 Oct 2024 15:49:25 -0400 Subject: [PATCH 09/22] Add missing modified file. Skip-unit-tests: true Skip-fault-injection-test: true Test-tag: DaosCoreTestDfs DaosCoreTestDfuse harness_cmocka test_daos_management MultiEnginesPerSocketTest FaultDomain Allow-unstable-test: true Required-githooks: true Signed-off-by: Phil Henderson --- src/tests/ftest/harness/basic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tests/ftest/harness/basic.py b/src/tests/ftest/harness/basic.py index 3c8a45f1549..3e04e0f6007 100644 --- a/src/tests/ftest/harness/basic.py +++ b/src/tests/ftest/harness/basic.py @@ -168,7 +168,7 @@ def test_no_cmocka_xml_timeout(self): try: cmocka_utils.run_cmocka_test(self, job) finally: - self._verify_no_cmocka_xml(name, str(job)) + self._verify_no_cmocka_xml(name, str(command)) self.fail("Test did not timeout") def _verify_no_cmocka_xml(self, name, command): From 0c3493f7d951e42bfb9b23473303b3eed986ccba Mon Sep 17 00:00:00 2001 From: Phil Henderson Date: Wed, 30 Oct 2024 17:57:19 -0400 Subject: [PATCH 10/22] Define hosts for cleanup. 
Skip-unit-tests: true Skip-fault-injection-test: true Test-tag: DaosCoreTestDfs DaosCoreTestDfuse harness_cmocka test_daos_management MultiEnginesPerSocketTest FaultDomain Allow-unstable-test: true Required-githooks: true Signed-off-by: Phil Henderson --- src/tests/ftest/harness/basic.py | 2 + src/tests/ftest/util/job_manager_utils.py | 98 +++++++++++------------ 2 files changed, 51 insertions(+), 49 deletions(-) diff --git a/src/tests/ftest/harness/basic.py b/src/tests/ftest/harness/basic.py index 3e04e0f6007..3443d6a7364 100644 --- a/src/tests/ftest/harness/basic.py +++ b/src/tests/ftest/harness/basic.py @@ -9,6 +9,7 @@ from cmocka_utils import CmockaUtils, get_cmocka_command from command_utils import SubProcessCommand from exception_utils import CommandFailure +from host_utils import get_local_host from job_manager_utils import JobManager, Mpirun, Orterun @@ -164,6 +165,7 @@ def test_no_cmocka_xml_timeout(self): cmocka_utils = CmockaUtils(None, name, self.outputdir, self.test_dir, self.log) command = get_cmocka_command("", "sleep", "60") job = JobManager("/run/job_manager/time/*", "time", command) + job.assign_hosts(get_local_host()) job.register_cleanup_method = self.register_cleanup try: cmocka_utils.run_cmocka_test(self, job) diff --git a/src/tests/ftest/util/job_manager_utils.py b/src/tests/ftest/util/job_manager_utils.py index a3e7d671e9d..3b1bdfc2689 100644 --- a/src/tests/ftest/util/job_manager_utils.py +++ b/src/tests/ftest/util/job_manager_utils.py @@ -177,7 +177,7 @@ def check_subprocess_status(self, sub_process): """ return self.job.check_subprocess_status(sub_process) - def assign_hosts(self, hosts, path=None, slots=None, hostfile=True): + def assign_hosts(self, *args, **kwargs): """Assign the hosts to use with the command. Set the appropriate command line parameter with the specified value. 
@@ -191,6 +191,36 @@ def assign_hosts(self, hosts, path=None, slots=None, hostfile=True): hostfile (bool, optional): whether or not to also update any host related command parameters to keep them in sync with the hosts. Defaults to True. """ + self._set_hosts(*args, **kwargs) + + def _set_hosts(self, hosts): + """Assign hosts. + + Args: + hosts (NodeSet): hosts to specify on the command line + """ + self._hosts = hosts.copy() + + def _setup_hostfile(self, path=None, slots=None, hostfile=True): + """Setup the hostfile to use with the command. + + Args: + path (str, optional): path to use when specifying the hosts through + a hostfile. Defaults to None. + slots (int, optional): number of slots per host to specify in the + optional hostfile. Defaults to None. + hostfile (bool, optional): whether or not to also update any host related command + parameters to keep them in sync with the hosts. Defaults to True. + + Returns: + str: the full path of the written hostfile; None if one is not written + """ + if not hostfile: + return None + kwargs = {"hosts": self._hosts, "slots": slots} + if path is not None: + kwargs["path"] = path + return write_host_file(**kwargs) def assign_processes(self, processes): """Assign the number of processes. @@ -367,7 +397,7 @@ def __init__(self, job, subprocess=False, mpi_type="openmpi"): self.bind_to = FormattedParameter("--bind-to {}", None) self.mpi_type = mpi_type - def assign_hosts(self, hosts, path=None, slots=None, hostfile=True): + def assign_hosts(self, *args, **kwargs): """Assign the hosts to use with the command (--hostfile). Args: @@ -378,13 +408,8 @@ def assign_hosts(self, hosts, path=None, slots=None, hostfile=True): hostfile (bool, optional): whether or not to also update any host related command parameters to keep them in sync with the hosts. Defaults to True. 
""" - self._hosts = hosts.copy() - if not hostfile: - return - kwargs = {"hosts": self._hosts, "slots": slots} - if path is not None: - kwargs["path"] = path - self.hostfile.value = write_host_file(**kwargs) + super().assign_hosts(*args, **kwargs) + self.hostfile.value = self._setup_hostfile(*args, **kwargs) def assign_processes(self, processes): """Assign the number of processes (-np). @@ -486,7 +511,7 @@ def __init__(self, job, subprocess=False, mpi_type="openmpi"): self.args = BasicParameter(None, None) self.mpi_type = mpi_type - def assign_hosts(self, hosts, path=None, slots=None, hostfile=True): + def assign_hosts(self, *args, **kwargs): """Assign the hosts to use with the command (-f). Args: @@ -497,13 +522,8 @@ def assign_hosts(self, hosts, path=None, slots=None, hostfile=True): hostfile (bool, optional): whether or not to also update any host related command parameters to keep them in sync with the hosts. Defaults to True. """ - self._hosts = hosts.copy() - if not hostfile: - return - kwargs = {"hosts": self._hosts, "slots": slots} - if path is not None: - kwargs["path"] = path - self.hostfile.value = write_host_file(**kwargs) + super().assign_hosts(*args, **kwargs) + self.hostfile.value = self._setup_hostfile(*args, **kwargs) def assign_processes(self, processes=None, ppn=None): """Assign the number of processes (-np) and processes per node (-ppn). @@ -590,7 +610,7 @@ def __init__(self, job, path="", subprocess=False): self.partition = FormattedParameter("--partition={}", None) self.output = FormattedParameter("--output={}", None) - def assign_hosts(self, hosts, path=None, slots=None, hostfile=True): + def assign_hosts(self, *args, **kwargs): """Assign the hosts to use with the command (-f). Args: @@ -601,13 +621,17 @@ def assign_hosts(self, hosts, path=None, slots=None, hostfile=True): hostfile (bool, optional): whether or not to also update any host related command parameters to keep them in sync with the hosts. Defaults to True. 
""" - self._hosts = hosts.copy() - if not hostfile: - return - kwargs = {"hosts": self._hosts, "slots": None} - if path is not None: - kwargs["path"] = path - self.nodefile.value = write_host_file(**kwargs) + super().assign_hosts(*args, **kwargs) + self.nodefile.value = self._setup_hostfile(*args, **kwargs) + self._set_ntasks_per_node(*args, **kwargs) + + def _set_ntasks_per_node(self, slots=None): + """Assign the ntasks_per_node value. + + Args: + slots (int, optional): number of slots per host to specify in the + hostfile. Defaults to None. + """ self.ntasks_per_node.value = slots def assign_processes(self, processes): @@ -768,19 +792,6 @@ def check_subprocess_status(self, sub_process): self.job.pattern, self.timestamps["start"], None, self.job.pattern_count, self.job.pattern_timeout.value) - def assign_hosts(self, hosts, path=None, slots=None, hostfile=True): - """Assign the hosts to use with the command. - - Set the appropriate command line parameter with the specified value. - - Args: - hosts (NodeSet): hosts to specify on the command line - path (str, optional): not used. Defaults to None. - slots (int, optional): not used. Defaults to None. - hostfile (bool, optional): not used. Defaults to True. - """ - self._hosts = hosts.copy() - def assign_environment(self, env_vars, append=False): """Assign or add environment variables to the command. @@ -1225,17 +1236,6 @@ def __str__(self): commands = [super().__str__(), "-w {}".format(self.hosts), str(self.job)] return " ".join(commands) - def assign_hosts(self, hosts, path=None, slots=None, hostfile=True): - """Assign the hosts to use with the command (--hostfile). - - Args: - hosts (NodeSet): hosts to specify in the hostfile - path (str, optional): not used. Defaults to None. - slots (int, optional): not used. Defaults to None. - hostfile (bool, optional): not used. Defaults to True. 
- """ - self._hosts = hosts.copy() - def assign_environment(self, env_vars, append=False): """Assign or add environment variables to the command. From 34fc71b2dec728ce50b7a28d0131956176c5293c Mon Sep 17 00:00:00 2001 From: Phil Henderson Date: Wed, 30 Oct 2024 19:04:13 -0400 Subject: [PATCH 11/22] Updates. Skip-unit-tests: true Skip-fault-injection-test: true Test-tag: DaosCoreTestDfs DaosCoreTestDfuse harness_cmocka test_daos_management MultiEnginesPerSocketTest FaultDomain Allow-unstable-test: true Required-githooks: true Signed-off-by: Phil Henderson --- src/tests/ftest/util/cmocka_utils.py | 2 +- src/tests/ftest/util/job_manager_utils.py | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/tests/ftest/util/cmocka_utils.py b/src/tests/ftest/util/cmocka_utils.py index b0561eabeaa..63a92e64e04 100644 --- a/src/tests/ftest/util/cmocka_utils.py +++ b/src/tests/ftest/util/cmocka_utils.py @@ -139,7 +139,7 @@ def run_cmocka_test(self, test, command): test.fail(error_message) finally: - run_remote(test.log, self.hosts, "ps -ejH") # TODO: remove debug + run_remote(test.log, self.hosts, "ps -jH") self._collect_cmocka_results(test) if not self._check_cmocka_files(): if error_message is None: diff --git a/src/tests/ftest/util/job_manager_utils.py b/src/tests/ftest/util/job_manager_utils.py index 3b1bdfc2689..8f15fdf4398 100644 --- a/src/tests/ftest/util/job_manager_utils.py +++ b/src/tests/ftest/util/job_manager_utils.py @@ -153,7 +153,7 @@ def command_regex(self): str: regular expression to use to search for the command """ # pylint: disable=protected-access - return "'({})'".format("|".join(self._exe_names + self.job._exe_names)) + return f"'({'|'.join(self._exe_names + self.job._exe_names)})'" def __str__(self): """Return the command with all of its defined parameters as a string. 
@@ -338,19 +338,19 @@ def kill(self): """Forcibly terminate any job processes running on hosts.""" if not self.job: return - regex = self.job.command_regex - detected, running = stop_processes(self.log, self._hosts, regex) + detected, running = stop_processes(self.log, self._hosts, self.command_regex) if not detected: self.log.info( - "No remote %s processes killed on %s (none found), done.", regex, self._hosts) + "No remote %s processes killed on %s (none found), done.", + self.command_regex, self._hosts) elif running: self.log.info( "***Unable to kill remote %s process on %s! Please investigate/report.***", - regex, running) + self.command_regex, running) else: self.log.info( "***At least one remote %s process needed to be killed on %s! Please investigate/" - "report.***", regex, detected) + "report.***", self.command_regex, detected) class Orterun(JobManager): From 4b58ad74d91c195e7f15e2671cff2b58df8ab7d9 Mon Sep 17 00:00:00 2001 From: Phil Henderson Date: Wed, 30 Oct 2024 19:21:53 -0400 Subject: [PATCH 12/22] Fix assign hosts. Skip-unit-tests: true Skip-fault-injection-test: true Test-tag: DaosCoreTestDfs DaosCoreTestDfuse harness_cmocka test_daos_management MultiEnginesPerSocketTest FaultDomain Allow-unstable-test: true Required-githooks: true Signed-off-by: Phil Henderson --- src/tests/ftest/util/job_manager_utils.py | 37 ++++++----------------- 1 file changed, 10 insertions(+), 27 deletions(-) diff --git a/src/tests/ftest/util/job_manager_utils.py b/src/tests/ftest/util/job_manager_utils.py index 8f15fdf4398..d702432ffca 100644 --- a/src/tests/ftest/util/job_manager_utils.py +++ b/src/tests/ftest/util/job_manager_utils.py @@ -177,7 +177,7 @@ def check_subprocess_status(self, sub_process): """ return self.job.check_subprocess_status(sub_process) - def assign_hosts(self, *args, **kwargs): + def assign_hosts(self, hosts, path=None, slots=None, hostfile=True): """Assign the hosts to use with the command. 
Set the appropriate command line parameter with the specified value. @@ -191,14 +191,6 @@ def assign_hosts(self, *args, **kwargs): hostfile (bool, optional): whether or not to also update any host related command parameters to keep them in sync with the hosts. Defaults to True. """ - self._set_hosts(*args, **kwargs) - - def _set_hosts(self, hosts): - """Assign hosts. - - Args: - hosts (NodeSet): hosts to specify on the command line - """ self._hosts = hosts.copy() def _setup_hostfile(self, path=None, slots=None, hostfile=True): @@ -397,7 +389,7 @@ def __init__(self, job, subprocess=False, mpi_type="openmpi"): self.bind_to = FormattedParameter("--bind-to {}", None) self.mpi_type = mpi_type - def assign_hosts(self, *args, **kwargs): + def assign_hosts(self, hosts, path=None, slots=None, hostfile=True): """Assign the hosts to use with the command (--hostfile). Args: @@ -408,8 +400,8 @@ def assign_hosts(self, *args, **kwargs): hostfile (bool, optional): whether or not to also update any host related command parameters to keep them in sync with the hosts. Defaults to True. """ - super().assign_hosts(*args, **kwargs) - self.hostfile.value = self._setup_hostfile(*args, **kwargs) + super().assign_hosts(hosts, path, slots, hostfile) + self.hostfile.value = self._setup_hostfile(path, slots, hostfile) def assign_processes(self, processes): """Assign the number of processes (-np). @@ -511,7 +503,7 @@ def __init__(self, job, subprocess=False, mpi_type="openmpi"): self.args = BasicParameter(None, None) self.mpi_type = mpi_type - def assign_hosts(self, *args, **kwargs): + def assign_hosts(self, hosts, path=None, slots=None, hostfile=True): """Assign the hosts to use with the command (-f). Args: @@ -522,8 +514,8 @@ def assign_hosts(self, *args, **kwargs): hostfile (bool, optional): whether or not to also update any host related command parameters to keep them in sync with the hosts. Defaults to True. 
""" - super().assign_hosts(*args, **kwargs) - self.hostfile.value = self._setup_hostfile(*args, **kwargs) + super().assign_hosts(hosts, path, slots, hostfile) + self.hostfile.value = self._setup_hostfile(path, slots, hostfile) def assign_processes(self, processes=None, ppn=None): """Assign the number of processes (-np) and processes per node (-ppn). @@ -610,7 +602,7 @@ def __init__(self, job, path="", subprocess=False): self.partition = FormattedParameter("--partition={}", None) self.output = FormattedParameter("--output={}", None) - def assign_hosts(self, *args, **kwargs): + def assign_hosts(self, hosts, path=None, slots=None, hostfile=True): """Assign the hosts to use with the command (-f). Args: @@ -621,17 +613,8 @@ def assign_hosts(self, *args, **kwargs): hostfile (bool, optional): whether or not to also update any host related command parameters to keep them in sync with the hosts. Defaults to True. """ - super().assign_hosts(*args, **kwargs) - self.nodefile.value = self._setup_hostfile(*args, **kwargs) - self._set_ntasks_per_node(*args, **kwargs) - - def _set_ntasks_per_node(self, slots=None): - """Assign the ntasks_per_node value. - - Args: - slots (int, optional): number of slots per host to specify in the - hostfile. Defaults to None. - """ + super().assign_hosts(hosts, path, slots, hostfile) + self.nodefile.value = self._setup_hostfile(path, slots, hostfile) self.ntasks_per_node.value = slots def assign_processes(self, processes): From 28c779af7ee02612cb8f57295e28014786e9c62d Mon Sep 17 00:00:00 2001 From: Phil Henderson Date: Thu, 31 Oct 2024 10:05:58 -0400 Subject: [PATCH 13/22] Kill manager after job. 
Skip-unit-tests: true Skip-fault-injection-test: true Test-tag: DaosCoreTestDfs DaosCoreTestDfuse harness_cmocka test_daos_management MultiEnginesPerSocketTest FaultDomain Allow-unstable-test: true Required-githooks: true Signed-off-by: Phil Henderson --- src/tests/ftest/util/cmocka_utils.py | 4 ++- src/tests/ftest/util/job_manager_utils.py | 33 +++++++++++------------ 2 files changed, 19 insertions(+), 18 deletions(-) diff --git a/src/tests/ftest/util/cmocka_utils.py b/src/tests/ftest/util/cmocka_utils.py index 63a92e64e04..cb47e33ef1d 100644 --- a/src/tests/ftest/util/cmocka_utils.py +++ b/src/tests/ftest/util/cmocka_utils.py @@ -139,7 +139,9 @@ def run_cmocka_test(self, test, command): test.fail(error_message) finally: - run_remote(test.log, self.hosts, "ps -jH") + if test.status is not None and test.status != 'PASS' and test.status != 'SKIP': + test.log.debug("Currently running processes for non-passing test:") + run_remote(test.log, self.hosts, "ps -jH") self._collect_cmocka_results(test) if not self._check_cmocka_files(): if error_message is None: diff --git a/src/tests/ftest/util/job_manager_utils.py b/src/tests/ftest/util/job_manager_utils.py index d702432ffca..272cba915b9 100644 --- a/src/tests/ftest/util/job_manager_utils.py +++ b/src/tests/ftest/util/job_manager_utils.py @@ -143,18 +143,6 @@ def job(self, value): and self._job.check_results_list): self.check_results_list.extend(self._job.check_results_list) - @property - def command_regex(self): - """Get the regular expression to use to search for the command. - - Typical use would include combining with pgrep to verify a subprocess is running. - - Returns: - str: regular expression to use to search for the command - """ - # pylint: disable=protected-access - return f"'({'|'.join(self._exe_names + self.job._exe_names)})'" - def __str__(self): """Return the command with all of its defined parameters as a string. 
@@ -330,19 +318,30 @@ def kill(self): """Forcibly terminate any job processes running on hosts.""" if not self.job: return - detected, running = stop_processes(self.log, self._hosts, self.command_regex) + # Kill the job command + self._kill_process(self.job.command_regex) + time.sleep(5) + # Kill the manager command + self._kill_process(self.command_regex) + + def _kill_process(self, pattern): + """Forcibly terminate the specified process. + + Args: + pattern (str): regular expression used to find process names to stop + """ + detected, running = stop_processes(self.log, self._hosts, pattern) if not detected: self.log.info( - "No remote %s processes killed on %s (none found), done.", - self.command_regex, self._hosts) + "No remote %s processes killed on %s (none found), done.", pattern, self._hosts) elif running: self.log.info( "***Unable to kill remote %s process on %s! Please investigate/report.***", - self.command_regex, running) + pattern, running) else: self.log.info( "***At least one remote %s process needed to be killed on %s! Please investigate/" - "report.***", self.command_regex, detected) + "report.***", pattern, detected) class Orterun(JobManager): From 04594cbc17e7ff3a444c1834c47ca16bdb72bbb0 Mon Sep 17 00:00:00 2001 From: Phil Henderson Date: Thu, 31 Oct 2024 16:59:40 -0400 Subject: [PATCH 14/22] Don't stop the systemctl command. 
Skip-unit-tests: true Skip-fault-injection-test: true Test-tag: DaosCoreTestDfs DaosCoreTestDfuse harness_cmocka test_daos_management MultiEnginesPerSocketTest FaultDomain Allow-unstable-test: true Required-githooks: true Signed-off-by: Phil Henderson --- src/tests/ftest/harness/basic.py | 9 ++++----- src/tests/ftest/harness/basic.yaml | 8 ++++++++ src/tests/ftest/util/job_manager_utils.py | 17 +++++++++++++---- 3 files changed, 25 insertions(+), 9 deletions(-) diff --git a/src/tests/ftest/harness/basic.py b/src/tests/ftest/harness/basic.py index 3443d6a7364..41443de59ea 100644 --- a/src/tests/ftest/harness/basic.py +++ b/src/tests/ftest/harness/basic.py @@ -10,7 +10,7 @@ from command_utils import SubProcessCommand from exception_utils import CommandFailure from host_utils import get_local_host -from job_manager_utils import JobManager, Mpirun, Orterun +from job_manager_utils import Mpirun, Orterun, get_job_manager class HarnessBasicTest(TestWithoutServers): @@ -164,11 +164,10 @@ def test_no_cmocka_xml_timeout(self): name = "no_cmocka_xml_file_timeout_test" cmocka_utils = CmockaUtils(None, name, self.outputdir, self.test_dir, self.log) command = get_cmocka_command("", "sleep", "60") - job = JobManager("/run/job_manager/time/*", "time", command) - job.assign_hosts(get_local_host()) - job.register_cleanup_method = self.register_cleanup + manager = get_job_manager(self, job=command) + manager.assign_hosts(get_local_host()) try: - cmocka_utils.run_cmocka_test(self, job) + cmocka_utils.run_cmocka_test(self, manager) finally: self._verify_no_cmocka_xml(name, str(command)) self.fail("Test did not timeout") diff --git a/src/tests/ftest/harness/basic.yaml b/src/tests/ftest/harness/basic.yaml index be8aad8fd10..050cb1b9961 100644 --- a/src/tests/ftest/harness/basic.yaml +++ b/src/tests/ftest/harness/basic.yaml @@ -1 +1,9 @@ timeout: 10 + +job_manager: !mux + manager_1: + class_name: Clush + manager_2: + class_name: Orterun + manager_3: + class_name: Mpirun diff --git 
a/src/tests/ftest/util/job_manager_utils.py b/src/tests/ftest/util/job_manager_utils.py index 272cba915b9..42f087ab630 100644 --- a/src/tests/ftest/util/job_manager_utils.py +++ b/src/tests/ftest/util/job_manager_utils.py @@ -179,6 +179,7 @@ def assign_hosts(self, hosts, path=None, slots=None, hostfile=True): hostfile (bool, optional): whether or not to also update any host related command parameters to keep them in sync with the hosts. Defaults to True. """ + # pylint: disable=unused-argument self._hosts = hosts.copy() def _setup_hostfile(self, path=None, slots=None, hostfile=True): @@ -318,11 +319,7 @@ def kill(self): """Forcibly terminate any job processes running on hosts.""" if not self.job: return - # Kill the job command self._kill_process(self.job.command_regex) - time.sleep(5) - # Kill the manager command - self._kill_process(self.command_regex) def _kill_process(self, pattern): """Forcibly terminate the specified process. @@ -458,6 +455,12 @@ def run(self, raise_exception=None): return super().run(raise_exception) + def kill(self): + """Forcibly terminate any job processes running on hosts.""" + super().kill() + time.sleep(1) + self._kill_process(self.command_regex) + class Mpirun(JobManager): """A class for the mpirun job manager command.""" @@ -572,6 +575,12 @@ def run(self, raise_exception=None): return super().run(raise_exception) + def kill(self): + """Forcibly terminate any job processes running on hosts.""" + super().kill() + time.sleep(1) + self._kill_process(self.command_regex) + class Srun(JobManager): """A class for the srun job manager command.""" From cd45c235abd4da8f42402bf14076cc89a8f4ed51 Mon Sep 17 00:00:00 2001 From: Phil Henderson Date: Fri, 1 Nov 2024 17:19:34 -0400 Subject: [PATCH 15/22] Fix basic tests. 
Skip-unit-tests: true Skip-fault-injection-test: true Test-tag: DaosCoreTestDfs DaosCoreTestDfuse harness_cmocka test_daos_management MultiEnginesPerSocketTest FaultDomain Allow-unstable-test: true Required-githooks: true Signed-off-by: Phil Henderson --- src/tests/ftest/harness/basic.py | 84 ++--------------- src/tests/ftest/harness/basic.yaml | 8 -- src/tests/ftest/harness/cmocka.py | 135 ++++++++++++++++++++++++++++ src/tests/ftest/harness/cmocka.yaml | 1 + 4 files changed, 141 insertions(+), 87 deletions(-) create mode 100644 src/tests/ftest/harness/cmocka.py create mode 100644 src/tests/ftest/harness/cmocka.yaml diff --git a/src/tests/ftest/harness/basic.py b/src/tests/ftest/harness/basic.py index 41443de59ea..0ff102e2074 100644 --- a/src/tests/ftest/harness/basic.py +++ b/src/tests/ftest/harness/basic.py @@ -6,11 +6,9 @@ import os from apricot import TestWithoutServers -from cmocka_utils import CmockaUtils, get_cmocka_command from command_utils import SubProcessCommand from exception_utils import CommandFailure -from host_utils import get_local_host -from job_manager_utils import Mpirun, Orterun, get_job_manager +from job_manager_utils import Mpirun, Orterun class HarnessBasicTest(TestWithoutServers): @@ -80,12 +78,14 @@ def test_load_mpi(self): try: Orterun(None) except CommandFailure as error: - self.fail("Orterun initialization failed: {}".format(error)) + self.log.error("Orterun initialization failed: %s", error) + self.fail("Orterun initialization failed") try: Mpirun(None, mpi_type="mpich") except CommandFailure as error: - self.fail("Mpirun initialization failed: {}".format(error)) + self.log.error("Mpirun initialization failed: %s", error) + self.fail("Mpirun initialization failed") def test_load_mpi_hw(self): """Simple test of apricot test code to load the openmpi module. 
@@ -126,77 +126,3 @@ def test_sub_process_command(self): if failed: self.fail("The '{}' command failed".format(command)) self.log.info("Test passed") - - def test_no_cmocka_xml(self): - """Test to verify CmockaUtils detects lack of cmocka file generation. - - If working correctly this test should fail due to a missing cmocka file. - - :avocado: tags=all - :avocado: tags=vm - :avocado: tags=harness,harness_cmocka,failure_expected - :avocado: tags=HarnessBasicTest,test_no_cmocka_xml - """ - self.log.info("=" * 80) - self.log.info("Running the 'hostname' command via CmockaUtils") - self.log.info(" This should generate a cmocka xml file with a 'Missing file' error") - name = "no_cmocka_xml_file_test" - cmocka_utils = CmockaUtils(None, name, self.outputdir, self.test_dir, self.log) - command = get_cmocka_command("", "hostname") - cmocka_utils.run_cmocka_test(self, command) - self._verify_no_cmocka_xml(name, str(command)) - self.log.info("Test passed") - - def test_no_cmocka_xml_timeout(self): - """Test to verify CmockaUtils handles timed out process correctly. - - If working correctly this test should fail due to a test timeout and a missing cmocka file. 
- - :avocado: tags=all - :avocado: tags=vm - :avocado: tags=harness,harness_cmocka,failure_expected - :avocado: tags=HarnessBasicTest,test_no_cmocka_xml_timeout - """ - self.log.info("=" * 80) - self.log.info("Running the 'sleep 30' command via CmockaUtils") - self.log.info(" This should generate a test timeout failure") - self.log.info(" This should generate a cmocka xml file with a 'Missing file' error") - name = "no_cmocka_xml_file_timeout_test" - cmocka_utils = CmockaUtils(None, name, self.outputdir, self.test_dir, self.log) - command = get_cmocka_command("", "sleep", "60") - manager = get_job_manager(self, job=command) - manager.assign_hosts(get_local_host()) - try: - cmocka_utils.run_cmocka_test(self, manager) - finally: - self._verify_no_cmocka_xml(name, str(command)) - self.fail("Test did not timeout") - - def _verify_no_cmocka_xml(self, name, command): - """Verify a cmocka xml file was generated with the expected error. - - Args: - name (str): name of the cmocka test - command (str): command for the cmocka test - """ - # Verify a generated cmocka xml file exists - expected = os.path.join(self.outputdir, f"{name}_cmocka_results.xml") - self.log.info("Verifying the existence of the generated cmocka file: %s", expected) - if not os.path.isfile(expected): - self.fail(f"No {expected} file found") - - # Verify the generated cmocka xml file contains the expected error - self.log.info("Verifying contents of the generated cmocka file: %s", expected) - with open(expected, "r", encoding="utf-8") as file_handle: - actual_contents = file_handle.readlines() - error_message = f"Missing cmocka results for {command} in {self.outputdir}" - expected_lines = [ - f"" - ] - for index, actual_line in enumerate(actual_contents[1:4]): - self.log.debug(" expecting: %s", expected_lines[index]) - self.log.debug(" in actual: %s", actual_line[:-1].strip()) - if expected_lines[index] not in actual_line: - self.fail(f"Badly formed {expected} file") diff --git 
a/src/tests/ftest/harness/basic.yaml b/src/tests/ftest/harness/basic.yaml index 050cb1b9961..be8aad8fd10 100644 --- a/src/tests/ftest/harness/basic.yaml +++ b/src/tests/ftest/harness/basic.yaml @@ -1,9 +1 @@ timeout: 10 - -job_manager: !mux - manager_1: - class_name: Clush - manager_2: - class_name: Orterun - manager_3: - class_name: Mpirun diff --git a/src/tests/ftest/harness/cmocka.py b/src/tests/ftest/harness/cmocka.py new file mode 100644 index 00000000000..50f51a9abaa --- /dev/null +++ b/src/tests/ftest/harness/cmocka.py @@ -0,0 +1,135 @@ +""" + (C) Copyright 2022-2024 Intel Corporation. + + SPDX-License-Identifier: BSD-2-Clause-Patent +""" +import os + +from apricot import TestWithoutServers +from cmocka_utils import CmockaUtils, get_cmocka_command +from host_utils import get_local_host +from job_manager_utils import get_job_manager + + +class HarnessCmockaTest(TestWithoutServers): + """Cmocka harness test cases. + + :avocado: recursive + """ + + def test_no_cmocka_xml(self): + """Test to verify CmockaUtils detects lack of cmocka file generation. + + If working correctly this test should fail due to a missing cmocka file. + + :avocado: tags=all + :avocado: tags=vm + :avocado: tags=harness,harness_cmocka,failure_expected + :avocado: tags=HarnessCmockaTest,test_no_cmocka_xml + """ + self._run_cmocka_test(get_cmocka_command("", "hostname"), False, True) + self.log.info("Test passed") + + def test_clush_manager_timeout(self): + """Test to verify CmockaUtils handles timed out process correctly. + + If working correctly this test should fail due to a test timeout and a missing cmocka file. 
+ + :avocado: tags=all + :avocado: tags=vm + :avocado: tags=harness,harness_cmocka,failure_expected + :avocado: tags=HarnessCmockaTest,test_clush_manager_timeout + """ + self._run_cmocka_test(self._get_manager_command("Clush", "sleep", "60"), True, True) + self.fail("Test did not timeout") + + def test_orterun_manager_timeout(self): + """Test to verify CmockaUtils handles timed out process correctly. + + If working correctly this test should fail due to a test timeout and a missing cmocka file. + + :avocado: tags=all + :avocado: tags=vm + :avocado: tags=harness,harness_cmocka,failure_expected + :avocado: tags=HarnessCmockaTest,test_orterun_manager_timeout + """ + self._run_cmocka_test(self._get_manager_command("Orterun", "sleep", "60"), True, True) + self.fail("Test did not timeout") + + def test_mpirun_manager_timeout(self): + """Test to verify CmockaUtils handles timed out process correctly. + + If working correctly this test should fail due to a test timeout and a missing cmocka file. + + :avocado: tags=all + :avocado: tags=vm + :avocado: tags=harness,harness_cmocka,failure_expected + :avocado: tags=HarnessCmockaTest,test_mpirun_manager_timeout + """ + self._run_cmocka_test(self._get_manager_command("Mpirun", "sleep", "60"), True, True) + self.fail("Test did not timeout") + + def _run_cmocka_test(self, command, timeout, missing): + """Run the cmocka test case. 
+ + Args: + command (ExecutableCommand): the command to run + timeout (bool): is the test expected to timeout + missing (bool): is the test expected to be missing a cmocka result + """ + self.log.info("Running the '%s' command via CmockaUtils", str(command)) + if timeout: + self.log.info(" This should generate a test timeout failure") + if missing: + self.log.info(" This should generate a cmocka xml file with a 'Missing file' error") + + cmocka_utils = CmockaUtils(None, self.test_id, self.outputdir, self.test_dir, self.log) + try: + cmocka_utils.run_cmocka_test(self, command) + finally: + self._verify_no_cmocka_xml(self.test_id, str(command)) + + def _get_manager_command(self, class_name, executable, parameters): + """Get a JobManager command object. + + Args: + class_name (str): JobManager class name + executable (str): executable to be managed + parameters (str): parameters for the executable to be managed + + Returns: + JobManager: the requested JobManager class + """ + command = get_cmocka_command("", executable, parameters) + manager = get_job_manager(self, class_name, command) + manager.assign_hosts(get_local_host()) + return manager + + def _verify_no_cmocka_xml(self, name, command): + """Verify a cmocka xml file was generated with the expected error. 
+ + Args: + name (str): name of the cmocka test + command (str): command for the cmocka test + """ + # Verify a generated cmocka xml file exists + expected = os.path.join(self.outputdir, f"{name}_cmocka_results.xml") + self.log.info("Verifying the existence of the generated cmocka file: %s", expected) + if not os.path.isfile(expected): + self.fail(f"No {expected} file found") + + # Verify the generated cmocka xml file contains the expected error + self.log.info("Verifying contents of the generated cmocka file: %s", expected) + with open(expected, "r", encoding="utf-8") as file_handle: + actual_contents = file_handle.readlines() + error_message = f"Missing cmocka results for {command} in {self.outputdir}" + expected_lines = [ + f"" + ] + for index, actual_line in enumerate(actual_contents[1:4]): + self.log.debug(" expecting: %s", expected_lines[index]) + self.log.debug(" in actual: %s", actual_line[:-1].strip()) + if expected_lines[index] not in actual_line: + self.fail(f"Badly formed {expected} file") diff --git a/src/tests/ftest/harness/cmocka.yaml b/src/tests/ftest/harness/cmocka.yaml new file mode 100644 index 00000000000..be8aad8fd10 --- /dev/null +++ b/src/tests/ftest/harness/cmocka.yaml @@ -0,0 +1 @@ +timeout: 10 From 12cd18af9e1f9b13e5ee24c3e31f17a6980983ae Mon Sep 17 00:00:00 2001 From: Phil Henderson Date: Fri, 1 Nov 2024 21:42:58 -0400 Subject: [PATCH 16/22] Don't kill orterun/mpirun commands. 
Skip-unit-tests: true Skip-fault-injection-test: true Test-tag: DaosCoreTestDfs DaosCoreTestDfuse harness_cmocka test_daos_management MultiEnginesPerSocketTest FaultDomain Allow-unstable-test: true Required-githooks: true Signed-off-by: Phil Henderson --- src/tests/ftest/harness/cmocka.py | 9 ++++++--- src/tests/ftest/util/job_manager_utils.py | 17 +++++------------ 2 files changed, 11 insertions(+), 15 deletions(-) diff --git a/src/tests/ftest/harness/cmocka.py b/src/tests/ftest/harness/cmocka.py index 50f51a9abaa..53e43ff91dd 100644 --- a/src/tests/ftest/harness/cmocka.py +++ b/src/tests/ftest/harness/cmocka.py @@ -87,7 +87,7 @@ def _run_cmocka_test(self, command, timeout, missing): try: cmocka_utils.run_cmocka_test(self, command) finally: - self._verify_no_cmocka_xml(self.test_id, str(command)) + self._verify_no_cmocka_xml(self.test_id, command) def _get_manager_command(self, class_name, executable, parameters): """Get a JobManager command object. @@ -110,7 +110,7 @@ def _verify_no_cmocka_xml(self, name, command): Args: name (str): name of the cmocka test - command (str): command for the cmocka test + command (ExecutableCommand): command for the cmocka test """ # Verify a generated cmocka xml file exists expected = os.path.join(self.outputdir, f"{name}_cmocka_results.xml") @@ -122,7 +122,10 @@ def _verify_no_cmocka_xml(self, name, command): self.log.info("Verifying contents of the generated cmocka file: %s", expected) with open(expected, "r", encoding="utf-8") as file_handle: actual_contents = file_handle.readlines() - error_message = f"Missing cmocka results for {command} in {self.outputdir}" + if hasattr(command, "job"): + error_message = f"Missing cmocka results for {str(command.job)} in {self.outputdir}" + else: + error_message = f"Missing cmocka results for {str(command)} in {self.outputdir}" expected_lines = [ f" Date: Sat, 2 Nov 2024 01:24:17 -0400 Subject: [PATCH 17/22] Cleanup. 
Updated description: Fix stopping timed out processes run by a JobManager class by only searching for and killing the command executable being run by clush, orterun, mpirun, etc. Add a new harness/cmocka.py test to verify the stopping of the processes with a test timeout. Skip-unit-tests: true Skip-fault-injection-test: true Test-tag: pr daos_test dfuse_test test_load_mpi HarnessCmockaTest Allow-unstable-test: true Required-githooks: true Signed-off-by: Phil Henderson --- src/tests/ftest/daos_test/dfs.py | 8 +++++--- src/tests/ftest/daos_test/dfuse.py | 4 ++-- src/tests/ftest/harness/cmocka.py | 12 ++++++------ src/tests/ftest/util/cmocka_utils.py | 7 +++---- src/tests/ftest/util/daos_core_base.py | 13 +++++-------- 5 files changed, 21 insertions(+), 23 deletions(-) diff --git a/src/tests/ftest/daos_test/dfs.py b/src/tests/ftest/daos_test/dfs.py index 721012d8026..90b5c0799d6 100644 --- a/src/tests/ftest/daos_test/dfs.py +++ b/src/tests/ftest/daos_test/dfs.py @@ -4,6 +4,8 @@ SPDX-License-Identifier: BSD-2-Clause-Patent """ +import os + from daos_core_base import DaosCoreBase @@ -32,7 +34,7 @@ def test_daos_dfs_unit(self): :avocado: tags=daos_test,dfs_test,dfs :avocado: tags=DaosCoreTestDfs,test_daos_dfs_unit """ - self.run_subtest('dfs_test') + self.run_subtest(os.path.join(self.bin, "dfs_test")) def test_daos_dfs_parallel(self): """Jira ID: DAOS-5409. @@ -48,7 +50,7 @@ def test_daos_dfs_parallel(self): :avocado: tags=daos_test,dfs_test,dfs :avocado: tags=DaosCoreTestDfs,test_daos_dfs_parallel """ - self.run_subtest('dfs_test') + self.run_subtest(os.path.join(self.bin, "dfs_test")) def test_daos_dfs_sys(self): """Jira ID: DAOS-7759. 
@@ -64,4 +66,4 @@ def test_daos_dfs_sys(self): :avocado: tags=daos_test,dfs_test,dfs :avocado: tags=DaosCoreTestDfs,test_daos_dfs_sys """ - self.run_subtest('dfs_test') + self.run_subtest(os.path.join(self.bin, "dfs_test")) diff --git a/src/tests/ftest/daos_test/dfuse.py b/src/tests/ftest/daos_test/dfuse.py index 20d0e160a3c..86b3e383142 100644 --- a/src/tests/ftest/daos_test/dfuse.py +++ b/src/tests/ftest/daos_test/dfuse.py @@ -103,6 +103,7 @@ def run_test(self, il_lib=None): daos_test_env['D_IL_MAX_EQ'] = '2' daos_test_env['D_IL_NO_BYPASS'] = '1' + command = os.path.join(self.bin, 'dfuse_test') parameters = [ '--test-dir', mount_dir, @@ -125,8 +126,7 @@ def run_test(self, il_lib=None): if cache_mode != 'writeback': parameters.append('--metadata') - job = get_job_manager( - self, "Clush", get_cmocka_command(self.bin, 'dfuse_test', " ".join(parameters))) + job = get_job_manager(self, "Clush", get_cmocka_command(command, ' '.join(parameters))) job.assign_hosts(cmocka_utils.hosts) job.assign_environment(daos_test_env) diff --git a/src/tests/ftest/harness/cmocka.py b/src/tests/ftest/harness/cmocka.py index 53e43ff91dd..adc482132f2 100644 --- a/src/tests/ftest/harness/cmocka.py +++ b/src/tests/ftest/harness/cmocka.py @@ -24,10 +24,10 @@ def test_no_cmocka_xml(self): :avocado: tags=all :avocado: tags=vm - :avocado: tags=harness,harness_cmocka,failure_expected + :avocado: tags=harness,failure_expected :avocado: tags=HarnessCmockaTest,test_no_cmocka_xml """ - self._run_cmocka_test(get_cmocka_command("", "hostname"), False, True) + self._run_cmocka_test(get_cmocka_command("hostname"), False, True) self.log.info("Test passed") def test_clush_manager_timeout(self): @@ -37,7 +37,7 @@ def test_clush_manager_timeout(self): :avocado: tags=all :avocado: tags=vm - :avocado: tags=harness,harness_cmocka,failure_expected + :avocado: tags=harness,failure_expected :avocado: tags=HarnessCmockaTest,test_clush_manager_timeout """ self._run_cmocka_test(self._get_manager_command("Clush", 
"sleep", "60"), True, True) @@ -50,7 +50,7 @@ def test_orterun_manager_timeout(self): :avocado: tags=all :avocado: tags=vm - :avocado: tags=harness,harness_cmocka,failure_expected + :avocado: tags=harness,failure_expected :avocado: tags=HarnessCmockaTest,test_orterun_manager_timeout """ self._run_cmocka_test(self._get_manager_command("Orterun", "sleep", "60"), True, True) @@ -63,7 +63,7 @@ def test_mpirun_manager_timeout(self): :avocado: tags=all :avocado: tags=vm - :avocado: tags=harness,harness_cmocka,failure_expected + :avocado: tags=harness,failure_expected :avocado: tags=HarnessCmockaTest,test_mpirun_manager_timeout """ self._run_cmocka_test(self._get_manager_command("Mpirun", "sleep", "60"), True, True) @@ -100,7 +100,7 @@ def _get_manager_command(self, class_name, executable, parameters): Returns: JobManager: the requested JobManager class """ - command = get_cmocka_command("", executable, parameters) + command = get_cmocka_command(executable, parameters) manager = get_job_manager(self, class_name, command) manager.assign_hosts(get_local_host()) return manager diff --git a/src/tests/ftest/util/cmocka_utils.py b/src/tests/ftest/util/cmocka_utils.py index cb47e33ef1d..c21cb366213 100644 --- a/src/tests/ftest/util/cmocka_utils.py +++ b/src/tests/ftest/util/cmocka_utils.py @@ -13,23 +13,22 @@ from run_utils import get_clush_command, run_local, run_remote -def get_cmocka_command(path, executable, parameters=None): +def get_cmocka_command(command, parameters=None): """Get an ExecutableCommand representing the provided command string. Adds detection of any bad keywords in the command output that, if found, will result in a command failure. 
Args: - path (str): the command path to use to create the CmockaCommand - executable (str): the command name to use to create the CmockaCommand + command (str): the command path to use to create the CmockaCommand parameters (str): parameters to use to create the CmockaCommand Returns: ExecutableCommand: the object setup to run the command - """ keywords = ["Process received signal", "stack smashing detected", "End of error message", "buffer overflow detected"] + path, executable = os.path.split(command) command = CmockaCommand(path, executable, keywords) command.parameters.value = parameters return command diff --git a/src/tests/ftest/util/daos_core_base.py b/src/tests/ftest/util/daos_core_base.py index a1a6f6e2554..1b0ad42fecc 100644 --- a/src/tests/ftest/util/daos_core_base.py +++ b/src/tests/ftest/util/daos_core_base.py @@ -51,16 +51,14 @@ def get_test_param(self, name, default=None): path = "/".join(["/run/daos_tests", name, "*"]) return self.params.get(self.get_test_name(), path, default) - def run_subtest(self, executable='daos_test', path=None): + def run_subtest(self, command=None): """Run the executable with a subtest argument. Args: - executable (str, optional): name of the executable. Defaults to 'daos_test'. - path (str, optional): path for the executable. Defaults to self.bin. + command (str, optional): command to run. Defaults to None which will yield daos_test. 
""" - if path is None: - # path=None yields the default self.bin path; path="" yields no path - path = self.bin + if command is None: + command = os.path.join(self.bin, "daos_test") subtest = self.get_test_param("daos_test") num_clients = self.get_test_param("num_clients") @@ -90,8 +88,7 @@ def run_subtest(self, executable='daos_test', path=None): daos_test_env["COVFILE"] = "/tmp/test.cov" daos_test_env["POOL_SCM_SIZE"] = str(scm_size) daos_test_env["POOL_NVME_SIZE"] = str(nvme_size) - daos_test_cmd = get_cmocka_command( - path, executable, f"-n {dmg_config_file} -{subtest} {str(args)}") + daos_test_cmd = get_cmocka_command(command, f"-n {dmg_config_file} -{subtest} {str(args)}") job = get_job_manager(self, "Orterun", daos_test_cmd, mpi_type="openmpi") job.assign_hosts(cmocka_utils.hosts, self.workdir, None) job.assign_processes(num_clients) From 6d17d3dfec65753cdadbfb89c7a25c60a5a974fb Mon Sep 17 00:00:00 2001 From: Phil Henderson Date: Wed, 6 Nov 2024 18:13:17 -0500 Subject: [PATCH 18/22] Add check for stopping pid 1. Skip-unit-tests: true Skip-fault-injection-test: true Test-tag: pr daos_test dfuse_test test_load_mpi HarnessCmockaTest Allow-unstable-test: true Required-githooks: true Signed-off-by: Phil Henderson --- src/tests/ftest/util/cmocka_utils.py | 1 + src/tests/ftest/util/command_utils.py | 10 +++++++--- src/tests/ftest/util/job_manager_utils.py | 17 +++++------------ src/tests/ftest/util/run_utils.py | 8 ++++++++ 4 files changed, 21 insertions(+), 15 deletions(-) diff --git a/src/tests/ftest/util/cmocka_utils.py b/src/tests/ftest/util/cmocka_utils.py index c21cb366213..6a9f0441938 100644 --- a/src/tests/ftest/util/cmocka_utils.py +++ b/src/tests/ftest/util/cmocka_utils.py @@ -49,6 +49,7 @@ def __init__(self, path, executable, keywords): the command output. Defaults to None. 
""" super().__init__(None, executable, path, check_results=keywords) + self.full_command_regex = True self.parameters = BasicParameter(None) diff --git a/src/tests/ftest/util/command_utils.py b/src/tests/ftest/util/command_utils.py index 56ceb4c7fe3..2c9c335b1c9 100644 --- a/src/tests/ftest/util/command_utils.py +++ b/src/tests/ftest/util/command_utils.py @@ -72,6 +72,9 @@ def __init__(self, namespace, command, path="", subprocess=False, check_results= # used to check on the progress or terminate the command. self._exe_names = [self.command] + # If set use the full command string when returning the 'command_regex' property + self.full_command_regex = False + # Define an attribute to store the CmdResult from the last run() call. # A CmdResult object has the following properties: # command - command string @@ -132,10 +135,11 @@ def command_regex(self): Typical use would include combining with pgrep to verify a subprocess is running. Returns: - str: regular expression to use to search for the command - + str: regular expression to use to search for the command, typically with pgrep or pkill """ - return "'({})'".format("|".join(self._exe_names)) + if self.full_command_regex: + return f"--full '{str(self)}'" + return f"'({'|'.join(self._exe_names)})'" @property def with_bind(self): diff --git a/src/tests/ftest/util/job_manager_utils.py b/src/tests/ftest/util/job_manager_utils.py index 3da91fa8ca2..8ca10a8c47e 100644 --- a/src/tests/ftest/util/job_manager_utils.py +++ b/src/tests/ftest/util/job_manager_utils.py @@ -319,26 +319,19 @@ def kill(self): """Forcibly terminate any job processes running on hosts.""" if not self.job: return - self._kill_process(self.job.command_regex) - - def _kill_process(self, pattern): - """Forcibly terminate the specified process. 
- - Args: - pattern (str): regular expression used to find process names to stop - """ - detected, running = stop_processes(self.log, self._hosts, pattern) + regex = self.job.command_regex + detected, running = stop_processes(self.log, self._hosts, regex) if not detected: self.log.info( - "No remote %s processes killed on %s (none found), done.", pattern, self._hosts) + "No remote %s processes killed on %s (none found), done.", regex, self._hosts) elif running: self.log.info( "***Unable to kill remote %s process on %s! Please investigate/report.***", - pattern, running) + regex, running) else: self.log.info( "***At least one remote %s process needed to be killed on %s! Please investigate/" - "report.***", pattern, detected) + "report.***", regex, detected) class Orterun(JobManager): diff --git a/src/tests/ftest/util/run_utils.py b/src/tests/ftest/util/run_utils.py index f4893558fb0..6e946ad3ddc 100644 --- a/src/tests/ftest/util/run_utils.py +++ b/src/tests/ftest/util/run_utils.py @@ -4,6 +4,7 @@ SPDX-License-Identifier: BSD-2-Clause-Patent """ import os +import re import subprocess # nosec import time from getpass import getuser @@ -543,6 +544,9 @@ def stop_processes(log, hosts, pattern, verbose=True, timeout=60, exclude=None, processes. Defaults to False which will attempt to kill w/o a signal, then with the ABRT signal, and finally with the KILL signal. + Raises: + ValueError: if the pattern ends up matching process 1. + Returns: tuple: (NodeSet, NodeSet) where the first NodeSet indicates on which hosts processes matching the pattern were initially detected and the second NodeSet indicates on which @@ -564,6 +568,10 @@ def stop_processes(log, hosts, pattern, verbose=True, timeout=60, exclude=None, log.debug("No processes found on %s that match %s", result.failed_hosts, pattern_match) return processes_detected, processes_running + # Catch any attempt to kill process 1. 
+ if "1" in re.findall(r"^(\d+)\s+", result.joined_stdout, re.MULTILINE): + raise ValueError(f"Attempting to kill process 1 as a match for {pattern}!") + # Indicate on which hosts processes matching the pattern were found running in the return status processes_detected.add(result.passed_hosts) From a6ff5d274c87c361d99605b5a55e9a4c253f5d7a Mon Sep 17 00:00:00 2001 From: Phil Henderson Date: Fri, 8 Nov 2024 16:02:06 -0500 Subject: [PATCH 19/22] Only match exact full commands if requested in stop_processes. Skip-unit-tests: true Skip-fault-injection-test: true Test-tag: pr daos_test dfuse_test test_load_mpi HarnessCmockaTest Allow-unstable-test: true Required-githooks: true Signed-off-by: Phil Henderson --- src/tests/ftest/util/command_utils.py | 7 +++---- src/tests/ftest/util/job_manager_utils.py | 5 ++++- src/tests/ftest/util/run_utils.py | 17 ++++++++++++----- 3 files changed, 19 insertions(+), 10 deletions(-) diff --git a/src/tests/ftest/util/command_utils.py b/src/tests/ftest/util/command_utils.py index 2c9c335b1c9..dbed7ac3c44 100644 --- a/src/tests/ftest/util/command_utils.py +++ b/src/tests/ftest/util/command_utils.py @@ -135,11 +135,10 @@ def command_regex(self): Typical use would include combining with pgrep to verify a subprocess is running. 
Returns: - str: regular expression to use to search for the command, typically with pgrep or pkill + str: regular expression to use to search for the command + """ - if self.full_command_regex: - return f"--full '{str(self)}'" - return f"'({'|'.join(self._exe_names)})'" + return "'({})'".format("|".join(self._exe_names)) @property def with_bind(self): diff --git a/src/tests/ftest/util/job_manager_utils.py b/src/tests/ftest/util/job_manager_utils.py index 8ca10a8c47e..932a9f7a306 100644 --- a/src/tests/ftest/util/job_manager_utils.py +++ b/src/tests/ftest/util/job_manager_utils.py @@ -320,7 +320,10 @@ def kill(self): if not self.job: return regex = self.job.command_regex - detected, running = stop_processes(self.log, self._hosts, regex) + if self.job.full_command_regex: + regex = f"'{str(self.job)}'" + detected, running = stop_processes( + self.log, self._hosts, regex, full_command=self.job.full_command_regex) if not detected: self.log.info( "No remote %s processes killed on %s (none found), done.", regex, self._hosts) diff --git a/src/tests/ftest/util/run_utils.py b/src/tests/ftest/util/run_utils.py index 6e946ad3ddc..8e96e2228f0 100644 --- a/src/tests/ftest/util/run_utils.py +++ b/src/tests/ftest/util/run_utils.py @@ -530,7 +530,8 @@ def find_command(source, pattern, depth, other=None): return " ".join(command) -def stop_processes(log, hosts, pattern, verbose=True, timeout=60, exclude=None, force=False): +def stop_processes(log, hosts, pattern, verbose=True, timeout=60, exclude=None, force=False, + full_command=False): """Stop the processes on each hosts that match the pattern. Args: @@ -543,6 +544,8 @@ def stop_processes(log, hosts, pattern, verbose=True, timeout=60, exclude=None, force (bool, optional): if set use the KILL signal to immediately stop any running processes. Defaults to False which will attempt to kill w/o a signal, then with the ABRT signal, and finally with the KILL signal. 
+ full_command (bool, optional): if set match the pattern using the full command with + pgrep/pkill. Defaults to False. Raises: ValueError: if the pattern ends up matching process 1. @@ -555,15 +558,17 @@ def stop_processes(log, hosts, pattern, verbose=True, timeout=60, exclude=None, """ processes_detected = NodeSet() processes_running = NodeSet() - command = f"/usr/bin/pgrep --list-full {pattern}" + search_command = f"/usr/bin/pgrep --list-full {pattern}" pattern_match = str(pattern) if exclude: - command = f"/usr/bin/ps xa | grep -E {pattern} | grep -vE {exclude}" + search_command = f"/usr/bin/ps xa | grep -E {pattern} | grep -vE {exclude}" pattern_match += " and doesn't match " + str(exclude) + elif full_command: + search_command = f"/usr/bin/pgrep --list-full --full -x {pattern}" # Search for any active processes log.debug("Searching for any processes on %s that match %s", hosts, pattern_match) - result = run_remote(log, hosts, command, verbose, timeout) + result = run_remote(log, hosts, search_command, verbose, timeout) if not result.passed_hosts: log.debug("No processes found on %s that match %s", result.failed_hosts, pattern_match) return processes_detected, processes_running @@ -588,9 +593,11 @@ def stop_processes(log, hosts, pattern, verbose=True, timeout=60, exclude=None, "Killing%s any processes on %s that match %s and then waiting %s seconds", step[0], result.passed_hosts, pattern_match, step[1]) kill_command = f"sudo /usr/bin/pkill{step[0]} {pattern}" + if full_command: + kill_command = f"sudo /usr/bin/pkill{step[0]} --full -x {pattern}" run_remote(log, result.passed_hosts, kill_command, verbose, timeout) time.sleep(step[1]) - result = run_remote(log, result.passed_hosts, command, verbose, timeout) + result = run_remote(log, result.passed_hosts, search_command, verbose, timeout) if not result.passed_hosts: # Indicate all running processes matching the pattern were stopped in the return status log.debug( From 2eaace2f1e33a7a616396223e794a8f30909730d Mon 
Sep 17 00:00:00 2001 From: Phil Henderson Date: Thu, 21 Nov 2024 14:55:35 -0500 Subject: [PATCH 20/22] Remove register cleanup calls for agent/server stop. Skip-unit-tests: true Skip-fault-injection-test: true Test-tag: pr daos_test dfuse_test test_load_mpi HarnessCmockaTest Allow-unstable-test: true Required-githooks: true Signed-off-by: Phil Henderson --- src/tests/ftest/harness/cmocka.py | 14 ++++++++++++-- src/tests/ftest/util/apricot/apricot/test.py | 15 ++++++++++----- 2 files changed, 22 insertions(+), 7 deletions(-) diff --git a/src/tests/ftest/harness/cmocka.py b/src/tests/ftest/harness/cmocka.py index adc482132f2..b4b3d8f0620 100644 --- a/src/tests/ftest/harness/cmocka.py +++ b/src/tests/ftest/harness/cmocka.py @@ -5,18 +5,28 @@ """ import os -from apricot import TestWithoutServers +from apricot import TestWithServers from cmocka_utils import CmockaUtils, get_cmocka_command from host_utils import get_local_host from job_manager_utils import get_job_manager -class HarnessCmockaTest(TestWithoutServers): +class HarnessCmockaTest(TestWithServers): """Cmocka harness test cases. + Inherit TestWithServers so that tearDown() will call self._cleanup(). + :avocado: recursive """ + def __init__(self, *args, **kwargs): + """Initialize a TestWithServers object.""" + super().__init__(*args, **kwargs) + + # Disable starting agents and servers + self.setup_start_agents = False + self.setup_start_servers = False + def test_no_cmocka_xml(self): """Test to verify CmockaUtils detects lack of cmocka file generation. diff --git a/src/tests/ftest/util/apricot/apricot/test.py b/src/tests/ftest/util/apricot/apricot/test.py index 614a0d99fbd..f401b263ddd 100644 --- a/src/tests/ftest/util/apricot/apricot/test.py +++ b/src/tests/ftest/util/apricot/apricot/test.py @@ -476,9 +476,6 @@ def tearDown(self): self.report_timeout() super().tearDown() - # Execute any tear down steps in the reverse order of which they were registered.
- self._teardown_errors.extend(self._cleanup()) - # Clean up any temporary files self._teardown_errors.extend(self.remove_temp_test_dir()) @@ -895,7 +892,6 @@ def start_agents(self, agent_groups=None, force=False): self.setup_agents(agent_groups) if self.agent_managers: self.start_agent_managers(force) - self.register_cleanup(self.stop_agents) def start_servers(self, server_groups=None, force=False): """Start the daos_server processes. @@ -919,7 +915,6 @@ def start_servers(self, server_groups=None, force=False): self.setup_servers(server_groups) if self.server_managers: force_agent_start = self.start_server_managers(force) - self.register_cleanup(self.stop_servers) return force_agent_start def restart_servers(self): @@ -1407,6 +1402,16 @@ def tearDown(self): # Tear down any test-specific items self._teardown_errors = self.pre_tear_down() + # Destroy any job managers, containers, pools, and dfuse instances next + # Eventually this call will encompass all teardown steps + self._teardown_errors.extend(self._cleanup()) + + # Stop the agents + self._teardown_errors.extend(self.stop_agents()) + + # Stop the servers + self._teardown_errors.extend(self.stop_servers()) + super().tearDown() def pre_tear_down(self): From 056b931b4883a638fec3b990526148b3aecd0fae Mon Sep 17 00:00:00 2001 From: Phil Henderson Date: Fri, 22 Nov 2024 09:41:00 -0500 Subject: [PATCH 21/22] Restore register cleanup for server/agent stop. Remove stopping agents when stopping servers as DAOS-6873 is resolved. 
Skip-unit-tests: true Skip-fault-injection-test: true Test-tag: pr daos_test dfuse_test test_load_mpi HarnessCmockaTest ConfigGenerateRun Allow-unstable-test: true Required-githooks: true Signed-off-by: Phil Henderson --- .../ftest/control/config_generate_run.py | 17 +++++--------- src/tests/ftest/harness/cmocka.py | 14 ++---------- src/tests/ftest/util/apricot/apricot/test.py | 22 +++++-------------- 3 files changed, 14 insertions(+), 39 deletions(-) diff --git a/src/tests/ftest/control/config_generate_run.py b/src/tests/ftest/control/config_generate_run.py index efaa992df31..0062cb3b769 100644 --- a/src/tests/ftest/control/config_generate_run.py +++ b/src/tests/ftest/control/config_generate_run.py @@ -52,6 +52,7 @@ def test_config_generate_run(self): control_metadata = os.path.join(self.test_env.log_dir, 'control_metadata') # Call dmg config generate. AP is always the first server host. + self.log_step("Generating server configuration") server_host = self.hostlist_servers[0] result = self.get_dmg_command().config_generate( access_points=server_host, num_engines=num_engines, scm_only=scm_only, @@ -66,25 +67,19 @@ def test_config_generate_run(self): # Stop and restart daos_server. self.start_server_managers() has the # server start-up check built into it, so if there's something wrong, # it'll throw an error. - self.log.info("Stopping servers") + self.log_step("Stopping servers") self.stop_servers() # Create a new server config from generated_yaml and update SCM-related # data in engine_params so that the cleanup before the server start # works. - self.log.info("Copy config to %s and update engine_params", self.test_env.server_config) + self.log_step(f"Copy config to {self.test_env.server_config} and update engine_params") self.server_managers[0].update_config_file_from_file(generated_yaml) # Start server with the generated config. 
- self.log.info("Restarting server with the generated config") + self.log_step("Restarting server with the generated config") try: - agent_force = self.start_server_managers(force=True) + self.start_server_managers(force=True) except ServerFailed as error: self.fail(f"Restarting server failed! {error}") - - # We don't need agent for this test. However, when we stop the server, - # agent is also stopped. Then the harness checks that the agent is - # running during the teardown. If agent isn't running at that point, it - # would cause an error, so start it here. - self.log.info("Restarting agents") - self.start_agent_managers(force=agent_force) + self.log.info("Test passed") diff --git a/src/tests/ftest/harness/cmocka.py b/src/tests/ftest/harness/cmocka.py index b4b3d8f0620..adc482132f2 100644 --- a/src/tests/ftest/harness/cmocka.py +++ b/src/tests/ftest/harness/cmocka.py @@ -5,28 +5,18 @@ """ import os -from apricot import TestWithServers +from apricot import TestWithoutServers from cmocka_utils import CmockaUtils, get_cmocka_command from host_utils import get_local_host from job_manager_utils import get_job_manager -class HarnessCmockaTest(TestWithServers): +class HarnessCmockaTest(TestWithoutServers): """Cmocka harness test cases. - Inherit TestWithServers so that tearDown() will call self._cleanup(). - :avocado: recursive """ - def __init__(self, *args, **kwargs): - """Initialize a TestWithServers object.""" - super().__init__(*args, **kwargs) - - # Disable starting agents and servers - self.setup_start_agents = False - self.setup_start_servers = False - def test_no_cmocka_xml(self): """Test to verify CmockaUtils detects lack of cmocka file generation. 
diff --git a/src/tests/ftest/util/apricot/apricot/test.py b/src/tests/ftest/util/apricot/apricot/test.py index f401b263ddd..bb4dba4f1e0 100644 --- a/src/tests/ftest/util/apricot/apricot/test.py +++ b/src/tests/ftest/util/apricot/apricot/test.py @@ -476,6 +476,9 @@ def tearDown(self): self.report_timeout() super().tearDown() + # Execute any tear down steps in the reverse order of which they were registered. + self._teardown_errors.extend(self._cleanup()) + # Clean up any temporary files self._teardown_errors.extend(self.remove_temp_test_dir()) @@ -892,6 +895,7 @@ def start_agents(self, agent_groups=None, force=False): self.setup_agents(agent_groups) if self.agent_managers: self.start_agent_managers(force) + self.register_cleanup(self.stop_agents) def start_servers(self, server_groups=None, force=False): """Start the daos_server processes. @@ -915,6 +919,7 @@ def start_servers(self, server_groups=None, force=False): self.setup_servers(server_groups) if self.server_managers: force_agent_start = self.start_server_managers(force) + self.register_cleanup(self.stop_servers) return force_agent_start def restart_servers(self): @@ -1394,7 +1399,7 @@ def tearDown(self): # class (see DAOS-1452/DAOS-9941 and Avocado issue #5217 with # associated PR-5224) if self.status is not None and self.status != 'PASS' and self.status != 'SKIP': - self.__dump_engines_stacks("Test status is {}".format(self.status)) + self.__dump_engines_stacks(f"Test status is {self.status}") # Report whether or not the timeout has expired self.report_timeout() @@ -1402,16 +1407,6 @@ def tearDown(self): # Tear down any test-specific items self._teardown_errors = self.pre_tear_down() - # Destroy any job managers, containers, pools, and dfuse instances next - # Eventually this call will encompass all teardown steps - self._teardown_errors.extend(self._cleanup()) - - # Stop the agents - self._teardown_errors.extend(self.stop_agents()) - - # Stop the servers - self._teardown_errors.extend(self.stop_servers()) - 
super().tearDown() def pre_tear_down(self): @@ -1611,11 +1606,6 @@ def stop_servers(self): "Stopping %s group(s) of servers", len(self.server_managers)) errors.extend(self._stop_managers(self.server_managers, "servers")) - # Stopping agents whenever servers are stopped for DAOS-6873 - self.log.info( - "Workaround for DAOS-6873: Stopping %s group(s) of agents", - len(self.agent_managers)) - errors.extend(self._stop_managers(self.agent_managers, "agents")) return errors def _stop_managers(self, managers, name): From 34711e410f18c92b636b7217913078a1f3e404ce Mon Sep 17 00:00:00 2001 From: Phil Henderson Date: Mon, 25 Nov 2024 18:18:49 -0500 Subject: [PATCH 22/22] Fix merge. Skip-unit-tests: true Skip-fault-injection-test: true Test-tag: pr daos_test dfuse_test test_load_mpi HarnessCmockaTest Allow-unstable-test: true Required-githooks: true Signed-off-by: Phil Henderson --- src/tests/ftest/control/config_generate_run.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/tests/ftest/control/config_generate_run.py b/src/tests/ftest/control/config_generate_run.py index 0062cb3b769..f73ae0f5c7e 100644 --- a/src/tests/ftest/control/config_generate_run.py +++ b/src/tests/ftest/control/config_generate_run.py @@ -82,4 +82,5 @@ def test_config_generate_run(self): self.start_server_managers(force=True) except ServerFailed as error: self.fail(f"Restarting server failed! {error}") + self.log.info("Test passed")