From f390961306303a4a28db46fa62d58a2e9076cea5 Mon Sep 17 00:00:00 2001 From: Dalton Bohning Date: Mon, 12 Aug 2024 08:49:45 -0700 Subject: [PATCH] DAOS-16217 test: Update run_local(). (#14748) (#14866) Update the current run_local() command to return an object similar to run_remote() to allow them to be used interchangeably. increase verify_perms.py timeout. Also include #14882 Use subprocess.run() for run_local() Signed-off-by: Dalton Bohning Co-authored-by: Phil Henderson --- src/tests/ftest/dfuse/pil4dfs_fio.py | 2 +- src/tests/ftest/harness/core_files.py | 11 +- src/tests/ftest/harness/unit.py | 261 +++++++++++--- src/tests/ftest/process_core_files.py | 62 ++-- .../ftest/server/multiengine_persocket.py | 5 +- src/tests/ftest/slurm_setup.py | 4 +- src/tests/ftest/util/agent_utils.py | 6 +- src/tests/ftest/util/collection_utils.py | 32 +- src/tests/ftest/util/dfuse_utils.py | 12 +- src/tests/ftest/util/fio_utils.py | 4 +- src/tests/ftest/util/general_utils.py | 11 +- src/tests/ftest/util/io_utilities.py | 2 +- src/tests/ftest/util/launch_utils.py | 100 +++--- src/tests/ftest/util/package_utils.py | 6 +- src/tests/ftest/util/run_utils.py | 333 ++++++++++-------- src/tests/ftest/util/server_utils.py | 16 +- src/tests/ftest/util/slurm_utils.py | 35 +- src/tests/ftest/util/soak_test_base.py | 30 +- src/tests/ftest/util/user_utils.py | 14 +- src/tests/ftest/verify_perms.py | 9 +- 20 files changed, 557 insertions(+), 398 deletions(-) diff --git a/src/tests/ftest/dfuse/pil4dfs_fio.py b/src/tests/ftest/dfuse/pil4dfs_fio.py index a5405fa3388..2aa3cd1b952 100644 --- a/src/tests/ftest/dfuse/pil4dfs_fio.py +++ b/src/tests/ftest/dfuse/pil4dfs_fio.py @@ -74,7 +74,7 @@ def _get_bandwidth(self, fio_result, rw): """Returns FIO bandwidth of a given I/O pattern Args: - fio_result (RemoteCommandResult): results of a FIO command. + fio_result (CommandResult): results of a FIO command. rw (str): Type of I/O pattern. Returns: diff --git a/src/tests/ftest/harness/core_files.py b/src/tests/ftest/harness/core_files.py index fed038a6a82..a017b8bba7a 100644 --- a/src/tests/ftest/harness/core_files.py +++ b/src/tests/ftest/harness/core_files.py @@ -1,5 +1,5 @@ """ - (C) Copyright 2021-2023 Intel Corporation. + (C) Copyright 2021-2024 Intel Corporation. SPDX-License-Identifier: BSD-2-Clause-Patent """ @@ -9,7 +9,7 @@ from apricot import TestWithServers from ClusterShell.NodeSet import NodeSet -from run_utils import RunException, run_local, run_remote +from run_utils import run_local, run_remote class HarnessCoreFilesTest(TestWithServers): @@ -40,11 +40,10 @@ def test_core_files(self): """ # create a core.gdb file self.log.debug("Create a core.gdb.harness.advanced file in core_pattern dir.") - try: - results = run_local(self.log, "cat /proc/sys/kernel/core_pattern", check=True) - except RunException: + result = run_local(self.log, "cat /proc/sys/kernel/core_pattern") + if not result.passed: self.fail("Unable to find local core file pattern") - core_path = os.path.split(results.stdout.splitlines()[-1])[0] + core_path = os.path.split(result.joined_stdout.splitlines()[-1])[0] core_file = "{}/core.gdb.harness.advanced".format(core_path) self.log.debug("Creating %s", core_file) diff --git a/src/tests/ftest/harness/unit.py b/src/tests/ftest/harness/unit.py index 1f4524674f6..bd6b108e64b 100644 --- a/src/tests/ftest/harness/unit.py +++ b/src/tests/ftest/harness/unit.py @@ -1,12 +1,13 @@ """ - (C) Copyright 2023 Intel Corporation. + (C) Copyright 2023-2024 Intel Corporation. SPDX-License-Identifier: BSD-2-Clause-Patent """ from apricot import TestWithoutServers from ClusterShell.NodeSet import NodeSet from data_utils import dict_extract_values, dict_subtract, list_flatten, list_stats, list_unique -from run_utils import ResultData, run_remote +from host_utils import get_local_host +from run_utils import ResultData, run_local, run_remote class HarnessUnitTest(TestWithoutServers): @@ -15,12 +16,13 @@ class HarnessUnitTest(TestWithoutServers): :avocado: recursive """ - def _verify_remote_command_result(self, result, passed, expected, timeout, homogeneous, - passed_hosts, failed_hosts, all_stdout, all_stderr): - """Verify a RemoteCommandResult object. + def _verify_command_result(self, result, passed, expected, timeout, homogeneous, passed_hosts, + failed_hosts, all_stdout, all_stderr, join_stdout, join_stderr): + # pylint: disable=too-many-arguments + """Verify a CommandResult object. Args: - result (RemoteCommandResult): object to verify + result (CommandResult): object to verify passed (bool): expected passed command state expected (list): expected list of ResultData objects timeout (bool): expected command timeout state @@ -29,26 +31,25 @@ def _verify_remote_command_result(self, result, passed, expected, timeout, homog failed_hosts (NodeSet): expected set of hosts on which the command failed all_stdout (dict): expected stdout str per host key all_stderr (dict): expected stderr str per host key + join_stdout (str): expected all stdout joined into one string + join_stderr (str): expected all stderr joined into one string """ - self.assertEqual(passed, result.passed, 'Incorrect RemoteCommandResult.passed') - self.assertEqual( - len(expected), len(result.output), 'Incorrect RemoteCommandResult.output count') + self.assertEqual(passed, result.passed, 'Incorrect CommandResult.passed') + self.assertEqual(len(expected), len(result.output), 'Incorrect CommandResult.output count') sorted_output = sorted(result.output) for index, expect in enumerate(sorted(expected)): actual = sorted_output[index] for key in ('command', 'returncode', 'hosts', 'stdout', 'stderr', 'timeout'): self.assertEqual( - getattr(expect, key), getattr(actual, key), - 'Incorrect ResultData.{}'.format(key)) - self.assertEqual(timeout, result.timeout, 'Incorrect RemoteCommandResult.timeout') - self.assertEqual( - homogeneous, result.homogeneous, 'Incorrect RemoteCommandResult.homogeneous') - self.assertEqual( - passed_hosts, result.passed_hosts, 'Incorrect RemoteCommandResult.passed_hosts') - self.assertEqual( - failed_hosts, result.failed_hosts, 'Incorrect RemoteCommandResult.failed_hosts') - self.assertEqual(all_stdout, result.all_stdout, 'Incorrect RemoteCommandResult.all_stdout') - self.assertEqual(all_stderr, result.all_stderr, 'Incorrect RemoteCommandResult.all_stderr') + getattr(expect, key), getattr(actual, key), f'Incorrect ResultData.{key}') + self.assertEqual(timeout, result.timeout, 'Incorrect CommandResult.timeout') + self.assertEqual(homogeneous, result.homogeneous, 'Incorrect CommandResult.homogeneous') + self.assertEqual(passed_hosts, result.passed_hosts, 'Incorrect CommandResult.passed_hosts') + self.assertEqual(failed_hosts, result.failed_hosts, 'Incorrect CommandResult.failed_hosts') + self.assertEqual(all_stdout, result.all_stdout, 'Incorrect CommandResult.all_stdout') + self.assertEqual(all_stderr, result.all_stderr, 'Incorrect CommandResult.all_stderr') + self.assertEqual(join_stdout, result.joined_stdout, 'Incorrect CommandResult.joined_stdout') + self.assertEqual(join_stderr, result.joined_stderr, 'Incorrect CommandResult.joined_stderr') def test_harness_unit_list_unique(self): """Verify list_unique(). @@ -233,6 +234,136 @@ def test_harness_unit_dict_subtract(self): }) self.log_step('Unit Test Passed') + def test_harness_unit_run_local(self): + """Verify run_local(). + + :avocado: tags=all + :avocado: tags=vm + :avocado: tags=harness,run_utils + :avocado: tags=HarnessUnitTest,test_harness_unit_run_local + """ + host = get_local_host() + command = 'uname -o' + self.log_step('Verify run_local()') + self._verify_command_result( + result=run_local(self.log, command), + passed=True, + expected=[ResultData(command, 0, host, ['GNU/Linux'], [], False)], + timeout=False, + homogeneous=True, + passed_hosts=host, + failed_hosts=NodeSet(), + all_stdout={str(host): 'GNU/Linux'}, + all_stderr={str(host): ''}, + join_stdout='GNU/Linux', + join_stderr='', + ) + self.log_step('Unit Test Passed') + + def test_harness_unit_run_local_separated(self): + """Verify run_local() with separate stdout and stderr. + + :avocado: tags=all + :avocado: tags=vm + :avocado: tags=harness,run_utils + :avocado: tags=HarnessUnitTest,test_harness_unit_run_local_separated + """ + host = get_local_host() + command = 'echo stdout; echo stderr 1>&2' + self.log_step('Verify run_local() w/ no stdout') + self._verify_command_result( + result=run_local(self.log, command, stderr=True), + passed=True, + expected=[ResultData(command, 0, host, ['stdout'], ['stderr'], False)], + timeout=False, + homogeneous=True, + passed_hosts=host, + failed_hosts=NodeSet(), + all_stdout={str(host): 'stdout'}, + all_stderr={str(host): 'stderr'}, + join_stdout='stdout', + join_stderr='stderr', + ) + self.log_step('Unit Test Passed') + + def test_harness_unit_run_local_no_stdout(self): + """Verify run_local() with no stdout. + + :avocado: tags=all + :avocado: tags=vm + :avocado: tags=harness,run_utils + :avocado: tags=HarnessUnitTest,test_harness_unit_run_local_no_stdout + """ + host = get_local_host() + command = 'echo stderr 1>&2' + self.log_step('Verify run_local() w/ no stdout') + self._verify_command_result( + result=run_local(self.log, command, stderr=True), + passed=True, + expected=[ResultData(command, 0, host, [], ['stderr'], False)], + timeout=False, + homogeneous=True, + passed_hosts=host, + failed_hosts=NodeSet(), + all_stdout={str(host): ''}, + all_stderr={str(host): 'stderr'}, + join_stdout='', + join_stderr='stderr', + ) + self.log_step('Unit Test Passed') + + def test_harness_unit_run_local_failure(self): + """Verify run_local() with a failure. + + :avocado: tags=all + :avocado: tags=vm + :avocado: tags=harness,run_utils + :avocado: tags=HarnessUnitTest,test_harness_unit_run_local_failure + """ + host = get_local_host() + command = 'echo fail; exit 1' + self.log_step('Verify run_local() w/ a failure') + self._verify_command_result( + result=run_local(self.log, command), + passed=False, + expected=[ResultData(command, 1, host, ['fail'], [], False)], + timeout=False, + homogeneous=True, + passed_hosts=NodeSet(), + failed_hosts=host, + all_stdout={str(host): 'fail'}, + all_stderr={str(host): ''}, + join_stdout='fail', + join_stderr='', + ) + self.log_step('Unit Test Passed') + + def test_harness_unit_run_local_timeout(self): + """Verify run_local() with a timeout. + + :avocado: tags=all + :avocado: tags=vm + :avocado: tags=harness,run_utils + :avocado: tags=HarnessUnitTest,test_harness_unit_run_local_timeout + """ + host = get_local_host() + command = 'echo wait; sleep 5' + self.log_step('Verify run_local() w/ a timeout') + self._verify_command_result( + result=run_local(self.log, command, True, 2), + passed=False, + expected=[ResultData(command, 124, host, ['wait'], [], True)], + timeout=True, + homogeneous=True, + passed_hosts=NodeSet(), + failed_hosts=host, + all_stdout={str(host): 'wait'}, + all_stderr={str(host): ''}, + join_stdout='wait', + join_stderr='', + ) + self.log_step('Unit Test Passed') + def test_harness_unit_run_remote_single(self): """Verify run_remote() with a single host. @@ -244,7 +375,7 @@ def test_harness_unit_run_remote_single(self): hosts = self.get_hosts_from_yaml('test_clients', 'partition', 'reservation', '/run/hosts/*') command = 'uname -o' self.log_step('Verify run_remote() w/ single host') - self._verify_remote_command_result( + self._verify_command_result( result=run_remote(self.log, NodeSet(hosts[0]), command), passed=True, expected=[ResultData(command, 0, NodeSet(hosts[0]), ['GNU/Linux'], [], False)], @@ -253,7 +384,9 @@ def test_harness_unit_run_remote_single(self): passed_hosts=NodeSet(hosts[0]), failed_hosts=NodeSet(), all_stdout={hosts[0]: 'GNU/Linux'}, - all_stderr={hosts[0]: ''} + all_stderr={hosts[0]: ''}, + join_stdout='GNU/Linux', + join_stderr='', ) self.log_step('Unit Test Passed') @@ -268,7 +401,7 @@ def test_harness_unit_run_remote_homogeneous(self): hosts = self.get_hosts_from_yaml('test_clients', 'partition', 'reservation', '/run/hosts/*') command = 'uname -o' self.log_step('Verify run_remote() w/ homogeneous output') - self._verify_remote_command_result( + self._verify_command_result( result=run_remote(self.log, hosts, command), passed=True, expected=[ResultData(command, 0, hosts, ['GNU/Linux'], [], False)], @@ -277,7 +410,9 @@ def test_harness_unit_run_remote_homogeneous(self): passed_hosts=hosts, failed_hosts=NodeSet(), all_stdout={str(hosts): 'GNU/Linux'}, - all_stderr={str(hosts): ''} + all_stderr={str(hosts): ''}, + join_stdout='GNU/Linux', + join_stderr='', ) self.log_step('Unit Test Passed') @@ -292,7 +427,7 @@ def test_harness_unit_run_remote_heterogeneous(self): hosts = self.get_hosts_from_yaml('test_clients', 'partition', 'reservation', '/run/hosts/*') command = 'hostname -s' self.log_step('Verify run_remote() w/ heterogeneous output') - self._verify_remote_command_result( + self._verify_command_result( result=run_remote(self.log, hosts, command), passed=True, expected=[ @@ -311,6 +446,8 @@ def test_harness_unit_run_remote_heterogeneous(self): hosts[0]: '', hosts[1]: '' }, + join_stdout='\n'.join(hosts), + join_stderr='', ) self.log_step('Unit Test Passed') @@ -323,10 +460,9 @@ def test_harness_unit_run_remote_combined(self): :avocado: tags=HarnessUnitTest,test_harness_unit_run_remote_combined """ hosts = self.get_hosts_from_yaml('test_clients', 'partition', 'reservation', '/run/hosts/*') - command = 'echo stdout; if [ $(hostname -s) == \'{}\' ]; then echo stderr 1>&2; fi'.format( - hosts[1]) + command = f'echo stdout; if [ $(hostname -s) == \'{hosts[1]}\' ]; then echo stderr 1>&2; fi' self.log_step('Verify run_remote() w/ separated stdout and stderr') - self._verify_remote_command_result( + self._verify_command_result( result=run_remote(self.log, hosts, command, stderr=False), passed=True, expected=[ @@ -344,7 +480,9 @@ def test_harness_unit_run_remote_combined(self): all_stderr={ hosts[0]: '', hosts[1]: '' - } + }, + join_stdout='stdout\nstdout\nstderr', + join_stderr='', ) self.log_step('Unit Test Passed') @@ -357,10 +495,9 @@ def test_harness_unit_run_remote_separated(self): :avocado: tags=HarnessUnitTest,test_harness_unit_run_remote_separated """ hosts = self.get_hosts_from_yaml('test_clients', 'partition', 'reservation', '/run/hosts/*') - command = 'echo stdout; if [ $(hostname -s) == \'{}\' ]; then echo stderr 1>&2; fi'.format( - hosts[1]) + command = f'echo stdout; if [ $(hostname -s) == \'{hosts[1]}\' ]; then echo stderr 1>&2; fi' self.log_step('Verify run_remote() w/ separated stdout and stderr') - self._verify_remote_command_result( + self._verify_command_result( result=run_remote(self.log, hosts, command, stderr=True), passed=True, expected=[ @@ -378,12 +515,14 @@ def test_harness_unit_run_remote_separated(self): all_stderr={ hosts[0]: '', hosts[1]: 'stderr' - } + }, + join_stdout='stdout\nstdout', + join_stderr='stderr', ) self.log_step('Unit Test Passed') def test_harness_unit_run_remote_no_stdout(self): - """Verify run_remote() with separated stdout and stderr. + """Verify run_remote() with no stdout. :avocado: tags=all :avocado: tags=vm @@ -391,9 +530,9 @@ def test_harness_unit_run_remote_no_stdout(self): :avocado: tags=HarnessUnitTest,test_harness_unit_run_remote_no_stdout """ hosts = self.get_hosts_from_yaml('test_clients', 'partition', 'reservation', '/run/hosts/*') - command = 'if [ $(hostname -s) == \'{}\' ]; then echo stderr 1>&2; fi'.format(hosts[1]) + command = f'if [ $(hostname -s) == \'{hosts[1]}\' ]; then echo stderr 1>&2; fi' self.log_step('Verify run_remote() w/ no stdout') - self._verify_remote_command_result( + self._verify_command_result( result=run_remote(self.log, hosts, command, stderr=True), passed=True, expected=[ @@ -411,12 +550,14 @@ def test_harness_unit_run_remote_no_stdout(self): all_stderr={ hosts[0]: '', hosts[1]: 'stderr' - } + }, + join_stdout='', + join_stderr='stderr', ) self.log_step('Unit Test Passed') def test_harness_unit_run_remote_failure(self): - """Verify run_remote() with separated stdout and stderr. + """Verify run_remote() with a failure. :avocado: tags=all :avocado: tags=vm @@ -424,10 +565,9 @@ def test_harness_unit_run_remote_failure(self): :avocado: tags=HarnessUnitTest,test_harness_unit_run_remote_failure """ hosts = self.get_hosts_from_yaml('test_clients', 'partition', 'reservation', '/run/hosts/*') - command = 'if [ $(hostname -s) == \'{}\' ]; then echo fail; exit 1; fi; echo pass'.format( - hosts[1]) + command = f'if [ $(hostname -s) == \'{hosts[1]}\' ]; then echo fail; exit 1; fi; echo pass' self.log_step('Verify run_remote() w/ a failure') - self._verify_remote_command_result( + self._verify_command_result( result=run_remote(self.log, hosts, command, stderr=True), passed=False, expected=[ @@ -445,6 +585,43 @@ def test_harness_unit_run_remote_failure(self): all_stderr={ hosts[0]: '', hosts[1]: '' - } + }, + join_stdout='pass\nfail', + join_stderr='', + ) + self.log_step('Unit Test Passed') + + def test_harness_unit_run_remote_timeout(self): + """Verify run_remote() with a timeout. + + :avocado: tags=all + :avocado: tags=vm + :avocado: tags=harness,run_utils + :avocado: tags=HarnessUnitTest,test_harness_unit_run_remote_timeout + """ + hosts = self.get_hosts_from_yaml('test_clients', 'partition', 'reservation', '/run/hosts/*') + command = f'if [ $(hostname -s) == \'{hosts[1]}\' ]; then echo wait; sleep 5; fi; echo pass' + self.log_step('Verify run_remote() w/ a timeout') + self._verify_command_result( + result=run_remote(self.log, hosts, command, stderr=True, timeout=2), + passed=False, + expected=[ + ResultData(command, 0, NodeSet(hosts[0]), ['pass'], [], False), + ResultData(command, 124, NodeSet(hosts[1]), ['wait'], [], True), + ], + timeout=True, + homogeneous=False, + passed_hosts=NodeSet(hosts[0]), + failed_hosts=NodeSet(hosts[1]), + all_stdout={ + hosts[0]: 'pass', + hosts[1]: 'wait' + }, + all_stderr={ + hosts[0]: '', + hosts[1]: '' + }, + join_stdout='pass\nwait', + join_stderr='', ) self.log_step('Unit Test Passed') diff --git a/src/tests/ftest/process_core_files.py b/src/tests/ftest/process_core_files.py index 9349cac6172..47fbf7a4ef4 100644 --- a/src/tests/ftest/process_core_files.py +++ b/src/tests/ftest/process_core_files.py @@ -1,5 +1,5 @@ """ - (C) Copyright 2022-2023 Intel Corporation. + (C) Copyright 2022-2024 Intel Corporation. SPDX-License-Identifier: BSD-2-Clause-Patent """ @@ -141,7 +141,8 @@ def process_core_files(self, directory, delete, test=None): if os.path.splitext(core_name)[-1] == ".bz2": # Decompress the file command = f"lbzip2 -d -v '{os.path.join(core_dir, core_name)}'" - run_local(self.log, command) + if not run_local(self.log, command).passed: + raise CoreFileException(f"Error decompressing {core_name}") core_name = os.path.splitext(core_name)[0] exe_name = self._get_exe_name(os.path.join(core_dir, core_name)) self._create_stacktrace(core_dir, core_name, exe_name) @@ -187,22 +188,23 @@ def _create_stacktrace(self, core_dir, core_name, exe_name): stack_trace_file = os.path.join(core_dir, f"'{core_name}.stacktrace'") self.log.debug("Generating a stacktrace from the %s core file from %s", core_full, host) - run_local(self.log, f"ls -l '{core_full}'") + if not run_local(self.log, f"ls -l '{core_full}'").passed: + raise RunException(f"Error listing {core_full}") command = ( f"gdb -cd='{core_dir}' -ex 'set pagination off' -ex 'thread apply all bt full' -ex " f"detach -ex quit '{exe_name}' '{core_name}'") + result = run_local(self.log, command, verbose=False) + if not result.passed: + raise RunException(f"Error creating {stack_trace_file}") + try: - output = run_local(self.log, command, check=False, verbose=False) with open(stack_trace_file, "w", encoding="utf-8") as stack_trace: - stack_trace.writelines(output.stdout) + stack_trace.write(result.joined_stdout) except IOError as error: raise RunException(f"Error writing {stack_trace_file}") from error - except RunException as error: - raise RunException(f"Error creating {stack_trace_file}") from error - def _get_exe_name(self, core_file): """Get the executable name from the core file. @@ -219,7 +221,7 @@ def _get_exe_name(self, core_file): self.log.debug("Extracting the executable name from '%s'", core_file) command = f"gdb -c '{core_file}' -ex 'info proc exe' -ex quit" result = run_local(self.log, command, verbose=False) - last_line = result.stdout.splitlines()[-1] + last_line = result.joined_stdout.splitlines()[-1] self.log.debug(" last line: %s", last_line) cmd = last_line[7:] self.log.debug(" last_line[7:-1]: %s", cmd) @@ -277,7 +279,7 @@ def install_debuginfo_packages(self): cmds.append(["sudo", "rm", "-f", path]) if self.USE_DEBUGINFO_INSTALL: - dnf_args = ["--exclude", "ompi-debuginfo"] + dnf_args = ["--nobest", "--exclude", "ompi-debuginfo"] if os.getenv("TEST_RPMS", 'false') == 'true': if "suse" in self.distro_info.name.lower(): dnf_args.extend(["libpmemobj1", "python3", "openmpi3"]) @@ -291,9 +293,8 @@ def install_debuginfo_packages(self): else: raise RunException(f"Unsupported distro: {self.distro_info}") cmds.append(["sudo", "dnf", "-y", "install"] + dnf_args) - output = run_local( - self.log, " ".join(["rpm", "-q", "--qf", "'%{evr}'", "daos"]), check=False) - rpm_version = output.stdout + result = run_local(self.log, " ".join(["rpm", "-q", "--qf", "'%{evr}'", "daos"])) + rpm_version = result.joined_stdout cmds.append( ["sudo", "dnf", "debuginfo-install", "-y"] + dnf_args + ["daos-" + rpm_version, "daos-*-" + rpm_version]) @@ -324,9 +325,7 @@ def install_debuginfo_packages(self): retry = False for cmd in cmds: - try: - run_local(self.log, " ".join(cmd), check=True) - except RunException: + if not run_local(self.log, " ".join(cmd)).passed: # got an error, so abort this list of commands and re-run # it with a dnf clean, makecache first retry = True @@ -339,9 +338,7 @@ def install_debuginfo_packages(self): cmds.insert(0, cmd_prefix + ["clean", "all"]) cmds.insert(1, cmd_prefix + ["makecache"]) for cmd in cmds: - try: - run_local(self.log, " ".join(cmd)) - except RunException: + if not run_local(self.log, " ".join(cmd)).passed: break def is_el(self): @@ -380,14 +377,11 @@ def resolve_debuginfo(self, pkg): """ package_info = None - try: - # Eventually use python libraries for this rather than exec()ing out to rpm - output = run_local( - self.log, - " ".join( - ["rpm", "-q", "--qf", "'%{name} %{version} %{release} %{epoch}'", pkg]), - check=False) - name, version, release, epoch = output.stdout.split() + # Eventually use python libraries for this rather than exec()ing out to rpm + command = f"rpm -q --qf '%{{name}} %{{version}} %{{release}} %{{epoch}}' {pkg}" + result = run_local(self.log, command) + if result.passed: + name, version, release, epoch = result.joined_stdout.split() debuginfo_map = {"glibc": "glibc-debuginfo-common"} try: @@ -400,7 +394,7 @@ def resolve_debuginfo(self, pkg): "release": release, "epoch": epoch } - except ValueError: + else: self.log.debug("Package %s not installed, skipping debuginfo", pkg) return package_info @@ -413,20 +407,16 @@ def delete_gdb_core_files(self): """ self.log.debug("Checking core files generated by core file processing") - try: - results = run_local(self.log, "cat /proc/sys/kernel/core_pattern", check=True) - except RunException: + result = run_local(self.log, "cat /proc/sys/kernel/core_pattern") + if not result.passed: self.log.error("Unable to find local core file pattern") self.log.debug("Stacktrace", exc_info=True) return 1 - core_path = os.path.split(results.stdout.splitlines()[-1])[0] + core_path = os.path.split(result.joined_stdout.splitlines()[-1])[0] self.log.debug("Deleting core.gdb.*.* core files located in %s", core_path) other = ["-printf '%M %n %-12u %-12g %12k %t %p\n' -delete"] - try: - run_local( - self.log, find_command(core_path, "core.gdb.*.*", 1, other), check=True) - except RunException: + if not run_local(self.log, find_command(core_path, "core.gdb.*.*", 1, other)).passed: self.log.debug("core.gdb.*.* files could not be removed") return 1 return 0 diff --git a/src/tests/ftest/server/multiengine_persocket.py b/src/tests/ftest/server/multiengine_persocket.py index ff91186d618..8c92fdfbdad 100644 --- a/src/tests/ftest/server/multiengine_persocket.py +++ b/src/tests/ftest/server/multiengine_persocket.py @@ -1,5 +1,5 @@ """ - (C) Copyright 2020-2023 Intel Corporation. + (C) Copyright 2020-2024 Intel Corporation. SPDX-License-Identifier: BSD-2-Clause-Patent """ @@ -181,7 +181,8 @@ def check_pmem(self, hosts, count): def storage_format(self): """Perform storage format.""" - run_local(self.log, "dmg storage format") + if not run_local(self.log, "dmg storage format").passed: + self.fail("dmg storage format failed") def cleanup(self): """Servers clean up after test complete.""" diff --git a/src/tests/ftest/slurm_setup.py b/src/tests/ftest/slurm_setup.py index d13ab537eb7..0c3d300d5ff 100755 --- a/src/tests/ftest/slurm_setup.py +++ b/src/tests/ftest/slurm_setup.py @@ -207,7 +207,7 @@ def _create_epilog_script(self, script): """ self.log.debug('Creating the slurm epilog script to run after each job.') try: - with open(script, 'w') as script_file: + with open(script, 'w', encoding='utf-8') as script_file: script_file.write('#!/bin/bash\n#\n') script_file.write('/usr/bin/bash -c \'pkill --signal 9 dfuse\'\n') script_file.write( @@ -364,7 +364,7 @@ def _append_config_file(self, echo_command): echo_command (str): command adding contents to the config file Returns: - RemoteCommandResult: the result from the echo | tee command + CommandResult: the result from the echo | tee command """ tee_command = command_as_user(f'tee -a {self.SLURM_CONF}', self.root) return run_remote(self.log, self.all_nodes, f'{echo_command} | {tee_command}') diff --git a/src/tests/ftest/util/agent_utils.py b/src/tests/ftest/util/agent_utils.py index 76c293f0e4f..b68b141676d 100644 --- a/src/tests/ftest/util/agent_utils.py +++ b/src/tests/ftest/util/agent_utils.py @@ -1,5 +1,5 @@ """ - (C) Copyright 2019-2023 Intel Corporation. + (C) Copyright 2019-2024 Intel Corporation. SPDX-License-Identifier: BSD-2-Clause-Patent """ @@ -294,9 +294,7 @@ def support_collect_log(self, **kwargs): CommandFailure: if the daos_agent command fails. Returns: - RemoteCommandResult: a grouping of the command results from - the same hosts with the same return status - + CommandResult: groups of command results from the same hosts with the same return status """ cmd = DaosAgentCommand(self.manager.job.command_path) cmd.sudo = True diff --git a/src/tests/ftest/util/collection_utils.py b/src/tests/ftest/util/collection_utils.py index 15963e03446..3b054eb19e9 100644 --- a/src/tests/ftest/util/collection_utils.py +++ b/src/tests/ftest/util/collection_utils.py @@ -16,7 +16,7 @@ # pylint: disable=import-error,no-name-in-module from util.environment_utils import TestEnvironment from util.host_utils import get_local_host -from util.run_utils import RunException, find_command, run_local, run_remote, stop_processes +from util.run_utils import find_command, run_local, run_remote, stop_processes from util.user_utils import get_chown_command from util.yaml_utils import get_test_category @@ -562,20 +562,17 @@ def move_files(logger, hosts, source, pattern, destination, depth, timeout, test # Clush -rcopy the temporary remote directory to this host command = ["clush", "-w", str(hosts), "-pv", "--rcopy", f"'{tmp_copy_dir}'", "--dest", f"'{rcopy_dest}'"] - try: - run_local(logger, " ".join(command), check=True, timeout=timeout) - except RunException: + if not run_local(logger, " ".join(command), timeout=timeout).passed: message = f"Error copying remote files to {destination}" test_result.fail_test(logger, "Process", message, sys.exc_info()) return_code = 16 - finally: - # Remove the temporary remote directory on each host - command = f"{sudo_command}rm -fr '{tmp_copy_dir}'" - if not run_remote(logger, hosts, command).passed: - message = f"Error removing temporary remote copy directory '{tmp_copy_dir}'" - test_result.fail_test(logger, "Process", message) - return_code = 16 + # Remove the temporary remote directory on each host + command = f"{sudo_command}rm -fr '{tmp_copy_dir}'" + if not run_remote(logger, hosts, command).passed: + message = f"Error removing temporary remote copy directory '{tmp_copy_dir}'" + test_result.fail_test(logger, "Process", message) + return_code = 16 return return_code @@ -648,14 +645,13 @@ def create_steps_log(logger, job_results_dir, test_result): job_log = os.path.join(test_logs_dir, 'job.log') step_log = os.path.join(test_logs_dir, 'steps.log') command = rf"grep -E '(INFO |ERROR)\| (==> Step|START|PASS|FAIL|ERROR)' {job_log}" - try: - result = run_local(logger, command) - with open(step_log, 'w', encoding="utf-8") as file: - file.write(result.stdout) - except Exception: # pylint: disable=broad-except + result = run_local(logger, command) + if not result.passed: message = f"Error creating {step_log}" test_result.fail_test(logger, "Process", message, sys.exc_info()) return 8192 + with open(step_log, 'w', encoding="utf-8") as file: + file.write(result.joined_stdout) return 0 @@ -713,9 +709,7 @@ def rename_avocado_test_dir(logger, test, job_results_dir, test_result, jenkins_ return 1024 # Remove latest symlink directory to avoid inclusion in the Jenkins build artifacts - try: - run_local(logger, f"rm -fr '{test_logs_lnk}'") - except RunException: + if not run_local(logger, f"rm -fr '{test_logs_lnk}'").passed: message = f"Error removing {test_logs_lnk}" test_result.fail_test(logger, "Process", message, sys.exc_info()) return 1024 diff --git a/src/tests/ftest/util/dfuse_utils.py b/src/tests/ftest/util/dfuse_utils.py index 46109fdb8b0..718fe392a03 100644 --- a/src/tests/ftest/util/dfuse_utils.py +++ b/src/tests/ftest/util/dfuse_utils.py @@ -92,8 +92,7 @@ def _run_as_owner(self, hosts, command, timeout=120): Defaults to 120 seconds. Returns: - RemoteCommandResult: result of the command - + CommandResult: result of the command """ return run_remote( self.log, hosts, command_as_user(command, self.run_user), timeout=timeout) @@ -233,7 +232,7 @@ def run(self, check=True, mount_callback=None): Args: check (bool): Check if dfuse mounted properly after mount is executed. - mount_callback (method, optional): method to pass RemoteCommandResult to + mount_callback (method, optional): method to pass CommandResult to after mount. Default simply raises an exception on failure. Raises: @@ -517,8 +516,7 @@ def run(self): CommandFailure: If the command fails Returns: - RemoteCommandResult: result from run_remote - + CommandResult: result from run_remote """ self.log.info('Running verify_perms.py on %s', str(self.hosts)) result = run_remote(self.log, self.hosts, self.with_exports, timeout=self.timeout) @@ -568,9 +566,7 @@ def _run_process(self, raise_exception=None): CommandFailure: if there is an error running the command Returns: - RemoteCommandResult: a grouping of the command results from the same host with the - same return status - + CommandResult: groups of command results from the same hosts with the same return status """ if raise_exception is None: raise_exception = self.exit_status_exception diff --git a/src/tests/ftest/util/fio_utils.py b/src/tests/ftest/util/fio_utils.py index 983f2255841..2c34eda01af 100644 --- a/src/tests/ftest/util/fio_utils.py +++ b/src/tests/ftest/util/fio_utils.py @@ -173,9 +173,7 @@ def _run_process(self, raise_exception=None): CommandFailure: if there is an error running the command Returns: - RemoteCommandResult: a grouping of the command results from the same hosts with the - same return status - + CommandResult: groups of command results from the same hosts with the same return status """ if not self._hosts: raise CommandFailure('No hosts specified for fio command') diff --git a/src/tests/ftest/util/general_utils.py b/src/tests/ftest/util/general_utils.py index 2ba711bde4a..e745c69160d 100644 --- a/src/tests/ftest/util/general_utils.py +++ b/src/tests/ftest/util/general_utils.py @@ -1,5 +1,5 @@ """ - (C) Copyright 2018-2023 Intel Corporation. + (C) Copyright 2018-2024 Intel Corporation. SPDX-License-Identifier: BSD-2-Clause-Patent """ @@ -23,7 +23,7 @@ from avocado.utils import process from ClusterShell.NodeSet import NodeSet from ClusterShell.Task import task_self -from run_utils import RunException, get_clush_command, run_local, run_remote +from run_utils import get_clush_command, run_local, run_remote from user_utils import get_chown_command, get_primary_group @@ -1356,11 +1356,8 @@ def check_ping(log, host, expected_ping=True, cmd_timeout=60, verbose=True): Returns: bool: True if the expected number of pings were returned; False otherwise. """ - log.debug("Checking for %s to be %sresponsive", host, "" if expected_ping else "un") - try: - run_local( - log, "ping -c 1 {}".format(host), check=True, timeout=cmd_timeout, verbose=verbose) - except RunException: + log.debug("Checking for %s to be %s", host, "responsive" if expected_ping else "unresponsive") + if not run_local(log, f"ping -c 1 {host}", timeout=cmd_timeout, verbose=verbose).passed: return not expected_ping return expected_ping diff --git a/src/tests/ftest/util/io_utilities.py b/src/tests/ftest/util/io_utilities.py index 56a727cfa6f..af7cd37e8ca 100644 --- a/src/tests/ftest/util/io_utilities.py +++ b/src/tests/ftest/util/io_utilities.py @@ -367,7 +367,7 @@ def run(self): """Run the command. Returns: - RemoteCommandResult: result from run_remote + CommandResult: groups of command results from the same hosts with the same return status """ self.log.info('Running directory_tree.py on %s', str(self.hosts)) return run_remote(self.log, self.hosts, self.with_exports, timeout=self.timeout) diff --git a/src/tests/ftest/util/launch_utils.py b/src/tests/ftest/util/launch_utils.py index f1c359d7f4f..2ed42ce7a9b 100644 --- a/src/tests/ftest/util/launch_utils.py +++ b/src/tests/ftest/util/launch_utils.py @@ -44,13 +44,11 @@ def fault_injection_enabled(logger): """ logger.debug("-" * 80) logger.debug("Checking for fault injection enablement via 'fault_status':") - try: - run_local(logger, "fault_status", check=True) + if run_local(logger, "fault_status").passed: logger.debug(" Fault injection is enabled") return True - except RunException: - # Command failed or yielded a non-zero return status - logger.debug(" Fault injection is disabled") + # Command failed or yielded a non-zero return status + logger.debug(" Fault injection is disabled") return False @@ -88,10 +86,7 @@ def display_disk_space(logger, path): """ logger.debug("-" * 80) logger.debug("Current disk space usage of %s", path) - try: - run_local(logger, f"df -h {path}", check=False) - except RunException: - pass + run_local(logger, f"df -h {path}") def summarize_run(logger, mode, status): @@ -153,9 +148,11 @@ def get_test_tag_info(logger, directory): test_tag_info = {} for test_file in sorted(list(map(str, Path(directory).rglob("*.py")))): command = f"grep -ER '(^class .*:|:avocado: tags=| def test_)' {test_file}" - output = run_local(logger, command, check=False, verbose=False) + result = run_local(logger, command, verbose=False) + if not result.passed: + continue data = re.findall( - r'(?:class (.*)\(.*\):|def (test_.*)\(|:avocado: tags=(.*))', output.stdout) + r'(?:class (.*)\(.*\):|def (test_.*)\(|:avocado: tags=(.*))', result.joined_stdout) class_key = None method_key = None for match in data: @@ -392,28 +389,27 @@ def execute(self, logger, test, repeat, number, sparse, fail_fast): "[Test %s/%s] Running the %s test on repetition %s/%s", number, self.total_tests, test, repeat, self.total_repeats) start_time = int(time.time()) - - try: - return_code = run_local( - logger, " ".join(command), capture_output=False, check=False).returncode - if return_code == 0: - logger.debug("All avocado test variants passed") - elif return_code & 2 == 2: - logger.debug("At least one avocado test variant failed") - elif return_code & 4 == 4: - message = "Failed avocado commands detected" - self.test_result.fail_test(logger, "Execute", message) - elif return_code & 8 == 8: - logger.debug("At least one avocado test variant was interrupted") - if return_code: - self._collect_crash_files(logger) - - except RunException: - message = f"Error executing {test} on repeat {repeat}" + result = run_local(logger, " ".join(command), capture_output=False) + end_time = int(time.time()) + return_code = result.output[0].returncode + if return_code == 0: + logger.debug("All avocado test variants passed") + elif return_code & 1 == 1: + logger.debug("At least one avocado test variant failed") + elif return_code & 2 == 2: + logger.debug("At least one avocado job failed") + elif return_code & 4 == 4: + message = "Failed avocado commands detected" + self.test_result.fail_test(logger, "Execute", message) + elif return_code & 8 == 8: + logger.debug("At least one avocado test variant was interrupted") + else: + message = f"Unhandled rc={return_code} while executing {test} on repeat {repeat}" self.test_result.fail_test(logger, "Execute", message, sys.exc_info()) return_code = 1 + if return_code: + self._collect_crash_files(logger) - end_time = int(time.time()) logger.info("Total test time: %ss", end_time - start_time) return return_code @@ -801,10 +797,11 @@ def _generate_certs(self, logger): certgen_dir = os.path.abspath( os.path.join("..", "..", "..", "..", "lib64", "daos", "certgen")) command = os.path.join(certgen_dir, "gen_certificates.sh") - try: - run_local(logger, f"/usr/bin/rm -rf {certs_dir}") - run_local(logger, f"{command} {test_env.log_dir}") - except RunException: + if not run_local(logger, f"/usr/bin/rm -rf {certs_dir}").passed: + message = "Error removing old certificates" + self.test_result.fail_test(logger, "Prepare", message, sys.exc_info()) + return 128 + if not run_local(logger, f"{command} {test_env.log_dir}").passed: message = "Error generating certificates" self.test_result.fail_test(logger, "Prepare", message, sys.exc_info()) return 128 @@ -826,12 +823,13 @@ def _collect_crash_files(self, logger): if crash_files: latest_crash_dir = os.path.join(avocado_logs_dir, "latest", "crashes") - try: - run_local(logger, f"mkdir -p {latest_crash_dir}", check=True) + if run_local(logger, f"mkdir -p {latest_crash_dir}").passed: for crash_file in crash_files: - run_local(logger, f"mv {crash_file} {latest_crash_dir}", check=True) - except RunException: - message = "Error collecting crash files" + if not run_local(logger, f"mv {crash_file} {latest_crash_dir}").passed: + message = "Error collecting crash files: mv" + self.test_result.fail_test(logger, "Execute", message, sys.exc_info()) + else: + message = "Error collecting crash files: mkdir" self.test_result.fail_test(logger, "Execute", message, sys.exc_info()) else: logger.debug("No avocado crash files found in %s", crash_dir) @@ -927,8 +925,10 @@ def list_tests(self, logger, verbose): # Find all the test files that contain tests matching the tags logger.debug("-" * 80) logger.info("Detecting tests matching tags: %s", " ".join(command)) - output = run_local(logger, " ".join(command), check=True) - unique_test_files = set(re.findall(self._avocado.get_list_regex(), output.stdout)) + result = run_local(logger, " ".join(command)) + if not result.passed: + raise RunException("Error running avocado list") + unique_test_files = set(re.findall(self._avocado.get_list_regex(), result.joined_stdout)) for index, test_file in enumerate(unique_test_files): self.tests.append(TestInfo(test_file, index + 1, self._yaml_extension)) logger.info(" %s", self.tests[-1]) @@ -1015,7 +1015,8 @@ def update_test_yaml(self, logger, scm_size, scm_mount, extra_yaml, multiplier, if new_yaml_file: if verbose > 0: # Optionally display a diff of the yaml file - run_local(logger, f"diff -y {test.yaml_file} {new_yaml_file}", check=False) + if not run_local(logger, f"diff -y {test.yaml_file} {new_yaml_file}").passed: + raise RunException(f"Error diff'ing {test.yaml_file}") test.yaml_file = new_yaml_file # Display the modified yaml file variants with debug @@ -1023,7 +1024,8 @@ def update_test_yaml(self, logger, scm_size, scm_mount, extra_yaml, multiplier, if test.extra_yaml: command.extend(test.extra_yaml) command.extend(["--summary", "3"]) - run_local(logger, " ".join(command)) + if not run_local(logger, " ".join(command)).passed: + raise RunException(f"Error listing test variants for {test.yaml_file}") # Collect the host information from the updated test yaml test.set_yaml_info(logger, include_localhost) @@ -1144,13 +1146,11 @@ def _setup_application_directory(self, logger, result): logger.debug(" Copying applications from the '%s' directory", self._test_env.app_src) run_local(logger, f"ls -al '{self._test_env.app_src}'") for app in os.listdir(self._test_env.app_src): - try: - run_local( - logger, - f"cp -r '{os.path.join(self._test_env.app_src, app)}' " - f"'{self._test_env.app_dir}'", - check=True) - except RunException: + result = run_local( + logger, + f"cp -r '{os.path.join(self._test_env.app_src, app)}' " + f"'{self._test_env.app_dir}'") + if not result.passed: message = 'Error copying files to the application directory' result.tests[-1].fail_test(logger, 'Run', message, sys.exc_info()) return 128 diff --git a/src/tests/ftest/util/package_utils.py b/src/tests/ftest/util/package_utils.py index 22eafd2dc93..6823c2d4202 100644 --- a/src/tests/ftest/util/package_utils.py +++ b/src/tests/ftest/util/package_utils.py @@ -1,5 +1,5 @@ """ -(C) Copyright 2023 Intel Corporation. +(C) Copyright 2023-2024 Intel Corporation. SPDX-License-Identifier: BSD-2-Clause-Patent """ @@ -40,7 +40,7 @@ def install_packages(log, hosts, packages, user=None, timeout=600): timeout (int, optional): timeout for the dnf install command. Defaults to 600. Returns: - RemoteCommandResult: the 'dnf install' command results + CommandResult: the 'dnf install' command results """ log.info('Installing packages on %s: %s', hosts, ', '.join(packages)) command = command_as_user(' '.join(['dnf', 'install', '-y'] + packages), user) @@ -58,7 +58,7 @@ def remove_packages(log, hosts, packages, user=None, timeout=600): timeout (int, optional): timeout for the dnf remove command. Defaults to 600. Returns: - RemoteCommandResult: the 'dnf remove' command results + CommandResult: the 'dnf remove' command results """ log.info('Removing packages on %s: %s', hosts, ', '.join(packages)) command = command_as_user(' '.join(['dnf', 'remove', '-y'] + packages), user) diff --git a/src/tests/ftest/util/run_utils.py b/src/tests/ftest/util/run_utils.py index 23ac53c76d7..2f9d33b07c5 100644 --- a/src/tests/ftest/util/run_utils.py +++ b/src/tests/ftest/util/run_utils.py @@ -4,9 +4,9 @@ SPDX-License-Identifier: BSD-2-Clause-Patent """ import os -import shlex import subprocess # nosec import time +from getpass import getuser from socket import gethostname from ClusterShell.NodeSet import NodeSet @@ -68,23 +68,157 @@ def passed(self): Returns: bool: if the command was successful - """ return self.returncode == 0 -class RemoteCommandResult(): - """Stores the command result from a Task object.""" +class LocalTask(): + """A mock ClusterShell.Task object for subprocess command output.""" + + def __init__(self, host, return_code, stdout, stderr, timed_out): + """Create a LocalTask. + + Args: + host (NodeSet): host from which the command was executed + return_code (int): executed command's return code + stdout (str): executed command's stdout + stderr (str): executed command's stderr + timed_out (bool) did the executed command time out + """ + self._return_codes = {return_code: [host]} + self._stdout = {stdout if stdout is not None else '': [host]} + self._stderr = {stderr if stderr is not None else '': [host]} + self._timeout_sources = [] + if timed_out: + self._timeout_sources.append(host) + + def iter_retcodes(self): + """Iterate over return codes of the local command result. + + Yields: + tuple: return code (int), hosts (list) + """ + yield from self._return_codes.items() + + def iter_keys_timeout(self): + """Iterate over hosts that timed out. + + Yields: + str: host where the command timed out + """ + yield from self._timeout_sources + + def iter_buffers(self, match_keys=None): + """Iterate over the command stdout for each host. + + Args: + match_keys (list, optional): only return output matching these hosts. Defaults to None. + + Returns: + tuple: command stdout (str), hosts (list) + """ + for output, hosts in self._stdout.items(): + if match_keys is None or hosts[0] in match_keys: + yield output, hosts + + def iter_errors(self, match_keys=None): + """Iterate over the command stderr for each host. + + Args: + match_keys (list, optional): only return output matching these hosts. Defaults to None. + + Returns: + tuple: command stderr (str), hosts (list) + """ + for output, hosts in self._stderr.items(): + if match_keys is None or hosts[0] in match_keys: + yield output, hosts + + +class CommandResult(): + """Groups of command results from the same hosts with the same return status.""" def __init__(self, command, task): - """Create a RemoteCommandResult object. + """Create a CommandResult object. Args: command (str): command executed task (Task): object containing the results from an executed clush command """ self.output = [] - self._process_task(task, command) + + # Get a dictionary of host list values for each unique return code key + return_codes = dict(task.iter_retcodes()) + + # Add any hosts that timed out using an 124 return code + timed_out_hosts = list(task.iter_keys_timeout()) + if timed_out_hosts and 124 in return_codes: + # To be on the safe side even though we typically wouldn't see 124 from iter_retcodes() + return_codes[124].extend(timed_out_hosts) + elif timed_out_hosts: + return_codes[124] = timed_out_hosts + + # Populate the a list of unique output for each NodeSet + for code in sorted(return_codes): + stdout_data = self.__sanitize_iter_data( + return_codes[code], list(task.iter_buffers(return_codes[code])), '') + + for stdout_raw, stdout_hosts in stdout_data: + # In run_remote(), task.run() is executed with the stderr=False default. + # As a result task.iter_buffers() will return combined stdout and stderr. + stdout = self.__msg_tree_elem_to_list(stdout_raw) + stderr_data = self.__sanitize_iter_data( + stdout_hosts, list(task.iter_errors(stdout_hosts)), '') + for stderr_raw, stderr_hosts in stderr_data: + stderr = self.__msg_tree_elem_to_list(stderr_raw) + self.output.append( + ResultData( + command, code, NodeSet.fromlist(stderr_hosts), stdout, stderr, + bool(code == 124))) + + @staticmethod + def __sanitize_iter_data(hosts, data, default_entry): + """Ensure the data generated from an iter function has entries for each host. + + Args: + hosts (list): lists of host which generated data + data (list): data from an iter function as a list + default_entry (object): entry to add to data for missing hosts in data + + Returns: + list: a list of tuples of entries and list of hosts + """ + if not data: + return [(default_entry, hosts)] + + source_keys = NodeSet.fromlist(hosts) + data_keys = NodeSet() + for _, keys in data: + data_keys.add(NodeSet.fromlist(keys)) + + sanitized_data = data.copy() + missing_keys = source_keys - data_keys + if missing_keys: + sanitized_data.append((default_entry, list(missing_keys))) + return sanitized_data + + @staticmethod + def __msg_tree_elem_to_list(msg_tree_elem): + """Convert a ClusterShell.MsgTree.MsgTreeElem to a list of strings. + + Args: + msg_tree_elem (MsgTreeElem): output from Task.iter_* method. + + Returns: + list: list of strings + """ + msg_tree_elem_list = [] + for line in msg_tree_elem.splitlines(): + if isinstance(line, bytes): + msg_tree_elem_list.append(line.decode("utf-8")) + else: + msg_tree_elem_list.append(line) + return msg_tree_elem_list @property def homogeneous(self): @@ -92,7 +226,6 @@ def homogeneous(self): Returns: bool: if all the hosts produced the same output - """ return len(self.output) == 1 @@ -102,7 +235,6 @@ def passed(self): Returns: bool: if the command was successful on each host - """ all_zero = all(data.passed for data in self.output) return all_zero and not self.timeout @@ -113,7 +245,6 @@ def timeout(self): Returns: bool: True if the command timed out on at least one set of hosts; False otherwise - """ return any(data.timeout for data in self.output) @@ -123,7 +254,6 @@ def passed_hosts(self): Returns: NodeSet: all nodes where the command passed - """ return NodeSet.fromlist(data.hosts for data in self.output if data.returncode == 0) @@ -133,7 +263,6 @@ def failed_hosts(self): Returns: NodeSet: all nodes where the command failed - """ return NodeSet.fromlist(data.hosts for data in self.output if data.returncode != 0) @@ -143,7 +272,6 @@ def all_stdout(self): Returns: dict: the stdout (the values) from each set of hosts (the keys, as a str of the NodeSet) - """ stdout = {} for data in self.output: @@ -156,96 +284,37 @@ def all_stderr(self): Returns: dict: the stderr (the values) from each set of hosts (the keys, as a str of the NodeSet) - """ stderr = {} for data in self.output: stderr[str(data.hosts)] = '\n'.join(data.stderr) return stderr - def _process_task(self, task, command): - """Populate the output list and determine the passed result for the specified task. - - Args: - task (Task): a ClusterShell.Task.Task object for the executed command - command (str): the executed command - """ - # Get a dictionary of host list values for each unique return code key - results = dict(task.iter_retcodes()) - - # Get a list of any hosts that timed out - timed_out = [str(hosts) for hosts in task.iter_keys_timeout()] - - # Populate the a list of unique output for each NodeSet - for code in sorted(results): - stdout_data = self._sanitize_iter_data( - results[code], list(task.iter_buffers(results[code])), '') - - for stdout_raw, stdout_hosts in stdout_data: - # In run_remote(), task.run() is executed with the stderr=False default. - # As a result task.iter_buffers() will return combined stdout and stderr. - stdout = self._msg_tree_elem_to_list(stdout_raw) - stderr_data = self._sanitize_iter_data( - stdout_hosts, list(task.iter_errors(stdout_hosts)), '') - for stderr_raw, stderr_hosts in stderr_data: - stderr = self._msg_tree_elem_to_list(stderr_raw) - self.output.append( - ResultData( - command, code, NodeSet.fromlist(stderr_hosts), stdout, stderr, False)) - if timed_out: - self.output.append( - ResultData(command, 124, NodeSet.fromlist(timed_out), None, None, True)) - - @staticmethod - def _sanitize_iter_data(hosts, data, default_entry): - """Ensure the data generated from an iter function has entries for each host. - - Args: - hosts (list): lists of host which generated data - data (list): data from an iter function as a list - default_entry (object): entry to add to data for missing hosts in data + @property + def joined_stdout(self): + """Get all of the stdout from the issued command from each host joined by newlines. Returns: - list: a list of tuples of entries and list of hosts + str: all of the stdout from each host joined by newlines """ - if not data: - return [(default_entry, hosts)] - - source_keys = NodeSet.fromlist(hosts) - data_keys = NodeSet() - for _, keys in data: - data_keys.add(NodeSet.fromlist(keys)) - - sanitized_data = data.copy() - missing_keys = source_keys - data_keys - if missing_keys: - sanitized_data.append((default_entry, list(missing_keys))) - return sanitized_data - - @staticmethod - def _msg_tree_elem_to_list(msg_tree_elem): - """Convert a ClusterShell.MsgTree.MsgTreeElem to a list of strings. + all_stdout = self.all_stdout + return '\n'.join(filter(None, [all_stdout[key] for key in sorted(all_stdout)])) - Args: - msg_tree_elem (MsgTreeElem): output from Task.iter_* method. + @property + def joined_stderr(self): + """Get all of the stderr from the issued command from each host joined by newlines. Returns: - list: list of strings + str: all of the stderr from each host joined by newlines """ - msg_tree_elem_list = [] - for line in msg_tree_elem.splitlines(): - if isinstance(line, bytes): - msg_tree_elem_list.append(line.decode("utf-8")) - else: - msg_tree_elem_list.append(line) - return msg_tree_elem_list + all_stderr = self.all_stderr + return '\n'.join(filter(None, [all_stderr[key] for key in sorted(all_stderr)])) def log_output(self, log): """Log the command result. Args: log (logger): logger for the messages produced by this method - """ for data in self.output: log_result_data(log, data) @@ -289,7 +358,6 @@ def get_clush_command(hosts, args=None, command="", command_env=None, command_su Returns: str: the clush command - """ cmd_list = ["clush"] if args: @@ -301,40 +369,34 @@ def get_clush_command(hosts, args=None, command="", command_env=None, command_su return " ".join(cmd_list) -def run_local(log, command, capture_output=True, timeout=None, check=False, verbose=True): - """Run the command locally. +def run_local(log, command, verbose=True, timeout=None, stderr=False, capture_output=True): + """Run the command on the local host. Args: log (logger): logger for the messages produced by this method command (str): command from which to obtain the output - capture_output(bool, optional): whether or not to include the command output in the - subprocess.CompletedProcess.stdout returned by this method. Defaults to True. + verbose (bool, optional): log the command output. Defaults to True. timeout (int, optional): number of seconds to wait for the command to complete. Defaults to None. - check (bool, optional): if set the method will raise an exception if the command does not - yield a return code equal to zero. Defaults to False. - verbose (bool, optional): if set log the output of the command (capture_output must also be - set). Defaults to True. - - Raises: - RunException: if the command fails: times out (timeout must be specified), - yields a non-zero exit status (check must be True), is interrupted by the user, or - encounters some other exception. + stderr (bool, optional): whether to enable stdout/stderr separation. Defaults to False. + capture_output (bool, optional): whether to include stdout/stderr in the CommandResult. + Defaults to True. Returns: - subprocess.CompletedProcess: an object representing the result of the command execution with - the following properties: - - args (the command argument) - - returncode - - stdout (only set if capture_output=True) - - stderr (not used; included in stdout) - + CommandResult: groups of command results from the same hosts with the same return status """ - local_host = gethostname().split(".")[0] - kwargs = {"encoding": "utf-8", "shell": False, "check": check, "timeout": timeout} + local_host = NodeSet(gethostname().split(".")[0]) + kwargs = { + "encoding": "utf-8", + "shell": True, + "check": False, + "timeout": timeout, + "env": os.environ.copy() + } if capture_output: kwargs["stdout"] = subprocess.PIPE - kwargs["stderr"] = subprocess.STDOUT + kwargs["stderr"] = subprocess.PIPE if stderr else subprocess.STDOUT + if timeout and verbose: log.debug("Running on %s with a %s timeout: %s", local_host, timeout, command) elif verbose: @@ -342,43 +404,27 @@ def run_local(log, command, capture_output=True, timeout=None, check=False, verb try: # pylint: disable=subprocess-run-check - result = subprocess.run(shlex.split(command), **kwargs) # nosec + process = subprocess.run(command, **kwargs) # nosec + task = LocalTask(local_host, process.returncode, process.stdout, process.stderr, False) except subprocess.TimeoutExpired as error: # Raised if command times out - log.debug(str(error)) - log.debug(" output: %s", error.output) - log.debug(" stderr: %s", error.stderr) - raise RunException(f"Command '{command}' exceed {timeout}s timeout") from error - - except subprocess.CalledProcessError as error: - # Raised if command yields a non-zero return status with check=True - log.debug(str(error)) - log.debug(" output: %s", error.output) - log.debug(" stderr: %s", error.stderr) - raise RunException(f"Command '{command}' returned non-zero status") from error - - except KeyboardInterrupt as error: - # User Ctrl-C - message = f"Command '{command}' interrupted by user" - log.debug(message) - raise RunException(message) from error - - except Exception as error: + task = LocalTask(local_host, 124, error.stdout, error.stderr, True) + + except Exception as error: # pylint: disable=broad-except # Catch all - message = f"Command '{command}' encountered unknown error" - log.debug(message) - log.debug(str(error)) - raise RunException(message) from error + task = LocalTask(local_host, 255, None, str(error), False) + results = CommandResult(command, task) if capture_output and verbose: - # Log the output of the command - log.debug(" %s (rc=%s):", local_host, result.returncode) - if result.stdout: - for line in result.stdout.splitlines(): - log.debug(" %s", line) - - return result + # Log any captured command output when requested + results.log_output(log) + elif capture_output: + # Always log any failed commands whose output was captured + for data in results.output: + if not data.passed: + log_result_data(log, data) + return results def run_remote(log, hosts, command, verbose=True, timeout=120, task_debug=False, stderr=False, @@ -398,9 +444,7 @@ def run_remote(log, hosts, command, verbose=True, timeout=120, task_debug=False, clush default (64) or available cores Returns: - RemoteCommandResult: a grouping of the command results from the same hosts with the same - return status - + CommandResult: groups of command results from the same hosts with the same return status """ task = task_self() task.set_info('debug', task_debug) @@ -417,7 +461,7 @@ def run_remote(log, hosts, command, verbose=True, timeout=120, task_debug=False, else: log.debug("Running on %s with a %s second timeout: %s", hosts, timeout, command) task.run(command=command, nodes=hosts, timeout=timeout) - results = RemoteCommandResult(command, task) + results = CommandResult(command, task) if verbose: results.log_output(log) else: @@ -439,9 +483,8 @@ def command_as_user(command, user, env=None): Returns: str: command adjusted to run as another user - """ - if not user: + if not user or user == getuser(): if not env: return command return " ".join([env.to_export_str(), command]).strip() @@ -469,7 +512,6 @@ def find_command(source, pattern, depth, other=None): Returns: str: the find command - """ command = ["find", source, "-maxdepth", str(depth), "-type", "f", "-name", f"'{pattern}'"] if isinstance(other, list): @@ -498,7 +540,6 @@ def stop_processes(log, hosts, pattern, verbose=True, timeout=60, exclude=None, matching the pattern were initially detected and the second NodeSet indicates on which hosts the processes matching the pattern are still running (will be empty if every process was killed or no process matching the pattern were found). - """ processes_detected = NodeSet() processes_running = NodeSet() diff --git a/src/tests/ftest/util/server_utils.py b/src/tests/ftest/util/server_utils.py index 15e9f08ae30..299103fd577 100644 --- a/src/tests/ftest/util/server_utils.py +++ b/src/tests/ftest/util/server_utils.py @@ -395,9 +395,7 @@ def scm_prepare(self, **kwargs): DaosServerCommand.ScmSubCommand.PrepareSubCommand object Raises: - RemoteCommandResult: a grouping of the command results from the same hosts with the same - return status - + CommandResult: groups of command results from the same hosts with the same return status """ cmd = DaosServerCommand(self.manager.job.command_path) cmd.sudo = False @@ -420,9 +418,7 @@ def scm_reset(self, **kwargs): DaosServerCommand.ScmSubCommand.ResetSubCommand object Raises: - RemoteCommandResult: a grouping of the command results from the same hosts with the same - return status - + CommandResult: groups of command results from the same hosts with the same return status """ cmd = DaosServerCommand(self.manager.job.command_path) cmd.sudo = False @@ -440,9 +436,7 @@ def nvme_prepare(self, **kwargs): DaosServerCommand.NvmeSubCommand.PrepareSubCommand object Returns: - RemoteCommandResult: a grouping of the command results from the same hosts with the same - return status - + CommandResult: groups of command results from the same hosts with the same return status """ cmd = DaosServerCommand(self.manager.job.command_path) cmd.sudo = False @@ -460,9 +454,7 @@ def support_collect_log(self, **kwargs): DaosServerCommand.SupportSubCommand.CollectLogSubCommand object Returns: - RemoteCommandResult: a grouping of the command results from the same hosts with the same - return status - + CommandResult: groups of command results from the same hosts with the same return status """ cmd = DaosServerCommand(self.manager.job.command_path) cmd.run_user = "daos_server" diff --git a/src/tests/ftest/util/slurm_utils.py b/src/tests/ftest/util/slurm_utils.py index 9ba92b0f8c9..f2b4de4b393 100644 --- a/src/tests/ftest/util/slurm_utils.py +++ b/src/tests/ftest/util/slurm_utils.py @@ -1,5 +1,5 @@ """ -(C) Copyright 2019-2023 Intel Corporation. +(C) Copyright 2019-2024 Intel Corporation. SPDX-License-Identifier: BSD-2-Clause-Patent """ @@ -11,7 +11,7 @@ from ClusterShell.NodeSet import NodeSet, NodeSetParseError # pylint: disable=import-error,no-name-in-module -from util.run_utils import RunException, run_local, run_remote +from util.run_utils import run_local, run_remote PACKAGES = ['slurm', 'slurm-example-configs', 'slurm-slurmctld', 'slurm-slurmd'] W_LOCK = threading.Lock() @@ -30,8 +30,7 @@ def cancel_jobs(log, control, job_id): job_id (int): slurm job id Returns: - RemoteCommandResult: results from the scancel command - + CommandResult: results from the scancel command """ command = ['scancel', str(job_id)] return run_remote(log, control, ' '.join(command)) @@ -52,8 +51,7 @@ def create_partition(log, control, name, hosts, default='yes', max_time='UNLIMIT state (str, optional): state of jobs that can be allocated. Defaults to 'up'. Returns: - RemoteCommandResult: results from the scontrol command - + CommandResult: results from the scontrol command """ command = ['scontrol', 'create'] command.append('='.join(['PartitionName', str(name)])) @@ -90,8 +88,7 @@ def show_partition(log, control, name): name (str): slurm partition name Returns: - RemoteCommandResult: results from the scontrol command - + CommandResult: results from the scontrol command """ command = ['scontrol', 'show', 'partition', str(name)] return run_remote(log, control, ' '.join(command)) @@ -106,8 +103,7 @@ def show_reservation(log, control, name): name (str): slurm reservation name Returns: - RemoteCommandResult: results from the scontrol command - + CommandResult: results from the scontrol command """ command = ['scontrol', 'show', 'reservation', str(name)] return run_remote(log, control, ' '.join(command)) @@ -121,8 +117,7 @@ def sinfo(log, control): control (NodeSet): slurm control host Returns: - RemoteCommandResult: results from the sinfo command - + CommandResult: results from the sinfo command """ return run_remote(log, control, 'sinfo') @@ -137,7 +132,6 @@ def sbatch(log, script, log_file=None): Returns: CommandResult: results from the sbatch command - """ command = ['sbatch'] if log_file: @@ -285,7 +279,7 @@ def run_slurm_script(log, script, logfile=None): """ job_id = None result = sbatch(log, script, logfile) - match = re.search(r"Submitted\s+batch\s+job\s+(\d+)", result.stdout) + match = re.search(r"Submitted\s+batch\s+job\s+(\d+)", result.joined_stdout) if match is not None: job_id = match.group(1) else: @@ -306,14 +300,12 @@ def check_slurm_job(log, handle): """ state = "UNKNOWN" - command = ["scontrol", "show", "job", handle] - try: - result = run_local(log, ' '.join(command), verbose=False, check=True) - match = re.search(r"JobState=([a-zA-Z]+)", result.stdout) + command = f"scontrol show job {handle}" + result = run_local(log, command, verbose=False) + if result.passed: + match = re.search(r"JobState=([a-zA-Z]+)", result.joined_stdout) if match is not None: state = match.group(1) - except RunException as error: - log.debug(str(error)) return state @@ -401,8 +393,7 @@ def srun(log, control, hosts, cmd, srun_params=None, timeout=60): timeout (int, optional): timeout for the srun command. Defaults to 60. Returns: - RemoteCommandResult: results from the srun command - + CommandResult: results from the srun command """ srun_time = max(int(timeout / 60), 1) cmd = srun_str(hosts, cmd, srun_params, str(srun_time)) diff --git a/src/tests/ftest/util/soak_test_base.py b/src/tests/ftest/util/soak_test_base.py index a97bb2094eb..b5e35949218 100644 --- a/src/tests/ftest/util/soak_test_base.py +++ b/src/tests/ftest/util/soak_test_base.py @@ -21,7 +21,7 @@ from exception_utils import CommandFailure from general_utils import journalctl_time from host_utils import get_local_host -from run_utils import RunException, run_local, run_remote +from run_utils import run_local, run_remote from soak_utils import (SoakTestError, add_pools, build_job_script, cleanup_dfuse, create_app_cmdline, create_dm_cmdline, create_fio_cmdline, create_ior_cmdline, create_macsio_cmdline, create_mdtest_cmdline, @@ -137,11 +137,9 @@ def pre_tear_down(self): self.log.info("<>", job_id) cmd = "scancel --partition {} -u {} {}".format( self.host_info.clients.partition.name, self.username, job_id) - try: - run_local(self.log, cmd, timeout=120) - except RunException as error: + if not run_local(self.log, cmd, timeout=120).passed: # Exception was raised due to a non-zero exit status - errors.append(f"Failed to cancel jobs {self.failed_job_id_list}: {error}") + errors.append(f"Failed to cancel jobs {self.failed_job_id_list}") if self.all_failed_jobs: errors.append("SOAK FAILED: The following jobs failed {} ".format( " ,".join(str(j_id) for j_id in self.all_failed_jobs))) @@ -473,11 +471,10 @@ def job_completion(self, job_id_list): if not result.passed: self.log.error("Remote copy failed on %s", str(result.failed_hosts)) # copy the local files; local host not included in hostlist_client - try: - run_local(self.log, cmd, timeout=600) - run_local(self.log, cmd2, timeout=600) - except RunException as error: - self.log.info("Local copy failed with %s", error) + if not run_local(self.log, cmd, timeout=600).passed: + self.log.info("Local copy failed: %s", cmd) + if not run_local(self.log, cmd2, timeout=600).passed: + self.log.info("Local copy failed: %s", cmd2) self.soak_results = {} return job_id_list @@ -604,11 +601,8 @@ def run_soak(self, test_param): " ".join([pool.identifier for pool in self.pool])) # cleanup soak log directories before test - try: - run_local(self.log, f"rm -rf {self.soak_dir}/*", timeout=300) - except RunException as error: - raise SoakTestError( - f"<>") from error + if not run_local(self.log, f"rm -rf {self.soak_dir}/*", timeout=300).passed: + raise SoakTestError(f"<>") if self.enable_remote_logging: result = run_remote( self.log, self.hostlist_clients, f"rm -rf {self.soak_dir}/*", timeout=300) @@ -616,11 +610,9 @@ def run_soak(self, test_param): raise SoakTestError( f"<> {str(result.failed_hosts)}") else: - try: - run_local(self.log, f"rm -rf {self.sharedsoak_dir}/*", timeout=300) - except RunException as error: + if not run_local(self.log, f"rm -rf {self.sharedsoak_dir}/*", timeout=300).passed: raise SoakTestError( - f"<>") from error + f"<>") # Baseline metrics data run_metrics_check(self, prefix="initial") # Initialize time diff --git a/src/tests/ftest/util/user_utils.py b/src/tests/ftest/util/user_utils.py index 368a25862db..ce5a58ed4f7 100644 --- a/src/tests/ftest/util/user_utils.py +++ b/src/tests/ftest/util/user_utils.py @@ -1,5 +1,5 @@ """ - (C) Copyright 2018-2023 Intel Corporation. + (C) Copyright 2018-2024 Intel Corporation. SPDX-License-Identifier: BSD-2-Clause-Patent """ @@ -81,8 +81,7 @@ def getent(log, hosts, database, key, sudo=False): sudo (bool): whether to execute commands with sudo Returns: - RemoteCommandResult: result of run_remote() - + CommandResult: groups of command results from the same hosts with the same return status """ command = ' '.join(filter(None, [ 'sudo -n' if sudo else None, @@ -103,8 +102,7 @@ def groupadd(log, hosts, group, force=False, sudo=False): sudo (bool, optional): whether to execute commands with sudo. Default is False Returns: - RemoteCommandResult: result of run_remote() - + CommandResult: groups of command results from the same hosts with the same return status """ command = ' '.join(filter(None, [ 'sudo -n' if sudo else None, @@ -127,8 +125,7 @@ def useradd(log, hosts, user, group=None, parent_dir=None, sudo=False): sudo (bool): whether to execute commands with sudo. Default is False Returns: - RemoteCommandResult: result of run_remote() - + CommandResult: groups of command results from the same hosts with the same return status """ command = ' '.join(filter(None, [ 'sudo -n' if sudo else None, @@ -150,8 +147,7 @@ def userdel(log, hosts, user, sudo=False): sudo (bool): whether to execute commands with sudo. Default is False Returns: - RemoteCommandResult: result of run_remote() - + CommandResult: groups of command results from the same hosts with the same return status """ command = ' '.join(filter(None, [ 'sudo -n' if sudo else None, diff --git a/src/tests/ftest/verify_perms.py b/src/tests/ftest/verify_perms.py index 1596a5664e9..21b3a986c03 100755 --- a/src/tests/ftest/verify_perms.py +++ b/src/tests/ftest/verify_perms.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 """ - (C) Copyright 2022-2023 Intel Corporation. + (C) Copyright 2022-2024 Intel Corporation. SPDX-License-Identifier: BSD-2-Clause-Patent """ @@ -17,7 +17,7 @@ # pylint: disable=import-error,no-name-in-module from util.logger_utils import get_console_handler -from util.run_utils import RunException, run_local +from util.run_utils import run_local from util.user_utils import get_user_uid_gid # Set up a logger for the console messages @@ -290,10 +290,7 @@ def _real_x(entry_type, path): ''' if entry_type == 'file': - try: - return run_local(logger, path, check=True, verbose=False).returncode == 0 - except RunException: - return False + return run_local(logger, path, verbose=False).passed if entry_type == 'dir': try: os.chdir(path)