From 3033979789ea012c4bb1fe9364449cda4d5dabc3 Mon Sep 17 00:00:00 2001 From: Dalton Bohning Date: Wed, 10 Jan 2024 12:57:22 -0800 Subject: [PATCH 1/2] DAOS-14654 test: simplify ior_per_rank.py (#13346) - Only run with transfer size 1M - Reduce stonewall to 15s Required-githooks: true Signed-off-by: Dalton Bohning --- src/tests/ftest/deployment/ior_per_rank.py | 108 +++++++++---------- src/tests/ftest/deployment/ior_per_rank.yaml | 10 +- 2 files changed, 58 insertions(+), 60 deletions(-) diff --git a/src/tests/ftest/deployment/ior_per_rank.py b/src/tests/ftest/deployment/ior_per_rank.py index f914216f326..5a8463cb940 100644 --- a/src/tests/ftest/deployment/ior_per_rank.py +++ b/src/tests/ftest/deployment/ior_per_rank.py @@ -5,6 +5,7 @@ """ from avocado.core.exceptions import TestFail +from ClusterShell.NodeSet import NodeSet from general_utils import DaosTestError, percent_change from ior_test_base import IorTestBase from ior_utils import IorCommand, IorMetrics @@ -32,53 +33,52 @@ def execute_ior_per_rank(self, rank): self.log.info("Running Test on rank: %s", rank) # create the pool on specified rank. self.add_pool(connect=False, target_list=[rank]) + self.container = self.get_container(self.pool) + + host = self.server_managers[0].get_host(rank) + + # execute ior on given rank and collect the results + try: + self.ior_cmd.flags.update(self.write_flags) + dfs_out = self.run_ior_with_pool(create_cont=False, fail_on_warning=self.log.info) + dfs_perf_write = IorCommand.get_ior_metrics(dfs_out) + self.ior_cmd.flags.update(self.read_flags) + dfs_out = self.run_ior_with_pool(create_cont=False, fail_on_warning=self.log.info) + dfs_perf_read = IorCommand.get_ior_metrics(dfs_out) + + # Destroy container, to be sure we use newly created container in next iteration + self.container.destroy() + self.container = None + + # gather actual and expected perf data to be compared + dfs_max_write = float(dfs_perf_write[0][IorMetrics.MAX_MIB]) + dfs_max_read = float(dfs_perf_read[0][IorMetrics.MAX_MIB]) + actual_write_x = abs(percent_change(self.expected_bw, dfs_max_write)) + actual_read_x = abs(percent_change(self.expected_bw, dfs_max_read)) + + # verify write performance + if actual_write_x > self.write_x: + if host not in self.failed_nodes: + self.failed_nodes[host] = [] + self.failed_nodes[host].append( + f"rank {rank} low write perf. " + f"BW: {dfs_max_write:.2f}/{self.expected_bw:.2f}; " + f"percent diff: {actual_write_x:.2f}/{self.write_x:.2f}") + + # verify read performance + if actual_read_x > self.read_x: + if host not in self.failed_nodes: + self.failed_nodes[host] = [] + self.failed_nodes[host].append( + f"rank {rank} low read perf. " + f"BW: {dfs_max_read:.2f}/{self.expected_bw:.2f}; " + f"percent diff: {actual_read_x:.2f}/{self.read_x:.2f}") + + except (TestFail, DaosTestError) as error: + if host not in self.failed_nodes: + self.failed_nodes[host] = [] + self.failed_nodes[host].append(str(error)) - # execute ior on given rank for different transfer sizes and collect the results - for idx, transfer_size in enumerate(self.transfer_sizes): - try: - self.ior_cmd.transfer_size.update(transfer_size) - self.ior_cmd.flags.update(self.write_flags) - dfs_out = self.run_ior_with_pool(fail_on_warning=self.log.info) - dfs_perf_write = IorCommand.get_ior_metrics(dfs_out) - self.ior_cmd.flags.update(self.read_flags) - dfs_out = self.run_ior_with_pool(create_cont=False, fail_on_warning=self.log.info) - dfs_perf_read = IorCommand.get_ior_metrics(dfs_out) - - # Destroy container, to be sure we use newly created container in next iteration - self.container.destroy() - self.container = None - - # gather actual and expected perf data to be compared - if idx == 0: - dfs_max_write = float(dfs_perf_write[0][IorMetrics.MAX_MIB]) - dfs_max_read = float(dfs_perf_read[0][IorMetrics.MAX_MIB]) - actual_write_x = percent_change(dfs_max_write, self.expected_bw) - actual_read_x = percent_change(dfs_max_read, self.expected_bw) - else: - dfs_max_write = float(dfs_perf_write[0][IorMetrics.MAX_OPS]) - dfs_max_read = float(dfs_perf_read[0][IorMetrics.MAX_OPS]) - actual_write_x = percent_change(dfs_max_write, self.expected_iops) - actual_read_x = percent_change(dfs_max_read, self.expected_iops) - - # compare actual and expected perf data - self.assertLessEqual(abs(actual_write_x), self.write_x, - "Max Write Diff too large for rank: {}".format(rank)) - self.assertLessEqual(abs(actual_read_x), self.read_x, - "Max Read Diff too large for rank: {}".format(rank)) - # collect list of good nodes - good_node = self.server_managers[0].get_host(rank) - if ((good_node not in self.good_nodes) - and (good_node not in self.failed_nodes)): - self.good_nodes.append(good_node) - except (TestFail, DaosTestError): - # collect bad nodes - failed_node = self.server_managers[0].get_host(rank) - if failed_node not in self.failed_nodes: - self.failed_nodes[failed_node] = [rank] - else: - self.failed_nodes[failed_node].append(rank) - if failed_node in self.good_nodes: - self.good_nodes.remove(failed_node) # Destroy pool, to be sure we use newly created pool in next iteration self.pool.destroy() self.pool = None @@ -100,8 +100,6 @@ def test_ior_per_rank(self): # test params self.failed_nodes = {} - self.good_nodes = [] - self.transfer_sizes = self.params.get("transfer_sizes", self.ior_cmd.namespace) self.write_flags = self.params.get("write_flags", self.ior_cmd.namespace) self.read_flags = self.params.get("read_flags", self.ior_cmd.namespace) @@ -122,13 +120,15 @@ def test_ior_per_rank(self): for rank in rank_list: self.execute_ior_per_rank(rank) - # list of good nodes - if self.good_nodes: - self.log.info("List of good nodes: %s", self.good_nodes) + # the good nodes are any that did not fail + good_nodes = self.hostlist_servers - NodeSet.fromlist(self.failed_nodes.keys()) + if good_nodes: + self.log.info("Good nodes: %s", good_nodes) # list the failed node and the rank number associated with that node if self.failed_nodes: - self.log.info("List of failed ranks with corresponding nodes") - for node, rank in self.failed_nodes.items(): - self.log.info("Node: %s, Rank: %s", node, rank) + self.log.info("List of failed nodes with corresponding ranks") + for node, reason_list in self.failed_nodes.items(): + for reason in reason_list: + self.log.info("%s: %s", node, reason) self.fail("Performance check failed for one or more nodes") diff --git a/src/tests/ftest/deployment/ior_per_rank.yaml b/src/tests/ftest/deployment/ior_per_rank.yaml index 406ef6dfff9..3a45226b5ca 100644 --- a/src/tests/ftest/deployment/ior_per_rank.yaml +++ b/src/tests/ftest/deployment/ior_per_rank.yaml @@ -23,12 +23,12 @@ server_config: pool: mode: 146 size: 750G # Cannot use percentage, as it does not work when using pool create for per rank. - control_method: dmg properties: ec_cell_sz:128KiB container: type: POSIX properties: cksum:crc16,cksum_size:16384,srv_cksum:on control_method: daos + oclass: SX ior: client_processes: ppn: 32 @@ -36,13 +36,11 @@ ior: test_file: testFile write_flags: "-w -C -e -g -G 27 -k -Q 1" read_flags: "-r -R -C -e -g -G 27 -k -Q 1" - sw_deadline: 30 + sw_deadline: 15 sw_wearout: 1 sw_status_file: "/var/tmp/daos_testing/stoneWallingStatusFile" - dfs_oclass: 'SX' - transfer_sizes: - - 1M - - 256B + dfs_oclass: SX + transfer_size: 1M block_size: 150G # 0.5 only for CI, due to the varying nature of different clusters in CI. # Change it to 15% (0.15) for Aurora. From a2752efcbeb783298e20dac6fc0b62fb1c027638 Mon Sep 17 00:00:00 2001 From: Dalton Bohning Date: Tue, 23 Jan 2024 16:40:00 +0000 Subject: [PATCH 2/2] DAOS-13078 test: trigger test Test-tag: test_ior_per_rank Test-repeat: 2 Skip-unit-tests: true Skip-fault-injection-test: true Required-githooks: true Signed-off-by: Dalton Bohning