DAOS-14654 test: simplify ior_per_rank.py (#13346) (#13650)
- Only run with transfer size 1M
- Reduce stonewall to 15s

Signed-off-by: Dalton Bohning <[email protected]>
daltonbohning authored Jan 24, 2024
1 parent 8b5ffa6 commit 50720aa
Showing 2 changed files with 58 additions and 60 deletions.
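
A rough back-of-the-envelope sketch of why the two commit-message bullets shrink the runtime: each rank runs a stonewalled write phase and a read phase, and that sequence previously repeated once per transfer size. Assuming the stonewall deadline dominates each phase (setup, teardown, and wearout ignored), the numbers below come straight from the yaml diff; the formula itself is an illustration, not code from the commit.

    # Rough lower bound on stonewalled I/O time per rank.
    phases = 2  # one write phase + one read phase

    old_seconds = 2 * phases * 30  # two transfer sizes (1M, 256B), sw_deadline: 30
    new_seconds = 1 * phases * 15  # one transfer size (1M), sw_deadline: 15

    print(old_seconds, new_seconds)  # 120 30

So the stonewalled portion drops from roughly two minutes to roughly thirty seconds per rank, which compounds across every rank in the system.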
108 changes: 54 additions & 54 deletions src/tests/ftest/deployment/ior_per_rank.py
@@ -5,6 +5,7 @@
 """
 
 from avocado.core.exceptions import TestFail
+from ClusterShell.NodeSet import NodeSet
 from general_utils import DaosTestError, percent_change
 from ior_test_base import IorTestBase
 from ior_utils import IorCommand, IorMetrics
@@ -32,53 +33,52 @@ def execute_ior_per_rank(self, rank):
         self.log.info("Running Test on rank: %s", rank)
         # create the pool on specified rank.
         self.add_pool(connect=False, target_list=[rank])
+        self.container = self.get_container(self.pool)
 
+        host = self.server_managers[0].get_host(rank)
+
+        # execute ior on given rank and collect the results
+        try:
+            self.ior_cmd.flags.update(self.write_flags)
+            dfs_out = self.run_ior_with_pool(create_cont=False, fail_on_warning=self.log.info)
+            dfs_perf_write = IorCommand.get_ior_metrics(dfs_out)
+            self.ior_cmd.flags.update(self.read_flags)
+            dfs_out = self.run_ior_with_pool(create_cont=False, fail_on_warning=self.log.info)
+            dfs_perf_read = IorCommand.get_ior_metrics(dfs_out)
+
+            # Destroy container, to be sure we use newly created container in next iteration
+            self.container.destroy()
+            self.container = None
+
+            # gather actual and expected perf data to be compared
+            dfs_max_write = float(dfs_perf_write[0][IorMetrics.MAX_MIB])
+            dfs_max_read = float(dfs_perf_read[0][IorMetrics.MAX_MIB])
+            actual_write_x = abs(percent_change(self.expected_bw, dfs_max_write))
+            actual_read_x = abs(percent_change(self.expected_bw, dfs_max_read))
+
+            # verify write performance
+            if actual_write_x > self.write_x:
+                if host not in self.failed_nodes:
+                    self.failed_nodes[host] = []
+                self.failed_nodes[host].append(
+                    f"rank {rank} low write perf. "
+                    f"BW: {dfs_max_write:.2f}/{self.expected_bw:.2f}; "
+                    f"percent diff: {actual_write_x:.2f}/{self.write_x:.2f}")
+
+            # verify read performance
+            if actual_read_x > self.read_x:
+                if host not in self.failed_nodes:
+                    self.failed_nodes[host] = []
+                self.failed_nodes[host].append(
+                    f"rank {rank} low read perf. "
+                    f"BW: {dfs_max_read:.2f}/{self.expected_bw:.2f}; "
+                    f"percent diff: {actual_read_x:.2f}/{self.read_x:.2f}")
+
+        except (TestFail, DaosTestError) as error:
+            if host not in self.failed_nodes:
+                self.failed_nodes[host] = []
+            self.failed_nodes[host].append(str(error))
+
-        # execute ior on given rank for different transfer sizes and collect the results
-        for idx, transfer_size in enumerate(self.transfer_sizes):
-            try:
-                self.ior_cmd.transfer_size.update(transfer_size)
-                self.ior_cmd.flags.update(self.write_flags)
-                dfs_out = self.run_ior_with_pool(fail_on_warning=self.log.info)
-                dfs_perf_write = IorCommand.get_ior_metrics(dfs_out)
-                self.ior_cmd.flags.update(self.read_flags)
-                dfs_out = self.run_ior_with_pool(create_cont=False, fail_on_warning=self.log.info)
-                dfs_perf_read = IorCommand.get_ior_metrics(dfs_out)
-
-                # Destroy container, to be sure we use newly created container in next iteration
-                self.container.destroy()
-                self.container = None
-
-                # gather actual and expected perf data to be compared
-                if idx == 0:
-                    dfs_max_write = float(dfs_perf_write[0][IorMetrics.MAX_MIB])
-                    dfs_max_read = float(dfs_perf_read[0][IorMetrics.MAX_MIB])
-                    actual_write_x = percent_change(dfs_max_write, self.expected_bw)
-                    actual_read_x = percent_change(dfs_max_read, self.expected_bw)
-                else:
-                    dfs_max_write = float(dfs_perf_write[0][IorMetrics.MAX_OPS])
-                    dfs_max_read = float(dfs_perf_read[0][IorMetrics.MAX_OPS])
-                    actual_write_x = percent_change(dfs_max_write, self.expected_iops)
-                    actual_read_x = percent_change(dfs_max_read, self.expected_iops)
-
-                # compare actual and expected perf data
-                self.assertLessEqual(abs(actual_write_x), self.write_x,
-                                     "Max Write Diff too large for rank: {}".format(rank))
-                self.assertLessEqual(abs(actual_read_x), self.read_x,
-                                     "Max Read Diff too large for rank: {}".format(rank))
-                # collect list of good nodes
-                good_node = self.server_managers[0].get_host(rank)
-                if ((good_node not in self.good_nodes)
-                        and (good_node not in self.failed_nodes)):
-                    self.good_nodes.append(good_node)
-            except (TestFail, DaosTestError):
-                # collect bad nodes
-                failed_node = self.server_managers[0].get_host(rank)
-                if failed_node not in self.failed_nodes:
-                    self.failed_nodes[failed_node] = [rank]
-                else:
-                    self.failed_nodes[failed_node].append(rank)
-                if failed_node in self.good_nodes:
-                    self.good_nodes.remove(failed_node)
         # Destroy pool, to be sure we use newly created pool in next iteration
         self.pool.destroy()
         self.pool = None
@@ -100,8 +100,6 @@ def test_ior_per_rank(self):
 
         # test params
         self.failed_nodes = {}
-        self.good_nodes = []
-        self.transfer_sizes = self.params.get("transfer_sizes", self.ior_cmd.namespace)
         self.write_flags = self.params.get("write_flags", self.ior_cmd.namespace)
         self.read_flags = self.params.get("read_flags", self.ior_cmd.namespace)
 
@@ -122,13 +120,15 @@
         for rank in rank_list:
             self.execute_ior_per_rank(rank)
 
-        # list of good nodes
-        if self.good_nodes:
-            self.log.info("List of good nodes: %s", self.good_nodes)
+        # the good nodes are any that did not fail
+        good_nodes = self.hostlist_servers - NodeSet.fromlist(self.failed_nodes.keys())
+        if good_nodes:
+            self.log.info("Good nodes: %s", good_nodes)
 
         # list the failed node and the rank number associated with that node
         if self.failed_nodes:
-            self.log.info("List of failed ranks with corresponding nodes")
-            for node, rank in self.failed_nodes.items():
-                self.log.info("Node: %s, Rank: %s", node, rank)
+            self.log.info("List of failed nodes with corresponding ranks")
+            for node, reason_list in self.failed_nodes.items():
+                for reason in reason_list:
+                    self.log.info("%s: %s", node, reason)
             self.fail("Performance check failed for one or more nodes")
10 changes: 4 additions & 6 deletions src/tests/ftest/deployment/ior_per_rank.yaml
@@ -23,26 +23,24 @@ server_config:
 pool:
   mode: 146
   size: 750G  # Cannot use percentage, as it does not work when using pool create for per rank.
   control_method: dmg
   properties: ec_cell_sz:128KiB
 container:
   type: POSIX
   properties: cksum:crc16,cksum_size:16384,srv_cksum:on
-  control_method: daos
+  oclass: SX
 ior:
   client_processes:
     ppn: 32
   api: DFS
   test_file: testFile
   write_flags: "-w -C -e -g -G 27 -k -Q 1"
   read_flags: "-r -R -C -e -g -G 27 -k -Q 1"
-  sw_deadline: 30
+  sw_deadline: 15
   sw_wearout: 1
   sw_status_file: "/var/tmp/daos_testing/stoneWallingStatusFile"
-  dfs_oclass: 'SX'
-  transfer_sizes:
-    - 1M
-    - 256B
+  dfs_oclass: SX
+  transfer_size: 1M
   block_size: 150G
   # 0.5 only for CI, due to the varying nature of different clusters in CI.
   # Change it to 15% (0.15) for Aurora.
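
One non-obvious pairing in this yaml: block_size stays at 150G per process even though sw_deadline drops to 15. With ior stonewalling, the deadline ends each phase rather than the block size; the large block simply ensures no process exhausts its data before the deadline fires. A sketch under an assumed, made-up per-node bandwidth:

    ppn = 32                      # from the yaml above
    block_size_gib = 150          # per-process block_size: 150G
    assumed_node_bw_gib_s = 10.0  # arbitrary aggregate bandwidth, for illustration only

    nominal_gib = ppn * block_size_gib               # 4800 GiB if ior ran to completion
    stonewalled_gib = assumed_node_bw_gib_s * 15     # ~150 GiB actually moved in 15s

    print(nominal_gib, stonewalled_gib)  # 4800 150.0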