DAOS-14654 test: simplify ior_per_rank.py (#13346) #13650

Merged: 2 commits, Jan 24, 2024
108 changes: 54 additions & 54 deletions src/tests/ftest/deployment/ior_per_rank.py
@@ -5,6 +5,7 @@
"""

from avocado.core.exceptions import TestFail
from ClusterShell.NodeSet import NodeSet
from general_utils import DaosTestError, percent_change
from ior_test_base import IorTestBase
from ior_utils import IorCommand, IorMetrics
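
The new ClusterShell import above is what the reworked summary at the bottom of this file relies on: the set of good nodes is derived by subtracting the failed hosts from the full server list. A minimal sketch of that NodeSet behaviour (the host names here are hypothetical, not taken from the PR):

from ClusterShell.NodeSet import NodeSet

# All servers in the test, expressed as a folded node set.
all_servers = NodeSet("wolf-[1-4]")

# Hosts that recorded at least one failure (e.g. the keys of failed_nodes).
failed = NodeSet.fromlist(["wolf-2", "wolf-4"])

# Set difference yields the hosts with no recorded failures.
good = all_servers - failed
print(good)  # -> wolf-[1,3]
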
@@ -32,53 +33,52 @@ def execute_ior_per_rank(self, rank):
self.log.info("Running Test on rank: %s", rank)
# create the pool on specified rank.
self.add_pool(connect=False, target_list=[rank])
self.container = self.get_container(self.pool)

host = self.server_managers[0].get_host(rank)

# execute ior on given rank and collect the results
try:
self.ior_cmd.flags.update(self.write_flags)
dfs_out = self.run_ior_with_pool(create_cont=False, fail_on_warning=self.log.info)
dfs_perf_write = IorCommand.get_ior_metrics(dfs_out)
self.ior_cmd.flags.update(self.read_flags)
dfs_out = self.run_ior_with_pool(create_cont=False, fail_on_warning=self.log.info)
dfs_perf_read = IorCommand.get_ior_metrics(dfs_out)

# Destroy container, to be sure we use newly created container in next iteration
self.container.destroy()
self.container = None

# gather actual and expected perf data to be compared
dfs_max_write = float(dfs_perf_write[0][IorMetrics.MAX_MIB])
dfs_max_read = float(dfs_perf_read[0][IorMetrics.MAX_MIB])
actual_write_x = abs(percent_change(self.expected_bw, dfs_max_write))
actual_read_x = abs(percent_change(self.expected_bw, dfs_max_read))

# verify write performance
if actual_write_x > self.write_x:
if host not in self.failed_nodes:
self.failed_nodes[host] = []
self.failed_nodes[host].append(
f"rank {rank} low write perf. "
f"BW: {dfs_max_write:.2f}/{self.expected_bw:.2f}; "
f"percent diff: {actual_write_x:.2f}/{self.write_x:.2f}")

# verify read performance
if actual_read_x > self.read_x:
if host not in self.failed_nodes:
self.failed_nodes[host] = []
self.failed_nodes[host].append(
f"rank {rank} low read perf. "
f"BW: {dfs_max_read:.2f}/{self.expected_bw:.2f}; "
f"percent diff: {actual_read_x:.2f}/{self.read_x:.2f}")

except (TestFail, DaosTestError) as error:
if host not in self.failed_nodes:
self.failed_nodes[host] = []
self.failed_nodes[host].append(str(error))

# execute ior on given rank for different transfer sizes and collect the results
for idx, transfer_size in enumerate(self.transfer_sizes):
try:
self.ior_cmd.transfer_size.update(transfer_size)
self.ior_cmd.flags.update(self.write_flags)
dfs_out = self.run_ior_with_pool(fail_on_warning=self.log.info)
dfs_perf_write = IorCommand.get_ior_metrics(dfs_out)
self.ior_cmd.flags.update(self.read_flags)
dfs_out = self.run_ior_with_pool(create_cont=False, fail_on_warning=self.log.info)
dfs_perf_read = IorCommand.get_ior_metrics(dfs_out)

# Destroy container, to be sure we use newly created container in next iteration
self.container.destroy()
self.container = None

# gather actual and expected perf data to be compared
if idx == 0:
dfs_max_write = float(dfs_perf_write[0][IorMetrics.MAX_MIB])
dfs_max_read = float(dfs_perf_read[0][IorMetrics.MAX_MIB])
actual_write_x = percent_change(dfs_max_write, self.expected_bw)
actual_read_x = percent_change(dfs_max_read, self.expected_bw)
else:
dfs_max_write = float(dfs_perf_write[0][IorMetrics.MAX_OPS])
dfs_max_read = float(dfs_perf_read[0][IorMetrics.MAX_OPS])
actual_write_x = percent_change(dfs_max_write, self.expected_iops)
actual_read_x = percent_change(dfs_max_read, self.expected_iops)

# compare actual and expected perf data
self.assertLessEqual(abs(actual_write_x), self.write_x,
"Max Write Diff too large for rank: {}".format(rank))
self.assertLessEqual(abs(actual_read_x), self.read_x,
"Max Read Diff too large for rank: {}".format(rank))
# collect list of good nodes
good_node = self.server_managers[0].get_host(rank)
if ((good_node not in self.good_nodes)
and (good_node not in self.failed_nodes)):
self.good_nodes.append(good_node)
except (TestFail, DaosTestError):
# collect bad nodes
failed_node = self.server_managers[0].get_host(rank)
if failed_node not in self.failed_nodes:
self.failed_nodes[failed_node] = [rank]
else:
self.failed_nodes[failed_node].append(rank)
if failed_node in self.good_nodes:
self.good_nodes.remove(failed_node)
# Destroy pool, to be sure we use newly created pool in next iteration
self.pool.destroy()
self.pool = None
@@ -100,8 +100,6 @@ def test_ior_per_rank(self):

# test params
self.failed_nodes = {}
self.good_nodes = []
self.transfer_sizes = self.params.get("transfer_sizes", self.ior_cmd.namespace)
self.write_flags = self.params.get("write_flags", self.ior_cmd.namespace)
self.read_flags = self.params.get("read_flags", self.ior_cmd.namespace)

@@ -122,13 +120,15 @@ def test_ior_per_rank(self):
for rank in rank_list:
self.execute_ior_per_rank(rank)

# list of good nodes
if self.good_nodes:
self.log.info("List of good nodes: %s", self.good_nodes)
# the good nodes are any that did not fail
good_nodes = self.hostlist_servers - NodeSet.fromlist(self.failed_nodes.keys())
if good_nodes:
self.log.info("Good nodes: %s", good_nodes)

# list the failed node and the rank number associated with that node
if self.failed_nodes:
self.log.info("List of failed ranks with corresponding nodes")
for node, rank in self.failed_nodes.items():
self.log.info("Node: %s, Rank: %s", node, rank)
self.log.info("List of failed nodes with corresponding ranks")
for node, reason_list in self.failed_nodes.items():
for reason in reason_list:
self.log.info("%s: %s", node, reason)
self.fail("Performance check failed for one or more nodes")
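
In rough terms, the simplified flow measures each rank, turns the measured bandwidth into a relative difference against the expected value, and records any exceedance (or exception) per host instead of asserting mid-loop; the summary and the single fail() call come at the end. A small standalone sketch of that pattern, using a stand-in percent_change (the real helper lives in general_utils) and hypothetical hosts and numbers:

def percent_change(expected, actual):
    """Stand-in for general_utils.percent_change: relative difference vs expected."""
    return (actual - expected) / expected

failed_nodes = {}     # host -> list of human-readable failure reasons
expected_bw = 1000.0  # MiB/s, hypothetical expected bandwidth
write_x = 0.5         # allowed relative deviation; 0.5 echoes the CI note in the yaml

# Hypothetical per-rank measurements: (host, rank, measured write MiB/s)
measurements = [("wolf-1", 0, 980.0), ("wolf-2", 1, 310.0)]

for host, rank, max_write in measurements:
    diff = abs(percent_change(expected_bw, max_write))
    if diff > write_x:
        failed_nodes.setdefault(host, []).append(
            f"rank {rank} low write perf. "
            f"BW: {max_write:.2f}/{expected_bw:.2f}; "
            f"percent diff: {diff:.2f}/{write_x:.2f}")

# Report every recorded reason per host, as the new logging loop does.
for host, reasons in failed_nodes.items():
    for reason in reasons:
        print(f"{host}: {reason}")

Collecting reasons per host rather than failing on the first bad rank lets a single run report every underperforming node.
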
10 changes: 4 additions & 6 deletions src/tests/ftest/deployment/ior_per_rank.yaml
@@ -23,26 +23,24 @@ server_config:
pool:
mode: 146
size: 750G # Cannot use percentage, as it does not work when using pool create for per rank.
control_method: dmg
properties: ec_cell_sz:128KiB
container:
type: POSIX
properties: cksum:crc16,cksum_size:16384,srv_cksum:on
control_method: daos
oclass: SX
ior:
client_processes:
ppn: 32
api: DFS
test_file: testFile
write_flags: "-w -C -e -g -G 27 -k -Q 1"
read_flags: "-r -R -C -e -g -G 27 -k -Q 1"
sw_deadline: 30
sw_deadline: 15
sw_wearout: 1
sw_status_file: "/var/tmp/daos_testing/stoneWallingStatusFile"
dfs_oclass: 'SX'
transfer_sizes:
- 1M
- 256B
dfs_oclass: SX
transfer_size: 1M
block_size: 150G
# 0.5 only for CI, due to the varying nature of different clusters in CI.
# Change it to 15% (0.15) for Aurora.
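
The ior section of this yaml reaches the test through avocado's namespaced parameter lookup, the same self.params.get(name, namespace) pattern visible in the Python diff above. A rough sketch of that lookup; the Params class below is a stand-in and the "/run/ior/*" namespace is an assumption, not taken from the PR:

class Params:
    """Minimal stand-in for avocado's parameter tree."""

    def __init__(self, tree):
        self._tree = tree

    def get(self, name, namespace, default=None):
        return self._tree.get(namespace, {}).get(name, default)

# Values mirror the ior section of the yaml above.
params = Params({
    "/run/ior/*": {
        "write_flags": "-w -C -e -g -G 27 -k -Q 1",
        "read_flags": "-r -R -C -e -g -G 27 -k -Q 1",
        "transfer_size": "1M",
        "block_size": "150G",
    },
})

write_flags = params.get("write_flags", "/run/ior/*")
transfer_size = params.get("transfer_size", "/run/ior/*")
print(write_flags, transfer_size)
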