From 099c0b880f8fb113b3e2ea7d4b28f43f4a8666b4 Mon Sep 17 00:00:00 2001 From: Phil Henderson Date: Tue, 22 Oct 2024 17:21:29 -0400 Subject: [PATCH] DAOS-16265 test: Split erasurecode/multiple_failure.py (#15355) Split the erasurecode/multiple_failure.py into two separate tests to reduce the possibility that a large number of ERR messages in the server log file causes other test variants to fail due to out-of-space errors. Signed-off-by: Phil Henderson --- .../erasurecode/multiple_rank_failure.py | 51 +++++++++++++++ ...ailure.yaml => multiple_rank_failure.yaml} | 6 ++ ..._failure.py => multiple_target_failure.py} | 28 ++------ .../erasurecode/multiple_target_failure.yaml | 65 +++++++++++++++++++ 4 files changed, 129 insertions(+), 21 deletions(-) create mode 100644 src/tests/ftest/erasurecode/multiple_rank_failure.py rename src/tests/ftest/erasurecode/{multiple_failure.yaml => multiple_rank_failure.yaml} (99%) rename src/tests/ftest/erasurecode/{multiple_failure.py => multiple_target_failure.py} (76%) create mode 100644 src/tests/ftest/erasurecode/multiple_target_failure.yaml diff --git a/src/tests/ftest/erasurecode/multiple_rank_failure.py b/src/tests/ftest/erasurecode/multiple_rank_failure.py new file mode 100644 index 00000000000..84be5aab6f6 --- /dev/null +++ b/src/tests/ftest/erasurecode/multiple_rank_failure.py @@ -0,0 +1,51 @@ +''' + (C) Copyright 2021-2024 Intel Corporation. + + SPDX-License-Identifier: BSD-2-Clause-Patent +''' +from ec_utils import ErasureCodeIor + + +class EcodOnlineMultiRankFail(ErasureCodeIor): + # pylint: disable=too-many-ancestors + """ + Test Class Description: To validate Erasure code object data after killing multiple rank,targets + while IOR Write in progress. + :avocado: recursive + """ + + def __init__(self, *args, **kwargs): + """Initialize a EcOnlineRebuild object.""" + super().__init__(*args, **kwargs) + self.set_online_rebuild = True + + def test_ec_multiple_rank_failure(self): + """Jira ID: DAOS-7344. 
+ + Test Description: Test Erasure code object with IOR with multiple rank failure + Use Case: Create the pool, run IOR with supported EC object type class, kill multiple + server ranks, while IOR Write phase is in progress, verify all IOR write + finish. Read and verify data. + + :avocado: tags=all,full_regression + :avocado: tags=hw,large + :avocado: tags=ec,ec_online_rebuild,rebuild,ec_fault,ec_multiple_failure + :avocado: tags=EcodOnlineMultiRankFail,test_ec_multiple_rank_failure + """ + # Kill Two server ranks + self.rank_to_kill = [self.server_count - 1, self.server_count - 3] + + # Write IOR data set with different EC object. kill rank, targets or mix of both while IOR + # Write phase is in progress. + self.log_step( + f"Write datasets using IOR and kill rank {self.rank_to_kill} while IOR is running") + self.ior_write_dataset() + + # Disable online rebuild + self.set_online_rebuild = False + + # Read IOR data and verify for EC object again + # EC data was written with +2 parity so after killing ranks or targets data should be + # intact and no data corruption observed. 
+ self.log_step(f"Read datasets using IOR after killing rank {self.rank_to_kill}") + self.ior_read_dataset(parity=2) diff --git a/src/tests/ftest/erasurecode/multiple_failure.yaml b/src/tests/ftest/erasurecode/multiple_rank_failure.yaml similarity index 99% rename from src/tests/ftest/erasurecode/multiple_failure.yaml rename to src/tests/ftest/erasurecode/multiple_rank_failure.yaml index 95aab541329..a90d9369f44 100644 --- a/src/tests/ftest/erasurecode/multiple_failure.yaml +++ b/src/tests/ftest/erasurecode/multiple_rank_failure.yaml @@ -1,10 +1,13 @@ hosts: test_servers: 6 test_clients: 2 + timeout: 2000 + setup: start_agents_once: false start_servers_once: false + server_config: name: daos_server engines_per_host: 2 @@ -23,13 +26,16 @@ server_config: fabric_iface_port: 31417 log_file: daos_server1.log storage: auto + pool: size: 93% set_logmasks: False + container: type: POSIX control_method: daos properties: cksum:crc16,cksum_size:16384,srv_cksum:on + ior: api: "DFS" client_processes: diff --git a/src/tests/ftest/erasurecode/multiple_failure.py b/src/tests/ftest/erasurecode/multiple_target_failure.py similarity index 76% rename from src/tests/ftest/erasurecode/multiple_failure.py rename to src/tests/ftest/erasurecode/multiple_target_failure.py index 8fb412dad58..9458ca73492 100644 --- a/src/tests/ftest/erasurecode/multiple_failure.py +++ b/src/tests/ftest/erasurecode/multiple_target_failure.py @@ -6,7 +6,7 @@ from ec_utils import ErasureCodeIor -class EcodOnlineMultFail(ErasureCodeIor): +class EcodOnlineMultiTargetFail(ErasureCodeIor): # pylint: disable=too-many-ancestors """ Test Class Description: To validate Erasure code object data after killing multiple rank,targets @@ -23,6 +23,8 @@ def run_ior_cascade_failure(self): """Common function to Write and Read IOR""" # Write IOR data set with different EC object. kill rank, targets or mix of both while IOR # Write phase is in progress. 
+ self.log_step( + f"Write datasets using IOR and exclude target {self.pool_exclude} while IOR is running") self.ior_write_dataset() # Disabled Online rebuild @@ -31,25 +33,9 @@ def run_ior_cascade_failure(self): # Read IOR data and verify for EC object again # EC data was written with +2 parity so after killing ranks of targets data should be # intact and no data corruption observed. + self.log_step(f"Read datasets using IOR after exclude target {self.pool_exclude}") self.ior_read_dataset(parity=2) - def test_ec_multiple_rank_failure(self): - """Jira ID: DAOS-7344. - - Test Description: Test Erasure code object with IOR with multiple rank failure - Use Case: Create the pool, run IOR with supported EC object type class, kill multiple - server ranks, while IOR Write phase is in progress, verify all IOR write - finish.Read and verify data. - - :avocado: tags=all,full_regression - :avocado: tags=hw,large - :avocado: tags=ec,ec_online_rebuild,rebuild,ec_fault,ec_multiple_failure - :avocado: tags=EcodOnlineMultFail,test_ec_multiple_rank_failure - """ - # Kill Two server ranks - self.rank_to_kill = [self.server_count - 1, self.server_count - 3] - self.run_ior_cascade_failure() - def test_ec_multiple_targets_on_same_rank(self): """Jira ID: DAOS-7344. 
@@ -61,7 +47,7 @@ def test_ec_multiple_targets_on_same_rank(self): :avocado: tags=all,full_regression :avocado: tags=hw,large :avocado: tags=ec,ec_array,ec_online_rebuild,rebuild,ec_fault,ec_multiple_failure - :avocado: tags=EcodOnlineMultFail,test_ec_multiple_targets_on_same_rank + :avocado: tags=EcodOnlineMultiTargetFail,test_ec_multiple_targets_on_same_rank """ # Kill Two targets 2,4 from same rank 2 self.pool_exclude[2] = "2,4" @@ -78,7 +64,7 @@ def test_ec_multiple_targets_on_diff_ranks(self): :avocado: tags=all,full_regression :avocado: tags=hw,large :avocado: tags=ec,ec_array,ec_online_rebuild,rebuild,ec_fault,ec_multiple_failure - :avocado: tags=EcodOnlineMultFail,test_ec_multiple_targets_on_diff_ranks + :avocado: tags=EcodOnlineMultiTargetFail,test_ec_multiple_targets_on_diff_ranks """ # Kill Two targets from different ranks self.pool_exclude[2] = "2" @@ -96,7 +82,7 @@ def test_ec_single_target_rank_failure(self): :avocado: tags=all,full_regression :avocado: tags=hw,large :avocado: tags=ec,ec_online_rebuild,rebuild,ec_fault,ec_multiple_failure - :avocado: tags=EcodOnlineMultFail,test_ec_single_target_rank_failure + :avocado: tags=EcodOnlineMultiTargetFail,test_ec_single_target_rank_failure """ # Kill One server rank self.rank_to_kill = [self.server_count - 1] diff --git a/src/tests/ftest/erasurecode/multiple_target_failure.yaml b/src/tests/ftest/erasurecode/multiple_target_failure.yaml new file mode 100644 index 00000000000..a90d9369f44 --- /dev/null +++ b/src/tests/ftest/erasurecode/multiple_target_failure.yaml @@ -0,0 +1,65 @@ +hosts: + test_servers: 6 + test_clients: 2 + +timeout: 2000 + +setup: + start_agents_once: false + start_servers_once: false + +server_config: + name: daos_server + engines_per_host: 2 + engines: + 0: + pinned_numa_node: 0 + nr_xs_helpers: 1 + fabric_iface: ib0 + fabric_iface_port: 31317 + log_file: daos_server0.log + storage: auto + 1: + pinned_numa_node: 1 + nr_xs_helpers: 1 + fabric_iface: ib1 + fabric_iface_port: 31417 + 
log_file: daos_server1.log + storage: auto + +pool: + size: 93% + set_logmasks: False + +container: + type: POSIX + control_method: daos + properties: cksum:crc16,cksum_size:16384,srv_cksum:on + +ior: + api: "DFS" + client_processes: + np: 32 + dfs_destroy: false + iorflags: + flags: "-w -W -F -k -G 1 -vv" + read_flags: "-r -R -F -k -G 1 -vv" + test_file: /testFile + repetitions: 1 + sizes: !mux + Full_Striped: + chunk_block_transfer_sizes: + - [32M, 8G, 8M] + Partial_Striped: + chunk_block_transfer_sizes: + - [32M, 512M, 2K] + objectclass: !mux + EC_2P2GX: + dfs_oclass_list: + - ["EC_2P2GX", 6] + EC_4P2GX: + dfs_oclass_list: + - ["EC_4P2GX", 8] + EC_8P2GX: + dfs_oclass_list: + - ["EC_8P2GX", 10]