From 34173c2ff6081022fba8fbeed5f6f0fe49c6b1b8 Mon Sep 17 00:00:00 2001 From: Dalton Bohning Date: Mon, 9 Sep 2024 21:55:50 +0000 Subject: [PATCH 01/10] DAOS-16464 test: improve online_rebuild_mdtest.py Run with a stonewall and stop ranks after half of the stonewall time so the timing is more reliable than arbitrarily sleeping for 30 seconds. Test-tag: EcodOnlineRebuildMdtest Test-repeat: 3 Skip-unit-tests: true Skip-fault-injection-test: true Required-githooks: true Signed-off-by: Dalton Bohning --- .../erasurecode/online_rebuild_mdtest.py | 20 ++------ .../erasurecode/online_rebuild_mdtest.yaml | 38 +++++++-------- src/tests/ftest/util/ec_utils.py | 46 ++++++------------- 3 files changed, 37 insertions(+), 67 deletions(-) diff --git a/src/tests/ftest/erasurecode/online_rebuild_mdtest.py b/src/tests/ftest/erasurecode/online_rebuild_mdtest.py index 8f320a077d4..c832b67a108 100644 --- a/src/tests/ftest/erasurecode/online_rebuild_mdtest.py +++ b/src/tests/ftest/erasurecode/online_rebuild_mdtest.py @@ -1,5 +1,5 @@ ''' - (C) Copyright 2020-2023 Intel Corporation. + (C) Copyright 2020-2024 Intel Corporation. SPDX-License-Identifier: BSD-2-Clause-Patent ''' @@ -14,11 +14,6 @@ class EcodOnlineRebuildMdtest(ErasureCodeMdtest): :avocado: recursive """ - def __init__(self, *args, **kwargs): - """Initialize a EcOnlineRebuild object.""" - super().__init__(*args, **kwargs) - self.set_online_rebuild = True - def test_ec_online_rebuild_mdtest(self): """Jira ID: DAOS-7320. @@ -35,13 +30,6 @@ def test_ec_online_rebuild_mdtest(self): :avocado: tags=ec,ec_array,mdtest,ec_online_rebuild :avocado: tags=EcodOnlineRebuildMdtest,test_ec_online_rebuild_mdtest """ - # Kill last server rank - self.rank_to_kill = self.server_count - 1 - - # Run only object type which matches the server count and - # remove other objects - for oclass in self.obj_class: - if oclass[1] == self.server_count: - self.obj_class = oclass[0] - - self.start_online_mdtest() + # Stop one random rank while mdtest is running + ranks_to_stop = self.random.sample(list(self.server_managers[0].ranks), k=1) + self.start_online_mdtest(ranks_to_stop) diff --git a/src/tests/ftest/erasurecode/online_rebuild_mdtest.yaml b/src/tests/ftest/erasurecode/online_rebuild_mdtest.yaml index 3c1ea2f7588..caca7aba541 100644 --- a/src/tests/ftest/erasurecode/online_rebuild_mdtest.yaml +++ b/src/tests/ftest/erasurecode/online_rebuild_mdtest.yaml @@ -7,7 +7,7 @@ hosts: 12_server: test_servers: server-[1-6] test_clients: 2 -timeout: 1000 +timeout: 1500 setup: start_agents_once: False start_servers_once: False @@ -37,25 +37,27 @@ container: properties: rd_fac:2 mdtest: client_processes: - np_48: - np: 48 - num_of_files_dirs: 200 - mdtest_api: - dfs: - api: 'DFS' - test_dir: "/" - iteration: 4 + np: 4 + api: DFS + test_dir: / dfs_destroy: True - manager: "MPICH" - flags: "-u" + manager: MPICH + flags: -u write_bytes: 4194304 read_bytes: 4194304 depth: 10 + num_of_files_dirs: 100000000 + stonewall_timer: 10 + stonewall_statusfile: stoneWallingStatusFile # EC does not supported for directory so for now running with RP - dfs_dir_oclass: "RP_3G1" - objectclass: - dfs_oclass_list: - #- [EC_Object_Class, Minimum number of servers] - - ["EC_2P2GX", 6] - - ["EC_4P2GX", 8] - - ["EC_8P2GX", 12] + dfs_dir_oclass: RP_3G1 + dfs_oclass_mux: !mux + 6_server: + !filter-only : "/run/hosts/servers/6_server" # yamllint disable-line rule:colons + dfs_oclass: EC_2P2GX + 8_server: + !filter-only : "/run/hosts/servers/8_server" # yamllint disable-line rule:colons + dfs_oclass: EC_4P2GX + 12_server: + !filter-only : "/run/hosts/servers/12_server" # yamllint disable-line rule:colons + dfs_oclass: EC_8P2GX diff --git a/src/tests/ftest/util/ec_utils.py b/src/tests/ftest/util/ec_utils.py index 54ccda3b9aa..d78dfc4e2e9 100644 --- a/src/tests/ftest/util/ec_utils.py +++ b/src/tests/ftest/util/ec_utils.py @@ -413,56 +413,36 @@ def start_online_single_operation(self, operation, parity=1): class ErasureCodeMdtest(MdtestBase): """Class to used for EC testing for MDtest Benchmark.""" - def __init__(self, *args, **kwargs): - """Initialize a MdtestBase object.""" - super().__init__(*args, **kwargs) - self.server_count = None - self.set_online_rebuild = False - self.rank_to_kill = None - self.obj_class = None - def setUp(self): """Set up each test case.""" super().setUp() - engine_count = self.server_managers[0].get_config_value("engines_per_host") - self.server_count = len(self.hostlist_servers) * engine_count - self.obj_class = self.params.get("dfs_oclass_list", '/run/mdtest/objectclass/*') # Create Pool self.add_pool() self.out_queue = queue.Queue() - def write_single_mdtest_dataset(self): - """Run MDtest with EC object type.""" - # Update the MDtest obj class - self.mdtest_cmd.dfs_oclass.update(self.obj_class) + def start_online_mdtest(self, ranks_to_stop): + """Run mdtest and stop ranks while mdtest is running. - # Write the MDtest data - self.execute_mdtest(self.out_queue) - - def start_online_mdtest(self): - """Run MDtest operation with thread in background. - - Trigger the server failure while MDtest is running + Args: + ranks_to_stop (list): ranks to stop while mdtest is running """ # Create the MDtest run thread - job = threading.Thread(target=self.write_single_mdtest_dataset) + job = threading.Thread( + target=self.execute_mdtest, + kwargs={"out_queue": self.out_queue}) # Launch the MDtest thread job.start() - # Kill the server rank while IO operation in progress - if self.set_online_rebuild: - time.sleep(30) - # Kill the server rank - if self.rank_to_kill is not None: - self.server_managers[0].stop_ranks([self.rank_to_kill], - self.d_log, - force=True) + # Stop the server ranks while IO operation in progress + time.sleep(self.mdtest_cmd.stonewall_timer.value / 2) + self.server_managers[0].stop_ranks(ranks_to_stop, self.d_log, force=True) # Wait to finish the thread job.join() # Verify the queue result and make sure test has no failure while not self.out_queue.empty(): - if self.out_queue.get() == "Mdtest Failed": - self.fail("FAIL") + result = self.out_queue.get() + if result == "Mdtest Failed": + self.fail(result) From a09e044dbe1898b3bef03133b4a41c24ee8c6418 Mon Sep 17 00:00:00 2001 From: Padmanabhan Date: Wed, 18 Dec 2024 17:59:10 -0500 Subject: [PATCH 02/10] DAOS-16494 test: Updated the read/write bytes to 1M. Test-tag: EcodOnlineRebuildMdtest Test-repeat: 3 Skip-unit-tests: true Skip-fault-injection-test: true Required-githooks: true Signed-off-by: Padmanabhan --- src/tests/ftest/erasurecode/online_rebuild_mdtest.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/tests/ftest/erasurecode/online_rebuild_mdtest.yaml b/src/tests/ftest/erasurecode/online_rebuild_mdtest.yaml index caca7aba541..372e25ef466 100644 --- a/src/tests/ftest/erasurecode/online_rebuild_mdtest.yaml +++ b/src/tests/ftest/erasurecode/online_rebuild_mdtest.yaml @@ -43,8 +43,8 @@ mdtest: dfs_destroy: True manager: MPICH flags: -u - write_bytes: 4194304 - read_bytes: 4194304 + write_bytes: 1048576 + read_bytes: 1048576 depth: 10 num_of_files_dirs: 100000000 stonewall_timer: 10 From 775a37bf8ee4b566a3a7e8c9e2c77dbf4eb64b2a Mon Sep 17 00:00:00 2001 From: Padmanabhan Date: Fri, 20 Dec 2024 12:00:18 -0500 Subject: [PATCH 03/10] DAOS-16464 test: Set logmask to ERR for the test. Test-tag: EcodOnlineRebuildMdtest Test-repeat: 3 Skip-unit-tests: true Skip-fault-injection-test: true Signed-off-by: Padmanabhan --- src/tests/ftest/erasurecode/online_rebuild_mdtest.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/tests/ftest/erasurecode/online_rebuild_mdtest.yaml b/src/tests/ftest/erasurecode/online_rebuild_mdtest.yaml index 372e25ef466..7f91facb145 100644 --- a/src/tests/ftest/erasurecode/online_rebuild_mdtest.yaml +++ b/src/tests/ftest/erasurecode/online_rebuild_mdtest.yaml @@ -21,6 +21,7 @@ server_config: fabric_iface: ib0 fabric_iface_port: 31416 log_file: daos_server0.log + log_mask: ERR storage: auto 1: pinned_numa_node: 1 @@ -28,6 +29,7 @@ server_config: fabric_iface: ib1 fabric_iface_port: 31517 log_file: daos_server1.log + log_mask: ERR storage: auto pool: size: 93% From 090305b29c6383e6bd6b4d9dce836a0698ef13ee Mon Sep 17 00:00:00 2001 From: Padmanabhan Date: Tue, 14 Jan 2025 16:25:41 -0500 Subject: [PATCH 04/10] DAOS-16464 test: Revert the dfs_class and update stonewall information. Test-tag: EcodOnlineRebuildMdtest Test-repeat: 3 Skip-unit-tests: true Skip-fault-injection-test: true Signed-off-by: Padmanabhan --- .../erasurecode/online_rebuild_mdtest.yaml | 27 +++++++++++-------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/src/tests/ftest/erasurecode/online_rebuild_mdtest.yaml b/src/tests/ftest/erasurecode/online_rebuild_mdtest.yaml index 884f3f0ca42..581a766e1ae 100644 --- a/src/tests/ftest/erasurecode/online_rebuild_mdtest.yaml +++ b/src/tests/ftest/erasurecode/online_rebuild_mdtest.yaml @@ -48,15 +48,20 @@ mdtest: write_bytes: 524288 read_bytes: 524288 depth: 10 - num_of_files_dirs: 10000000 - stonewall_timer: 10 - stonewall_statusfile: stoneWallingStatusFile + num_of_files_dirs: 100000009 + stonewall_timer: 30 # EC does not supported for directory so for now running with RP - dfs_dir_oclass: "RP_3G1" - objectclass: - dfs_oclass_list: - #- [EC_Object_Class, Exact number of servers] - - ["EC_2P2GX", 6] - - ["EC_4P2GX", 8] - - ["EC_4P3GX", 12] - - ["EC_8P2GX", 12] + dfs_dir_oclass: RP_3G1 + dfs_oclass_mux: !mux + 6_server_ec2p2gx: + !filter-only : "/run/hosts/servers/6_server" # yamllint disable-line rule:colons + dfs_oclass: EC_2P2GX + 8_server_ec4p2gx: + !filter-only : "/run/hosts/servers/8_server" # yamllint disable-line rule:colons + dfs_oclass: EC_4P2GX + 12_server_ec4p3gx: + !filter-only : "/run/hosts/servers/12_server" # yamllint disable-line rule:colons + dfs_oclass: EC_4P3GX + 12_server_ec8p2gx: + !filter-only : "/run/hosts/servers/12_server" # yamllint disable-line rule:colons + dfs_oclass: EC_8P2GX From e9b903e32f56ef2d06512624e1a9763a8d804866 Mon Sep 17 00:00:00 2001 From: Padmanabhan Date: Tue, 14 Jan 2025 17:39:55 -0500 Subject: [PATCH 05/10] DAOS-16464 test: Check the command output for ERR. Test-tag: EcodOnlineRebuildMdtest Test-repeat: 3 Skip-unit-tests: true Skip-fault-injection-test: true Signed-off-by: Padmanabhan --- src/tests/ftest/util/ec_utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/tests/ftest/util/ec_utils.py b/src/tests/ftest/util/ec_utils.py index d78dfc4e2e9..86d84cf417b 100644 --- a/src/tests/ftest/util/ec_utils.py +++ b/src/tests/ftest/util/ec_utils.py @@ -1,5 +1,6 @@ """ (C) Copyright 2020-2024 Intel Corporation. + (C) Copyright 2025 Hewlett Packard Enterprise Development LP SPDX-License-Identifier: BSD-2-Clause-Patent """ @@ -406,7 +407,7 @@ def start_online_single_operation(self, operation, parity=1): # Verify the queue and make sure no FAIL for any run while not self.out_queue.empty(): - if self.out_queue.get() == "FAIL": + if (self.out_queue.get() == "FAIL" or self.out_queue.get() == "ERR"): self.fail("FAIL") From d5ab56b9c5db07c746434ee6fa5400688fb7d19e Mon Sep 17 00:00:00 2001 From: Padmanabhan Date: Wed, 15 Jan 2025 17:08:17 -0500 Subject: [PATCH 06/10] DAOS-16464 test: Update mdtest flags and update object class. Test-tag: EcodOnlineRebuildMdtest Test-repeat: 3 Skip-unit-tests: true Skip-fault-injection-test: true Signed-off-by: Padmanabhan --- src/tests/ftest/erasurecode/online_rebuild_mdtest.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/tests/ftest/erasurecode/online_rebuild_mdtest.yaml b/src/tests/ftest/erasurecode/online_rebuild_mdtest.yaml index 581a766e1ae..bd14a3e94b8 100644 --- a/src/tests/ftest/erasurecode/online_rebuild_mdtest.yaml +++ b/src/tests/ftest/erasurecode/online_rebuild_mdtest.yaml @@ -44,7 +44,7 @@ mdtest: test_dir: / dfs_destroy: True manager: MPICH - flags: -u + flags: "-u -F -C" write_bytes: 524288 read_bytes: 524288 depth: 10 @@ -59,9 +59,9 @@ mdtest: 8_server_ec4p2gx: !filter-only : "/run/hosts/servers/8_server" # yamllint disable-line rule:colons dfs_oclass: EC_4P2GX - 12_server_ec4p3gx: + 12_server_ec4p2gx: !filter-only : "/run/hosts/servers/12_server" # yamllint disable-line rule:colons - dfs_oclass: EC_4P3GX + dfs_oclass: EC_4P2GX 12_server_ec8p2gx: !filter-only : "/run/hosts/servers/12_server" # yamllint disable-line rule:colons dfs_oclass: EC_8P2GX From 8f4af35575c003842083c14f8d6b091997f90213 Mon Sep 17 00:00:00 2001 From: Padmanabhan Date: Thu, 16 Jan 2025 14:51:57 -0500 Subject: [PATCH 07/10] DAOS-16464 test: Add back EC_4P3GX object class testing. Test-tag: EcodOnlineRebuildMdtest Test-repeat: 3 Skip-unit-tests: true Skip-fault-injection-test: true Signed-off-by: Padmanabhan --- src/tests/ftest/erasurecode/online_rebuild_mdtest.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/tests/ftest/erasurecode/online_rebuild_mdtest.yaml b/src/tests/ftest/erasurecode/online_rebuild_mdtest.yaml index bd14a3e94b8..96162dbc769 100644 --- a/src/tests/ftest/erasurecode/online_rebuild_mdtest.yaml +++ b/src/tests/ftest/erasurecode/online_rebuild_mdtest.yaml @@ -59,9 +59,9 @@ mdtest: 8_server_ec4p2gx: !filter-only : "/run/hosts/servers/8_server" # yamllint disable-line rule:colons dfs_oclass: EC_4P2GX - 12_server_ec4p2gx: + 12_server_ec4p3gx: !filter-only : "/run/hosts/servers/12_server" # yamllint disable-line rule:colons - dfs_oclass: EC_4P2GX + dfs_oclass: EC_4P3GX 12_server_ec8p2gx: !filter-only : "/run/hosts/servers/12_server" # yamllint disable-line rule:colons dfs_oclass: EC_8P2GX From 9c7841be10d774b0d9467a7842ae19749720459f Mon Sep 17 00:00:00 2001 From: Padmanabhan Date: Fri, 17 Jan 2025 16:04:09 -0500 Subject: [PATCH 08/10] DAOS-16464 test: Updated copyright and minor fix. Test-tag: EcodOnlineRebuildMdtest Test-repeat: 3 Skip-unit-tests: true Skip-fault-injection-test: true Signed-off-by: Padmanabhan --- src/tests/ftest/erasurecode/online_rebuild_mdtest.py | 1 + src/tests/ftest/util/ec_utils.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/tests/ftest/erasurecode/online_rebuild_mdtest.py b/src/tests/ftest/erasurecode/online_rebuild_mdtest.py index c832b67a108..c7e18c44e87 100644 --- a/src/tests/ftest/erasurecode/online_rebuild_mdtest.py +++ b/src/tests/ftest/erasurecode/online_rebuild_mdtest.py @@ -1,5 +1,6 @@ ''' (C) Copyright 2020-2024 Intel Corporation. + (C) Copyright 2025 Hewlett Packard Enterprise Development LP SPDX-License-Identifier: BSD-2-Clause-Patent ''' diff --git a/src/tests/ftest/util/ec_utils.py b/src/tests/ftest/util/ec_utils.py index 86d84cf417b..b404b7902a8 100644 --- a/src/tests/ftest/util/ec_utils.py +++ b/src/tests/ftest/util/ec_utils.py @@ -407,7 +407,7 @@ def start_online_single_operation(self, operation, parity=1): # Verify the queue and make sure no FAIL for any run while not self.out_queue.empty(): - if (self.out_queue.get() == "FAIL" or self.out_queue.get() == "ERR"): + if self.out_queue.get() == "FAIL": self.fail("FAIL") From 4bd35d8ee073595fed3d0a12281ddb427e3d5d12 Mon Sep 17 00:00:00 2001 From: Padmanabhan Date: Sat, 18 Jan 2025 13:16:31 -0500 Subject: [PATCH 09/10] DAOS-16464 test: Add a better exception handling mechanism. Test-tag: EcodOnlineRebuildMdtest Test-repeat: 3 Skip-unit-tests: true Skip-fault-injection-test: true Signed-off-by: Padmanabhan --- src/tests/ftest/util/ec_utils.py | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/src/tests/ftest/util/ec_utils.py b/src/tests/ftest/util/ec_utils.py index b404b7902a8..c3c05951dfb 100644 --- a/src/tests/ftest/util/ec_utils.py +++ b/src/tests/ftest/util/ec_utils.py @@ -419,18 +419,37 @@ def setUp(self): super().setUp() # Create Pool self.add_pool() + self.container = None self.out_queue = queue.Queue() + def _start_execute_mdtest(self, mdtest_result_queue): + """Run the execute_mdtest method + + Args: + mdtest_result_queue(Queue) : Queue for passing errors. + Returns: + result(object) : mdtest run result + """ + try: + result = self.execute_mdtest(mdtest_result_queue) + except (CommandFailure, DaosApiError, DaosTestError): + mdtest_result_queue.put('Mdtest Failed') + return result + def start_online_mdtest(self, ranks_to_stop): """Run mdtest and stop ranks while mdtest is running. Args: ranks_to_stop (list): ranks to stop while mdtest is running """ + # Create the container and check the status + self.container = self.get_mdtest_container(self.pool) + if self.container is None: + self.fail("Container Create Failed") # Create the MDtest run thread job = threading.Thread( - target=self.execute_mdtest, - kwargs={"out_queue": self.out_queue}) + target=self._start_execute_mdtest, + kwargs={"mdtest_result_queue": self.out_queue}) # Launch the MDtest thread job.start() From b9bdbdd1190e1fcc17516334ef631a251a8a79a4 Mon Sep 17 00:00:00 2001 From: Padmanabhan Date: Wed, 22 Jan 2025 10:01:29 -0500 Subject: [PATCH 10/10] DAOS-16464 test: Minor Review updates Test-tag: EcodOnlineRebuildMdtest Test-repeat: 3 Skip-unit-tests: true Skip-fault-injection-test: true Signed-off-by: Padmanabhan --- src/tests/ftest/erasurecode/online_rebuild_mdtest.yaml | 2 +- src/tests/ftest/util/ec_utils.py | 4 +--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/src/tests/ftest/erasurecode/online_rebuild_mdtest.yaml b/src/tests/ftest/erasurecode/online_rebuild_mdtest.yaml index 96162dbc769..13c652f96a8 100644 --- a/src/tests/ftest/erasurecode/online_rebuild_mdtest.yaml +++ b/src/tests/ftest/erasurecode/online_rebuild_mdtest.yaml @@ -48,7 +48,7 @@ mdtest: write_bytes: 524288 read_bytes: 524288 depth: 10 - num_of_files_dirs: 100000009 + num_of_files_dirs: 10000000 stonewall_timer: 30 # EC does not supported for directory so for now running with RP dfs_dir_oclass: RP_3G1 diff --git a/src/tests/ftest/util/ec_utils.py b/src/tests/ftest/util/ec_utils.py index c3c05951dfb..87469de9e72 100644 --- a/src/tests/ftest/util/ec_utils.py +++ b/src/tests/ftest/util/ec_utils.py @@ -432,7 +432,7 @@ def _start_execute_mdtest(self, mdtest_result_queue): """ try: result = self.execute_mdtest(mdtest_result_queue) - except (CommandFailure, DaosApiError, DaosTestError): + except Exception: # pylint: disable=broad-except mdtest_result_queue.put('Mdtest Failed') return result @@ -444,8 +444,6 @@ def start_online_mdtest(self, ranks_to_stop): """ # Create the container and check the status self.container = self.get_mdtest_container(self.pool) - if self.container is None: - self.fail("Container Create Failed") # Create the MDtest run thread job = threading.Thread( target=self._start_execute_mdtest,