diff --git a/src/include/daos_errno.h b/src/include/daos_errno.h index ad3bf93e5b7..8f2960b5933 100644 --- a/src/include/daos_errno.h +++ b/src/include/daos_errno.h @@ -235,6 +235,8 @@ enum daos_errno { /** Return value representing success */ DER_SUCCESS = 0, D_FOREACH_ERR_RANGE(D_DEFINE_RANGE_ERRNO) + /** Last valid errno, used for dumping */ + DER_LAST_VALID, /** Unknown error value */ DER_UNKNOWN = (DER_ERR_GURT_BASE + 500000), }; diff --git a/src/object/srv_ec_aggregate.c b/src/object/srv_ec_aggregate.c index 83c52db1e1b..398efb6dba7 100644 --- a/src/object/srv_ec_aggregate.c +++ b/src/object/srv_ec_aggregate.c @@ -1449,6 +1449,15 @@ agg_peer_update(struct ec_agg_entry *entry, bool write_parity) D_ASSERT(!write_parity || entry->ae_sgl.sg_iovs[AGG_IOV_PARITY].iov_buf); + agg_param = container_of(entry, struct ec_agg_param, ap_agg_entry); + + /* If rebuild started, abort it before sending RPC to save conflict window with rebuild + * (see obj_inflight_io_check()). + */ + if (agg_param->ap_pool_info.api_pool->sp_rebuilding > 0) { + D_DEBUG(DB_EPC, DF_UOID" abort as rebuild started\n", DP_UOID(entry->ae_oid)); + return -1; + } rc = agg_get_obj_handle(entry); if (rc) { @@ -1456,7 +1465,6 @@ agg_peer_update(struct ec_agg_entry *entry, bool write_parity) return rc; } - agg_param = container_of(entry, struct ec_agg_param, ap_agg_entry); rc = pool_map_find_failed_tgts(agg_param->ap_pool_info.api_pool->sp_map, &targets, &failed_tgts_cnt); if (rc) { @@ -1728,6 +1736,15 @@ agg_process_holes(struct ec_agg_entry *entry) int tid, rc = 0; int *status; + agg_param = container_of(entry, struct ec_agg_param, ap_agg_entry); + /* If rebuild started, abort it before sending RPC to save conflict window with rebuild + * (see obj_inflight_io_check()). 
+ */ + if (agg_param->ap_pool_info.api_pool->sp_rebuilding > 0) { + D_DEBUG(DB_EPC, DF_UOID" abort as rebuild started\n", DP_UOID(entry->ae_oid)); + return -1; + } + D_ALLOC_ARRAY(stripe_ud.asu_recxs, entry->ae_cur_stripe.as_extent_cnt + 1); if (stripe_ud.asu_recxs == NULL) { @@ -1745,8 +1762,6 @@ agg_process_holes(struct ec_agg_entry *entry) if (rc) goto out; - agg_param = container_of(entry, struct ec_agg_param, - ap_agg_entry); rc = ABT_eventual_create(sizeof(*status), &stripe_ud.asu_eventual); if (rc != ABT_SUCCESS) { rc = dss_abterr2der(rc); @@ -2672,15 +2687,7 @@ cont_ec_aggregate_cb(struct ds_cont_child *cont, daos_epoch_range_t *epr, ec_agg_param->ap_agg_entry.ae_obj_hdl = DAOS_HDL_INVAL; } - if (cont->sc_pool->spc_pool->sp_rebuilding > 0 && !cont->sc_stopping) { - /* There is rebuild going on, and we can't proceed EC aggregate boundary, - * Let's wait for 5 seconds for another EC aggregation. - */ - D_ASSERT(cont->sc_ec_agg_req != NULL); - sched_req_sleep(cont->sc_ec_agg_req, 5 * 1000); - } - - if (rc == -DER_BUSY) { + if (rc == -DER_BUSY && cont->sc_pool->spc_pool->sp_rebuilding == 0) { /** Hit an object conflict VOS aggregation or discard. Rather than exiting, let's * yield and try again. */ @@ -2696,6 +2703,12 @@ cont_ec_aggregate_cb(struct ds_cont_child *cont, daos_epoch_range_t *epr, } update_hae: + /* clear the flag before next turn's cont_aggregate_runnable(), to save conflict + * window with rebuild (see obj_inflight_io_check()). 
+ */ + if (cont->sc_pool->spc_pool->sp_rebuilding > 0) + cont->sc_ec_agg_active = 0; + if (rc == 0) { cont->sc_ec_agg_eph = max(cont->sc_ec_agg_eph, epr->epr_hi); if (!cont->sc_stopping && cont->sc_ec_query_agg_eph) { diff --git a/src/object/srv_obj.c b/src/object/srv_obj.c index f9f3d7ca482..1b26fb01d74 100644 --- a/src/object/srv_obj.c +++ b/src/object/srv_obj.c @@ -2425,8 +2425,8 @@ ds_obj_ec_rep_handler(crt_rpc_t *rpc) dkey = (daos_key_t *)&oer->er_dkey; iod = (daos_iod_t *)&oer->er_iod; iod_csums = oer->er_iod_csums.ca_arrays; - rc = vos_update_begin(ioc.ioc_coc->sc_hdl, oer->er_oid, oer->er_epoch_range.epr_hi, 0, - dkey, 1, iod, iod_csums, 0, &ioh, NULL); + rc = vos_update_begin(ioc.ioc_coc->sc_hdl, oer->er_oid, oer->er_epoch_range.epr_hi, + VOS_OF_REBUILD, dkey, 1, iod, iod_csums, 0, &ioh, NULL); if (rc) { D_ERROR(DF_UOID" Update begin failed: "DF_RC"\n", DP_UOID(oer->er_oid), DP_RC(rc)); @@ -2501,9 +2501,8 @@ ds_obj_ec_agg_handler(crt_rpc_t *rpc) D_ASSERT(ioc.ioc_coc != NULL); dkey = (daos_key_t *)&oea->ea_dkey; if (parity_bulk != CRT_BULK_NULL) { - rc = vos_update_begin(ioc.ioc_coc->sc_hdl, oea->ea_oid, - oea->ea_epoch_range.epr_hi, 0, dkey, 1, - iod, iod_csums, 0, &ioh, NULL); + rc = vos_update_begin(ioc.ioc_coc->sc_hdl, oea->ea_oid, oea->ea_epoch_range.epr_hi, + VOS_OF_REBUILD, dkey, 1, iod, iod_csums, 0, &ioh, NULL); if (rc) { D_ERROR(DF_UOID" Update begin failed: "DF_RC"\n", DP_UOID(oea->ea_oid), DP_RC(rc)); diff --git a/src/object/srv_obj_migrate.c b/src/object/srv_obj_migrate.c index 059d82e5bbf..e59335704ce 100644 --- a/src/object/srv_obj_migrate.c +++ b/src/object/srv_obj_migrate.c @@ -1152,7 +1152,7 @@ __migrate_fetch_update_parity(struct migrate_one *mrone, daos_handle_t oh, ptr += size * iods[i].iod_size; offset = recx->rx_idx; size = recx->rx_nr; - parity_eph = ephs[i][j]; + parity_eph = encode ? 
ephs[i][j] : mrone->mo_epoch; } if (size > 0) @@ -1214,9 +1214,8 @@ migrate_fetch_update_parity(struct migrate_one *mrone, daos_handle_t oh, update_eph = mrone->mo_iods_update_ephs_from_parity[i][j]; update_eph_p = &update_eph; - rc = __migrate_fetch_update_parity(mrone, oh, &iod, fetch_eph, &update_eph_p, - mrone->mo_iods_num_from_parity, ds_cont, - true); + rc = __migrate_fetch_update_parity(mrone, oh, &iod, fetch_eph, + &update_eph_p, 1, ds_cont, true); if (rc) return rc; } @@ -1568,7 +1567,8 @@ migrate_fetch_update_bulk(struct migrate_one *mrone, daos_handle_t oh, fetch_eph = mrone->mo_iods_update_ephs_from_parity[i][j]; rc = __migrate_fetch_update_bulk(mrone, oh, &iod, 1, fetch_eph, mrone->mo_iods_update_ephs_from_parity[i][j], - DIOF_EC_RECOV_FROM_PARITY, ds_cont); + DIOF_EC_RECOV_FROM_PARITY | DIOF_FOR_MIGRATION, + ds_cont); if (rc != 0) D_GOTO(out, rc); } diff --git a/src/tests/ftest/cart/SConscript b/src/tests/ftest/cart/SConscript index 42170d0731e..858609e2e8e 100644 --- a/src/tests/ftest/cart/SConscript +++ b/src/tests/ftest/cart/SConscript @@ -10,7 +10,7 @@ import os SIMPLE_TEST_SRC = ['threaded_client.c', 'dual_iface_server.c', 'no_pmix_multi_ctx.c', 'threaded_server.c', 'test_corpc_prefwd.c', - 'test_corpc_exclusive.c', + 'test_corpc_exclusive.c', 'dump_errnos.c', 'test_proto_server.c', 'test_proto_client.c', 'test_multisend_server.c', 'test_multisend_client.c', 'test_no_timeout.c', 'test_ep_cred_server.c', diff --git a/src/tests/ftest/cart/dump_errnos.c b/src/tests/ftest/cart/dump_errnos.c new file mode 100644 index 00000000000..a2c77f45486 --- /dev/null +++ b/src/tests/ftest/cart/dump_errnos.c @@ -0,0 +1,40 @@ +/* + * (C) Copyright 2024 Intel Corporation. 
+ * + * SPDX-License-Identifier: BSD-2-Clause-Patent + */ +/** + * Small utility to dump all descriptions of errcodes + */ +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <daos_errno.h> + +int +main(int argc, char **argv) +{ + const char *str; + int i; + + if (argc == 1) { + for (i = DER_SUCCESS; i < DER_LAST_VALID; i++) { + str = d_errstr(-i); + if (strcmp("DER_UNKNOWN", str)) + printf("%d = %s\n", -i, d_errstr(-i)); + } + + return 0; + } + + i = atoi(argv[1]); + + if (i > 0) { + printf("Errnos are negative numbers, changing\n"); + i = -i; + } + + printf("%d = %s\n", i, d_errstr(i)); + + return 0; +} diff --git a/src/tests/ftest/recovery/pool_list_consolidation.py b/src/tests/ftest/recovery/pool_list_consolidation.py index e65aa5bd1c0..6eeee3789a7 100644 --- a/src/tests/ftest/recovery/pool_list_consolidation.py +++ b/src/tests/ftest/recovery/pool_list_consolidation.py @@ -7,8 +7,9 @@ from avocado.core.exceptions import TestFail from ClusterShell.NodeSet import NodeSet -from general_utils import check_file_exists, pcmd, report_errors +from general_utils import check_file_exists, report_errors from recovery_test_base import RecoveryTestBase +from run_utils import run_remote class PoolListConsolidationTest(RecoveryTestBase): @@ -189,10 +190,9 @@ def verify_pool_dir_removed(self, errors): list: Error list. """ - hosts = list(set(self.server_managers[0].ranks.values())) - nodeset_hosts = NodeSet.fromlist(hosts) pool_path = f"/mnt/daos0/{self.pool.uuid.lower()}" - check_out = check_file_exists(hosts=nodeset_hosts, filename=pool_path) + check_out = check_file_exists( + hosts=self.hostlist_servers, filename=pool_path, directory=True) if check_out[0]: msg = f"Pool path still exists! Node without pool path = {check_out[1]}" errors.append(msg) @@ -255,7 +255,7 @@ def test_lost_majority_ps_replicas(self): 1. Create a pool with --nsvc=3. Rank 0, 1, and 2 will be pool service replicas. 2. Stop servers. - 3. Remove /mnt/daos/<pool_uuid>/rdb-pool from rank 0 and 2. + 3. Remove <scm_mount>/<pool_uuid>/rdb-pool from rank 0 and 2. 4.
Start servers. 5. Run DAOS checker under kinds of mode. 6. Try creating a container. The pool can be started now, so create should succeed. @@ -269,37 +269,47 @@ def test_lost_majority_ps_replicas(self): :avocado: tags=recovery,cat_recov,pool_list_consolidation :avocado: tags=PoolListConsolidationTest,test_lost_majority_ps_replicas """ - # 1. Create a pool with --nsvc=3. + self.log_step("Create a pool with --nsvc=3.") self.pool = self.get_pool(svcn=3) - # 2. Stop servers. + self.log_step("Stop servers") dmg_command = self.get_dmg_command() dmg_command.system_stop() - # 3. Remove /mnt/daos/<pool_uuid>/rdb-pool from two ranks. - rdb_pool_path = f"/mnt/daos0/{self.pool.uuid.lower()}/rdb-pool" - command = f"sudo rm /mnt/daos0/{self.pool.uuid.lower()}/rdb-pool" + self.log_step("Remove <scm_mount>/<pool_uuid>/rdb-pool from two ranks.") + scm_mount = self.server_managers[0].get_config_value("scm_mount") + rdb_pool_path = f"{scm_mount}/{self.pool.uuid.lower()}/rdb-pool" + command = f"sudo rm {rdb_pool_path}" hosts = list(set(self.server_managers[0].ranks.values())) count = 0 for host in hosts: node = NodeSet(host) check_out = check_file_exists(hosts=node, filename=rdb_pool_path, sudo=True) if check_out[0]: - pcmd(hosts=node, command=command) + if not run_remote(log=self.log, hosts=node, command=command).passed: + self.fail(f'Failed to remove {rdb_pool_path} on {host}') self.log.info("rm rdb-pool from %s", str(node)) count += 1 if count > 1: break - - # 4. Start servers. + using_control_metadata = self.server_managers[0].manager.job.using_control_metadata + if count == 0 or using_control_metadata: + msg = ("MD-on-SSD cluster. Contents under mount point are removed by control plane " + "after system stop.") + self.log.info(msg) + dmg_command.system_start() + # return results in PASS. + return + + self.log_step("Start servers.") dmg_command.system_start() + self.log_step("Run DAOS checker under kinds of mode.") errors = [] - # 5. Run DAOS checker under kinds of mode.
errors = self.chk_dist_checker( inconsistency="corrupted pool without quorum") - # 6. Try creating a container. It should succeed. + self.log_step("Try creating a container. It should succeed.") cont_create_success = False for _ in range(5): time.sleep(5) @@ -315,8 +325,9 @@ def test_lost_majority_ps_replicas(self): if not cont_create_success: errors.append("Container create failed after running checker!") - # 7. Show that rdb-pool are recovered. i.e., at least three out of four ranks - # should have rdb-pool. + msg = ("Show that rdb-pool are recovered. i.e., at least three out of four ranks should " + "have rdb-pool.") + self.log_step(msg) hosts = list(set(self.server_managers[0].ranks.values())) count = 0 for host in hosts: @@ -338,7 +349,7 @@ def test_lost_all_rdb(self): 1. Create a pool. 2. Stop servers. - 3. Remove /mnt/daos0/<pool_uuid>/rdb-pool from all ranks. + 3. Remove <scm_mount>/<pool_uuid>/rdb-pool from all ranks. 4. Start servers. 5. Run DAOS checker under kinds of mode. 6. Check that the pool does not appear with dmg pool list. @@ -351,38 +362,50 @@ def test_lost_all_rdb(self): :avocado: tags=recovery,cat_recov,pool_list_consolidation :avocado: tags=PoolListConsolidationTest,test_lost_all_rdb """ - # 1. Create a pool. + self.log_step("Create a pool.") self.pool = self.get_pool() - # 2. Stop servers. + self.log_step("Stop servers.") dmg_command = self.get_dmg_command() dmg_command.system_stop() - # 3. Remove /mnt/daos/<pool_uuid>/rdb-pool from all ranks. - hosts = list(set(self.server_managers[0].ranks.values())) - nodeset_hosts = NodeSet.fromlist(hosts) - command = f"sudo rm /mnt/daos0/{self.pool.uuid.lower()}/rdb-pool" - remove_result = pcmd(hosts=nodeset_hosts, command=command) - success_nodes = remove_result[0] - if nodeset_hosts != success_nodes: - msg = (f"Failed to remove rdb-pool!
All = {nodeset_hosts}, " + self.log_step("Remove <scm_mount>/<pool_uuid>/rdb-pool from all ranks.") + scm_mount = self.server_managers[0].get_config_value("scm_mount") + rdb_pool_path = f"{scm_mount}/{self.pool.uuid.lower()}/rdb-pool" + rdb_pool_out = check_file_exists( + hosts=self.hostlist_servers, filename=rdb_pool_path, sudo=True) + if not rdb_pool_out[0]: + msg = ("MD-on-SSD cluster. Contents under mount point are removed by control plane " + "after system stop.") + self.log.info(msg) + dmg_command.system_start() + # return results in PASS. + return + command = f"sudo rm {rdb_pool_path}" + remove_result = run_remote(log=self.log, hosts=self.hostlist_servers, command=command) + if not remove_result.passed: + self.fail(f"Failed to remove {rdb_pool_path} from {self.hostlist_servers}") + success_nodes = remove_result.passed_hosts + if self.hostlist_servers != success_nodes: + msg = (f"Failed to remove rdb-pool! All = {self.hostlist_servers}, " f"Success = {success_nodes}") self.fail(msg) # 4. Start servers. + self.log_step("Start servers.") dmg_command.system_start() + self.log_step("Run DAOS checker under kinds of mode.") errors = [] - # 5. Run DAOS checker under kinds of mode. errors = self.chk_dist_checker( inconsistency="corrupted pool without quorum") - # 6. Check that the pool does not appear with dmg pool list. + self.log_step("Check that the pool does not appear with dmg pool list.") pools = dmg_command.get_pool_list_all() if pools: errors.append(f"Pool still exists after running checker! {pools}") - # 7. Verify that the pool directory was removed from the mount point. + self.log_step("Verify that the pool directory was removed from the mount point.") errors = self.verify_pool_dir_removed(errors=errors) # Don't try to destroy the pool during tearDown.
diff --git a/src/tests/ftest/recovery/pool_membership.py b/src/tests/ftest/recovery/pool_membership.py index d9a29377d6e..b8233ac81bf 100644 --- a/src/tests/ftest/recovery/pool_membership.py +++ b/src/tests/ftest/recovery/pool_membership.py @@ -6,7 +6,7 @@ import time from ClusterShell.NodeSet import NodeSet -from general_utils import report_errors +from general_utils import check_file_exists, report_errors from ior_test_base import IorTestBase from run_utils import run_remote @@ -72,7 +72,7 @@ def test_orphan_pool_shard(self): 6. Enable and start the checker. 7. Query the checker and verify that the issue was fixed. i.e., Current status is COMPLETED. - 8. Disable the checker. + 8. Disable the checker and start server. 9. Call dmg storage query usage to verify that the pool usage is back to the original value. @@ -136,6 +136,17 @@ def test_orphan_pool_shard(self): self.log_step("Stop servers.") self.server_managers[0].system_stop() + pool_directory = f"{src_mount}/{pool.uuid.lower()}" + pool_directory_result = check_file_exists( + hosts=self.hostlist_servers, filename=pool_directory, directory=True) + if not pool_directory_result[0]: + msg = ("MD-on-SSD cluster. Contents under mount point are removed by control plane " + "after system stop.") + self.log.info(msg) + dmg_command.system_start() + # return results in PASS. + return + # 5. Copy /mnt/daos?/ from the engine where we created the pool to # another engine where we didn’t create. @@ -213,8 +224,8 @@ def test_orphan_pool_shard(self): errors.append( "Checker didn't fix orphan pool shard! msg = {}".format(query_msg)) - # 8. Disable the checker. - self.log_step("Disable checker.") + # 8. Disable the checker and start server. + self.log_step("Disable the checker and start server.") dmg_command.check_disable() # 9. Call dmg storage query usage to verify that the pool usage is back to the @@ -242,7 +253,7 @@ def test_dangling_pool_map(self): 4. Enable and start the checker. 5. 
Query the checker and verify that the issue was fixed. i.e., Current status is COMPLETED. - 6. Disable the checker. + 6. Disable the checker and start server. 7. Verify that the pool has one less target. Jira ID: DAOS-11736 @@ -252,22 +263,29 @@ def test_dangling_pool_map(self): :avocado: tags=recovery,cat_recov,pool_membership :avocado: tags=PoolMembershipTest,test_dangling_pool_map """ - # 1. Create a pool. - self.log_step("Creating a pool (dmg pool create)") + self.log_step("Create a pool.") pool = self.get_pool(connect=False) - # 2. Stop servers. + self.log_step("Stop servers.") dmg_command = self.get_dmg_command() dmg_command.system_stop() - # 3. Manually remove /<scm_mount>/<pool_uuid>/vos-0 from rank 0 node. + self.log_step("Manually remove /<scm_mount>/<pool_uuid>/vos-0 from rank 0 node.") rank_0_host = NodeSet(self.server_managers[0].get_host(0)) scm_mount = self.server_managers[0].get_config_value("scm_mount") - rm_cmd = f"sudo rm {scm_mount}/{pool.uuid.lower()}/vos-0" + vos_0_path = f"{scm_mount}/{pool.uuid.lower()}/vos-0" + vos_0_result = check_file_exists(hosts=self.hostlist_servers, filename=vos_0_path) + if not vos_0_result[0]: + msg = ("MD-on-SSD cluster. Contents under mount point are removed by control plane " + "after system stop.") + self.log.info(msg) + dmg_command.system_start() + # return results in PASS. + return + rm_cmd = f"sudo rm {vos_0_path}" if not run_remote(log=self.log, hosts=rank_0_host, command=rm_cmd).passed: self.fail(f"Following command failed on {rank_0_host}! {rm_cmd}") - # 4. Enable and start the checker. self.log_step("Enable and start the checker.") dmg_command.check_enable(stop=False) @@ -275,10 +293,9 @@ def test_dangling_pool_map(self): # the fault. Developer is fixing this issue. time.sleep(3) - # Start checker. dmg_command.check_start() - # 5. Query the checker and verify that the issue was fixed.
+ self.log_step("Query the checker and verify that the issue was fixed.") repair_reports = self.wait_for_check_complete() errors = [] @@ -287,11 +304,10 @@ def test_dangling_pool_map(self): errors.append( "Checker didn't fix orphan pool shard! msg = {}".format(query_msg)) - # 6. Disable the checker. - self.log_step("Disable checker.") + self.log_step("Disable the checker and start server.") dmg_command.check_disable() - # 7. Verify that the pool has one less target. + self.log_step("Verify that the pool has one less target.") query_out = pool.query() total_targets = query_out["response"]["total_targets"] active_targets = query_out["response"]["active_targets"] @@ -325,30 +341,35 @@ def test_dangling_rank_entry(self): targets = self.params.get("targets", "/run/server_config/engines/0/*") exp_msg = "dangling rank entry" - # 1. Create a pool and a container. self.log_step("Create a pool and a container.") self.pool = self.get_pool(connect=False) self.container = self.get_container(pool=self.pool) - # 2. Write some data with IOR using SX. self.log_step("Write some data with IOR.") self.ior_cmd.set_daos_params(self.pool, self.container.identifier) self.run_ior_with_pool(create_pool=False, create_cont=False) - # 3. Stop servers. self.log_step("Stop servers.") dmg_command = self.get_dmg_command() dmg_command.system_stop() - # 4. Remove pool directory from one of the mount points. self.log_step("Remove pool directory from one of the mount points.") rank_1_host = NodeSet(self.server_managers[0].get_host(1)) scm_mount = self.server_managers[0].get_config_value("scm_mount") - rm_cmd = f"sudo rm -rf {scm_mount}/{self.pool.uuid.lower()}" + pool_directory = f"{scm_mount}/{self.pool.uuid.lower()}" + pool_directory_result = check_file_exists( + hosts=self.hostlist_servers, filename=pool_directory, directory=True) + if not pool_directory_result[0]: + msg = ("MD-on-SSD cluster. 
Contents under mount point are removed by control plane " + "after system stop.") + self.log.info(msg) + dmg_command.system_start() + # return results in PASS. + return + rm_cmd = f"sudo rm -rf {pool_directory}" if not run_remote(log=self.log, hosts=rank_1_host, command=rm_cmd).passed: self.fail(f"Following command failed on {rank_1_host}! {rm_cmd}") - # 5. Enable checker. self.log_step("Enable checker.") dmg_command.check_enable(stop=False) @@ -356,11 +377,9 @@ def test_dangling_rank_entry(self): # the fault. Developer is fixing this issue. time.sleep(3) - # 6. Start checker. self.log_step("Start checker.") dmg_command.check_start() - # 7. Query the checker until expected number of inconsistencies are repaired. self.log_step( "Query the checker until expected number of inconsistencies are repaired.") repair_reports = self.wait_for_check_complete() @@ -384,7 +403,6 @@ def test_dangling_rank_entry(self): if not exp_msg_found: errors.append(f"{exp_msg} not in repair message!") - # 8. Disable checker. 
self.log_step("Disable checker.") dmg_command.check_disable() diff --git a/src/tests/ftest/recovery/pool_membership.yaml b/src/tests/ftest/recovery/pool_membership.yaml index b9f854e4d25..c18f0539c2c 100644 --- a/src/tests/ftest/recovery/pool_membership.yaml +++ b/src/tests/ftest/recovery/pool_membership.yaml @@ -31,7 +31,7 @@ container: ior: client_processes: ppn: 1 - flags: -k -D 10 -v -w -W + flags: -k -v -w -W api: DFS transfer_size: 1M block_size: 10G diff --git a/src/tests/ftest/util/apricot/apricot/test.py b/src/tests/ftest/util/apricot/apricot/test.py index 4587b84a053..5934aa72cca 100644 --- a/src/tests/ftest/util/apricot/apricot/test.py +++ b/src/tests/ftest/util/apricot/apricot/test.py @@ -665,8 +665,10 @@ def __init__(self, *args, **kwargs): self.config_file_base = "test" self.log_dir = os.path.split( os.getenv("D_LOG_FILE", "/tmp/server.log"))[0] - # whether engines ULT stacks have been already dumped - self.dumped_engines_stacks = False + # Whether to dump engines ULT stacks on failure + self.__dump_engine_ult_on_failure = True + # Whether engines ULT stacks have been already dumped + self.__have_dumped_ult_stacks = False # Suffix to append to each access point name self.access_points_suffix = None @@ -738,6 +740,10 @@ def setUp(self): self.access_points, self.access_points_suffix) self.host_info.access_points = self.access_points + # Toggle whether to dump server ULT stacks on failure + self.__dump_engine_ult_on_failure = self.params.get( + "dump_engine_ult_on_failure", "/run/setup/*", True) + # # Find a configuration that meets the test requirements # self.config = Configuration( # self.params, self.hostlist_servers, debug=self.debug) @@ -1344,14 +1350,14 @@ def remove_temp_test_dir(self): errors.append("Error removing temporary test files on {}".format(result.failed_hosts)) return errors - def dump_engines_stacks(self, message): + def __dump_engines_stacks(self, message): """Dump the engines ULT stacks. 
Args: - message (str): reason for dumping the ULT stacks. Defaults to None. + message (str): reason for dumping the ULT stacks """ - if self.dumped_engines_stacks is False: - self.dumped_engines_stacks = True + if self.__dump_engine_ult_on_failure and not self.__have_dumped_ult_stacks: + self.__have_dumped_ult_stacks = True self.log.info("%s, dumping ULT stacks", message) dump_engines_stacks(self.hostlist_servers) @@ -1360,17 +1366,17 @@ def report_timeout(self): super().report_timeout() if self.timeout is not None and self.time_elapsed > self.timeout: # dump engines ULT stacks upon test timeout - self.dump_engines_stacks("Test has timed-out") + self.__dump_engines_stacks("Test has timed-out") def fail(self, message=None): """Dump engines ULT stacks upon test failure.""" - self.dump_engines_stacks("Test has failed") + self.__dump_engines_stacks("Test has failed") super().fail(message) def error(self, message=None): # pylint: disable=arguments-renamed """Dump engines ULT stacks upon test error.""" - self.dump_engines_stacks("Test has errored") + self.__dump_engines_stacks("Test has errored") super().error(message) def tearDown(self): @@ -1383,7 +1389,7 @@ def tearDown(self): # class (see DAOS-1452/DAOS-9941 and Avocado issue #5217 with # associated PR-5224) if self.status is not None and self.status != 'PASS' and self.status != 'SKIP': - self.dump_engines_stacks("Test status is {}".format(self.status)) + self.__dump_engines_stacks("Test status is {}".format(self.status)) # Report whether or not the timeout has expired self.report_timeout() @@ -1595,7 +1601,7 @@ def stop_servers(self): "ERROR: At least one multi-variant server was not found in " "its expected state; stopping all servers") # dump engines stacks if not already done - self.dump_engines_stacks("Some engine not in expected state") + self.__dump_engines_stacks("Some engine not in expected state") self.test_log.info( "Stopping %s group(s) of servers", len(self.server_managers)) 
errors.extend(self._stop_managers(self.server_managers, "servers"))