From d5037fe8d4884616de8ed6d586cff3844c2590da Mon Sep 17 00:00:00 2001 From: Tom Nabarro Date: Fri, 12 Jan 2024 15:39:13 +0000 Subject: [PATCH 01/18] DAOS-14852 test: Fix JSON key lookup in NVMe space usage query (#13594) Signed-off-by: Tom Nabarro --- src/tests/ftest/util/dmg_utils.py | 28 ++++++++++++++++---- src/tests/ftest/util/pool_create_all_base.py | 15 +++++------ 2 files changed, 29 insertions(+), 14 deletions(-) diff --git a/src/tests/ftest/util/dmg_utils.py b/src/tests/ftest/util/dmg_utils.py index 7d336c4304d..53913ff934a 100644 --- a/src/tests/ftest/util/dmg_utils.py +++ b/src/tests/ftest/util/dmg_utils.py @@ -406,6 +406,8 @@ def storage_query_usage(self): # "serial": "CVFT534200AY400BGN", # "pci_addr": "0000:05:00.0", # "fw_rev": "8DV10131", + # "vendor_id": "0x8086", + # "pci_type": "", # "socket_id": 0, # "health_stats": null, # "namespaces": [ @@ -416,7 +418,7 @@ def storage_query_usage(self): # ], # "smd_devices": [ # { - # "dev_state": "NORMAL", + # "role_bits": 0, # "uuid": "259608d1-c469-4684-9986-9f7708b20ca3", # "tgt_ids": [ 0, 1, 2, 3, 4, 5, 6, 7 ], # "rank": 0, @@ -428,12 +430,28 @@ def storage_query_usage(self): # "meta_wal_size": 0, # "rdb_size": 134217728, # "rdb_wal_size": 268435456, - # "health": null, - # "tr_addr": "0000:05:00.0", - # "roles": "data", - # "has_sys_xs": false + # "roles": "NA", + # "has_sys_xs": false, + # "ctrlr": { + # "info": "", + # "model": "", + # "serial": "", + # "pci_addr": "", + # "fw_rev": "", + # "vendor_id": "", + # "pci_type": "", + # "socket_id": 0, + # "health_stats": null, + # "namespaces": null, + # "smd_devices": null, + # "dev_state": "UNKNOWN", + # "led_state": "OFF" + # }, + # "ctrlr_namespace_id": 1 # } # ] + # "dev_state": "NORMAL", + # "led_state": "NA", # } # ], # "scm_modules": null, diff --git a/src/tests/ftest/util/pool_create_all_base.py b/src/tests/ftest/util/pool_create_all_base.py index 5f4bee9a1c0..f3b099f5124 100644 --- a/src/tests/ftest/util/pool_create_all_base.py +++ 
b/src/tests/ftest/util/pool_create_all_base.py @@ -60,10 +60,8 @@ def get_usable_bytes(self): nvme_bytes = 0 for nvme_device in host_storage["storage"]["nvme_devices"]: - if nvme_device["smd_devices"] is None: - continue - for smd_device in nvme_device["smd_devices"]: - if smd_device["ctrlr"]["dev_state"] == "NORMAL": + if nvme_device["dev_state"] == "NORMAL": + for smd_device in (nvme_device["smd_devices"] or []): nvme_bytes += smd_device["usable_bytes"] nvme_engine_bytes = min(nvme_engine_bytes, nvme_bytes) @@ -310,11 +308,10 @@ def check_pool_distribution(self, scm_delta_bytes, nvme_delta_bytes=None): nvme_bytes = 0 for nvme_device in host_storage["storage"]["nvme_devices"]: - for smd_device in nvme_device["smd_devices"]: - if smd_device["ctrlr"]["dev_state"] != "NORMAL": - continue - nvme_bytes += smd_device["total_bytes"] - nvme_bytes -= smd_device["avail_bytes"] + if nvme_device["dev_state"] == "NORMAL": + for smd_device in (nvme_device["smd_devices"] or []): + nvme_bytes += smd_device["total_bytes"] + nvme_bytes -= smd_device["avail_bytes"] if nvme_bytes < nvme_used_bytes[0]: nvme_used_bytes[0] = nvme_bytes if nvme_bytes > nvme_used_bytes[1]: From 217aab97f580eae2c06e63256fe380c850e42ca0 Mon Sep 17 00:00:00 2001 From: Alexander Oganezov Date: Fri, 12 Jan 2024 10:40:46 -0800 Subject: [PATCH 02/18] DAOS-14966 cart: Fix incorrect opc logging (#13591) - Fix macro to generate case statement correctly Signed-off-by: Alexander A Oganezov --- src/cart/crt_rpc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/cart/crt_rpc.c b/src/cart/crt_rpc.c index e6fa9dd4705..62efb48e8e3 100644 --- a/src/cart/crt_rpc.c +++ b/src/cart/crt_rpc.c @@ -264,7 +264,8 @@ crt_opc_decode(crt_opcode_t crt_opc, char **module_name, char **opc_name) /* Redefining X macro allows to reuse existing lists */ #define X(a, ...) 
\ case a: \ - opc = #a; + opc = #a; \ + break; /* Next find the opcode name if available for the module */ if (cart_module) { From b83f05c5f1fbefb8a5a867526285ff4034eeb54b Mon Sep 17 00:00:00 2001 From: wangdi Date: Tue, 16 Jan 2024 09:59:37 -0800 Subject: [PATCH 03/18] DAOS-14972 pool: Only allow UPIN engine as PS replicas (#13593) Only allow UPIN engine as PS replicas to avoid IV leader switch hassles during rebuild for the moment. Signed-off-by: Di Wang --- src/pool/srv_internal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pool/srv_internal.h b/src/pool/srv_internal.h index 0d9854fd436..c95f15cc715 100644 --- a/src/pool/srv_internal.h +++ b/src/pool/srv_internal.h @@ -20,7 +20,7 @@ #define POOL_GROUP_MAP_STATES (PO_COMP_ST_UP | PO_COMP_ST_UPIN | PO_COMP_ST_DRAIN) /* Map states of ranks that make up the pool service */ -#define POOL_SVC_MAP_STATES (PO_COMP_ST_UP | PO_COMP_ST_UPIN) +#define POOL_SVC_MAP_STATES (PO_COMP_ST_UPIN) /* * Since we want all PS replicas to belong to the pool group, From 0e08ed011e62112575cbec943d1ffd228b4cd976 Mon Sep 17 00:00:00 2001 From: Cedric Koch-Hofer <94527853+knard-intel@users.noreply.github.com> Date: Tue, 16 Jan 2024 19:37:16 +0100 Subject: [PATCH 04/18] DAOS-14896 gurt: Fix d_getenv with negative int (#13586) Fix regression of d_getenv_xxx() functions used to retrieve int environment variables: support of strings representing signed integers.
Signed-off-by: Cedric Koch-Hofer --- src/gurt/misc.c | 90 ++++++++++++++------------- src/gurt/tests/test_gurt.c | 123 +++++++++++++++++++++++++++++-------- 2 files changed, 143 insertions(+), 70 deletions(-) diff --git a/src/gurt/misc.c b/src/gurt/misc.c index de0a1ae6fd7..ffb1a85bb0d 100644 --- a/src/gurt/misc.c +++ b/src/gurt/misc.c @@ -25,8 +25,6 @@ #include #include -#define UINT64_MAX_STR "18446744073709551615" - /* state buffer for DAOS rand and srand calls, NOT thread safe */ static struct drand48_data randBuffer = {0}; @@ -951,18 +949,17 @@ d_rank_range_list_free(d_rank_range_list_t *range_list) } static inline bool -dis_unsigned_str(char *str) +dis_signed_str(char *str) { - char *eos; - - if (str == NULL || str[0] == '\0') - return false; + char *eos; + size_t str_size; - eos = str + (sizeof(UINT64_MAX_STR) - 1); - while (str != eos && *str != '\0' && *str >= '0' && *str <= '9') + str_size = strlen(str); + eos = str + str_size; + while (str != eos && *str != '-' && (*str < '0' || *str > '9')) ++str; - return *str == '\0'; + return *str == '-'; } static inline bool @@ -1214,40 +1211,68 @@ d_getenv_char(const char *name, char *char_val) } static int -d_getenv_ull(unsigned long long *val, const char *name) +d_getenv_ull(unsigned long long *val, const char *name, size_t val_size) { char *env; + char *env_tmp = NULL; char *endptr; - unsigned long long tmp; + unsigned long long val_tmp; int rc; assert(val != NULL); assert(name != NULL); + assert(val_size <= sizeof(unsigned long long)); d_env_rwlock_rdlock(); env = getenv(name); if (env == NULL) { rc = -DER_NONEXIST; + d_env_rwlock_unlock(); goto out; } - if (!dis_unsigned_str(env)) { - rc = -DER_INVAL; + /* DAOS-14896 NOTES: + * - Duplicate env to reduce data race condition with external libraries not using the DAOS + * thread safe environment variables management API. + * - Use of strdup() as there is no limit to environment variable size. 
+ */ + env_tmp = strdup(env); + if (env_tmp == NULL) { + rc = -DER_NOMEM; + d_env_rwlock_unlock(); goto out; } + d_env_rwlock_unlock(); - errno = 0; - tmp = strtoull(env, &endptr, 0); - if (errno != 0 || endptr == env || *endptr != '\0') { + errno = 0; + val_tmp = strtoull(env_tmp, &endptr, 10); + if (errno != 0 || endptr == env_tmp || *endptr != '\0') { rc = -DER_INVAL; goto out; } - *val = tmp; + if (val_size != sizeof(unsigned long long)) { + const unsigned long long val_max = (1ull << val_size * 8) - 1; + const bool is_signed = dis_signed_str(env_tmp); + + if (is_signed) + val_tmp = ~val_tmp; + if (val_tmp > val_max || (is_signed && val_tmp >= val_max)) { + rc = -DER_INVAL; + goto out; + } + if (is_signed) { + val_tmp = ~val_tmp; + val_tmp <<= (sizeof(unsigned long long) - val_size) * 8; + val_tmp >>= (sizeof(unsigned long long) - val_size) * 8; + } + } + + *val = val_tmp; rc = -DER_SUCCESS; out: - d_env_rwlock_unlock(); + free(env_tmp); return rc; } @@ -1269,17 +1294,10 @@ d_getenv_uint(const char *name, unsigned *uint_val) assert(uint_val != NULL); assert(name != NULL); - rc = d_getenv_ull(&tmp, name); + rc = d_getenv_ull(&tmp, name, sizeof(unsigned)); if (rc != -DER_SUCCESS) return rc; -#if UINT_MAX != ULLONG_MAX - assert(sizeof(unsigned) < sizeof(unsigned long long)); - if (tmp > UINT_MAX) { - return -DER_INVAL; - } -#endif - *uint_val = (unsigned)tmp; return -DER_SUCCESS; } @@ -1301,17 +1319,10 @@ d_getenv_uint32_t(const char *name, uint32_t *uint32_val) assert(uint32_val != NULL); assert(name != NULL); - rc = d_getenv_ull(&tmp, name); + rc = d_getenv_ull(&tmp, name, sizeof(uint32_t)); if (rc != -DER_SUCCESS) return rc; -#if UINT32_MAX != ULLONG_MAX - assert(sizeof(uint32_t) < sizeof(unsigned long long)); - if (tmp > UINT32_MAX) { - return -DER_INVAL; - } -#endif - *uint32_val = (uint32_t)tmp; return -DER_SUCCESS; } @@ -1333,17 +1344,10 @@ d_getenv_uint64_t(const char *name, uint64_t *uint64_val) assert(uint64_val != NULL); assert(name != NULL); - rc = 
d_getenv_ull(&tmp, name); + rc = d_getenv_ull(&tmp, name, sizeof(uint64_t)); if (rc != -DER_SUCCESS) return rc; -#if UINT64_MAX != ULLONG_MAX - assert(sizeof(uint64_t) < sizeof(unsigned long long)); - if (tmp > UINT64_MAX) { - return -DER_INVAL; - } -#endif - *uint64_val = (uint64_t)tmp; return -DER_SUCCESS; } diff --git a/src/gurt/tests/test_gurt.c b/src/gurt/tests/test_gurt.c index 49db0a883e6..ebb9a0ec701 100644 --- a/src/gurt/tests/test_gurt.c +++ b/src/gurt/tests/test_gurt.c @@ -2288,35 +2288,63 @@ test_d_getenv_uint(void **state) assert_int_equal(rc, -DER_SUCCESS); assert_true(val == UINT_MAX); - getenv_return = "42"; + getenv_return = "-1"; + rc = d_getenv_uint("foo", &val); + assert_int_equal(rc, -DER_SUCCESS); + assert_true(val == UINT_MAX); + + getenv_return = "-10"; + rc = d_getenv_uint("foo", &val); + assert_int_equal(rc, -DER_SUCCESS); + assert_true(val == UINT_MAX - 9); + + getenv_return = "-4294967294"; + rc = d_getenv_uint("foo", &val); + assert_true(val == 2); + + getenv_return = "-4294967295"; + rc = d_getenv_uint("foo", &val); + assert_true(val == 1); + + getenv_return = " 000042"; rc = d_getenv_uint("foo", &val); assert_int_equal(rc, -DER_SUCCESS); assert_true(val == 42); + getenv_return = " -000042"; + rc = d_getenv_uint("foo", &val); + assert_int_equal(rc, -DER_SUCCESS); + assert_true(val == -42); + getenv_return = "4294967296"; rc = d_getenv_uint("foo", &val); assert_int_equal(rc, -DER_INVAL); - assert_true(val == 42); + assert_true(val == -42); - getenv_return = "-42"; + getenv_return = "-4294967296"; rc = d_getenv_uint("foo", &val); assert_int_equal(rc, -DER_INVAL); - assert_true(val == 42); + assert_true(val == -42); getenv_return = "booo"; rc = d_getenv_uint("foo", &val); assert_int_equal(rc, -DER_INVAL); - assert_true(val == 42); + assert_true(val == -42); getenv_return = "42booo"; rc = d_getenv_uint("foo", &val); assert_int_equal(rc, -DER_INVAL); - assert_true(val == 42); + assert_true(val == -42); + + getenv_return = ""; + rc = 
d_getenv_uint("foo", &val); + assert_int_equal(rc, -DER_INVAL); + assert_true(val == -42); getenv_return = NULL; rc = d_getenv_uint("foo", &val); assert_int_equal(rc, -DER_NONEXIST); - assert_true(val == 42); + assert_true(val == -42); } static void @@ -2330,40 +2358,63 @@ test_d_getenv_uint32_t(void **state) assert_int_equal(rc, -DER_SUCCESS); assert_true(val == UINT32_MAX); - getenv_return = "42"; + getenv_return = "-1"; + rc = d_getenv_uint32_t("foo", &val); + assert_int_equal(rc, -DER_SUCCESS); + assert_true(val == UINT32_MAX); + + getenv_return = "-10"; + rc = d_getenv_uint32_t("foo", &val); + assert_int_equal(rc, -DER_SUCCESS); + assert_true(val == UINT32_MAX - 9); + + getenv_return = "-4294967294"; + rc = d_getenv_uint32_t("foo", &val); + assert_true(val == 2); + + getenv_return = "-4294967295"; + rc = d_getenv_uint32_t("foo", &val); + assert_true(val == 1); + + getenv_return = " 000042"; rc = d_getenv_uint32_t("foo", &val); assert_int_equal(rc, -DER_SUCCESS); assert_true(val == 42); + getenv_return = " -000042"; + rc = d_getenv_uint32_t("foo", &val); + assert_int_equal(rc, -DER_SUCCESS); + assert_true(val == -42); + getenv_return = "4294967296"; rc = d_getenv_uint32_t("foo", &val); assert_int_equal(rc, -DER_INVAL); - assert_true(val == 42); + assert_true(val == -42); - getenv_return = "-42"; + getenv_return = "-4294967296"; rc = d_getenv_uint32_t("foo", &val); assert_int_equal(rc, -DER_INVAL); - assert_true(val == 42); + assert_true(val == -42); getenv_return = "booo"; rc = d_getenv_uint32_t("foo", &val); assert_int_equal(rc, -DER_INVAL); - assert_true(val == 42); + assert_true(val == -42); getenv_return = "42booo"; rc = d_getenv_uint32_t("foo", &val); assert_int_equal(rc, -DER_INVAL); - assert_true(val == 42); + assert_true(val == -42); getenv_return = ""; rc = d_getenv_uint32_t("foo", &val); assert_int_equal(rc, -DER_INVAL); - assert_true(val == 42); + assert_true(val == -42); getenv_return = NULL; rc = d_getenv_uint32_t("foo", &val); assert_int_equal(rc, 
-DER_NONEXIST); - assert_true(val == 42); + assert_true(val == -42); } static void @@ -2377,45 +2428,63 @@ test_d_getenv_uint64_t(void **state) assert_int_equal(rc, -DER_SUCCESS); assert_true(val == UINT64_MAX); - getenv_return = "42"; + getenv_return = "-1"; rc = d_getenv_uint64_t("foo", &val); assert_int_equal(rc, -DER_SUCCESS); - assert_true(val == 42); + assert_true(val == UINT64_MAX); - getenv_return = "18446744073709551616"; + getenv_return = "-10"; rc = d_getenv_uint64_t("foo", &val); - assert_int_equal(rc, -DER_INVAL); + assert_int_equal(rc, -DER_SUCCESS); + assert_true(val == UINT64_MAX - 9); + + getenv_return = "-18446744073709551614"; + rc = d_getenv_uint64_t("foo", &val); + assert_true(val == 2); + + getenv_return = "-18446744073709551615"; + rc = d_getenv_uint64_t("foo", &val); + assert_true(val == 1); + + getenv_return = " 000042"; + rc = d_getenv_uint64_t("foo", &val); + assert_int_equal(rc, -DER_SUCCESS); assert_true(val == 42); - getenv_return = "012345678901234567890"; + getenv_return = " -000042"; + rc = d_getenv_uint64_t("foo", &val); + assert_int_equal(rc, -DER_SUCCESS); + assert_true(val == -42); + + getenv_return = "18446744073709551616"; rc = d_getenv_uint64_t("foo", &val); assert_int_equal(rc, -DER_INVAL); - assert_true(val == 42); + assert_true(val == -42); - getenv_return = "-42"; + getenv_return = "-18446744073709551616"; rc = d_getenv_uint64_t("foo", &val); assert_int_equal(rc, -DER_INVAL); - assert_true(val == 42); + assert_true(val == -42); getenv_return = "booo"; rc = d_getenv_uint64_t("foo", &val); assert_int_equal(rc, -DER_INVAL); - assert_true(val == 42); + assert_true(val == -42); getenv_return = "42booo"; rc = d_getenv_uint64_t("foo", &val); assert_int_equal(rc, -DER_INVAL); - assert_true(val == 42); + assert_true(val == -42); getenv_return = ""; rc = d_getenv_uint64_t("foo", &val); assert_int_equal(rc, -DER_INVAL); - assert_true(val == 42); + assert_true(val == -42); getenv_return = NULL; rc = d_getenv_uint64_t("foo", &val); 
assert_int_equal(rc, -DER_NONEXIST); - assert_true(val == 42); + assert_true(val == -42); } static void From 8ca9d227bb69067187a21b434940c2fc873a5403 Mon Sep 17 00:00:00 2001 From: Dalton Bohning Date: Tue, 16 Jan 2024 10:47:30 -0800 Subject: [PATCH 05/18] DAOS-14594 test: fix dfs_parallel xml generation (#13312) Since cmocka is not MPI-aware, force all ranks other than rank 0 to write to stdout to avoid race conditions with the XML file. Signed-off-by: Dalton Bohning --- src/tests/suite/dfs_test.c | 33 ++++++++++++++++++++++----------- 1 file changed, 22 insertions(+), 11 deletions(-) diff --git a/src/tests/suite/dfs_test.c b/src/tests/suite/dfs_test.c index f35e9df0f7b..217f30ad178 100644 --- a/src/tests/suite/dfs_test.c +++ b/src/tests/suite/dfs_test.c @@ -1,5 +1,5 @@ /** - * (C) Copyright 2019-2022 Intel Corporation. + * (C) Copyright 2019-2024 Intel Corporation. * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -81,16 +81,17 @@ run_specified_tests(const char *tests, int rank, int size, int main(int argc, char **argv) { - test_arg_t *arg; - char tests[64]; - char *exclude_str = NULL; - int ntests = 0; - int nr_failed = 0; - int nr_total_failed = 0; - int opt = 0, index = 0; - int rank; - int size; - int rc; + test_arg_t *arg; + char tests[64]; + char *exclude_str = NULL; + char *cmocka_message_output = NULL; + int ntests = 0; + int nr_failed = 0; + int nr_total_failed = 0; + int opt = 0, index = 0; + int rank; + int size; + int rc; d_register_alt_assert(mock_assert); @@ -166,6 +167,16 @@ main(int argc, char **argv) tests[new_idx] = '\0'; } + /** if writing XML, force all ranks other than rank 0 to use stdout to avoid conflicts */ + cmocka_message_output = getenv("CMOCKA_MESSAGE_OUTPUT"); + if (rank != 0 && cmocka_message_output && strcasecmp(cmocka_message_output, "xml") == 0) { + rc = d_setenv("CMOCKA_MESSAGE_OUTPUT", "stdout", 1); + if (rc) { + print_message("d_setenv() failed with %d\n", rc); + return -1; + } + } + nr_failed = run_specified_tests(tests, 
rank, size, NULL, 0); exit: From 8599cebf0d672a8e6dda4b566b416f25a14b2ffc Mon Sep 17 00:00:00 2001 From: wangdi Date: Wed, 17 Jan 2024 04:38:22 -0800 Subject: [PATCH 06/18] DAOS-14845 rebuild: do not wait for EC agg for reclaim (#13610) Do not need wait for EC aggregation for reclaim operation, which does not involve fetch and update. Signed-off-by: Di Wang --- src/rebuild/scan.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/rebuild/scan.c b/src/rebuild/scan.c index 4c2c78c4bee..352459a2d84 100644 --- a/src/rebuild/scan.c +++ b/src/rebuild/scan.c @@ -1,5 +1,5 @@ /** - * (C) Copyright 2017-2023 Intel Corporation. + * (C) Copyright 2017-2024 Intel Corporation. * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -882,12 +882,13 @@ rebuild_container_scan_cb(daos_handle_t ih, vos_iter_entry_t *entry, } /* Wait for EC aggregation to finish. NB: migrate needs to wait for EC aggregation to finish */ - while (cont_child->sc_ec_agg_active) { + while (cont_child->sc_ec_agg_active && + rpt->rt_rebuild_op != RB_OP_RECLAIM && + rpt->rt_rebuild_op != RB_OP_FAIL_RECLAIM) { D_ASSERTF(rpt->rt_pool->sp_rebuilding >= 0, DF_UUID" rebuilding %d\n", DP_UUID(rpt->rt_pool_uuid), rpt->rt_pool->sp_rebuilding); /* Wait for EC aggregation to abort before discard the object */ - D_DEBUG(DB_REBUILD, DF_UUID" wait for ec agg abort.\n", - DP_UUID(entry->ie_couuid)); + D_INFO(DF_UUID" wait for ec agg abort.\n", DP_UUID(entry->ie_couuid)); dss_sleep(1000); if (rpt->rt_abort || rpt->rt_finishing) { D_DEBUG(DB_REBUILD, DF_CONT" rebuild op %s ver %u abort %u/%u.\n", From fd9d630462fff3f57be732bbf05dc1102d69a7d1 Mon Sep 17 00:00:00 2001 From: Phil Henderson Date: Wed, 17 Jan 2024 07:42:26 -0500 Subject: [PATCH 07/18] DAOS-14969 test: Increase crt_timeout for test_daos_oid_allocator (#13599) * DAOS-14969 test: Increase crt_timeout for test_daos_oid_allocator Temporarily increase the crt_timeout for the test_daos_oid_allocator test to 60 seconds. 
Signed-off-by: Phil Henderson --- src/tests/ftest/daos_test/suite.yaml | 2 ++ src/tests/ftest/util/daos_core_base.py | 10 +++++++++- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/src/tests/ftest/daos_test/suite.yaml b/src/tests/ftest/daos_test/suite.yaml index afcc048f965..5d08fb4493d 100644 --- a/src/tests/ftest/daos_test/suite.yaml +++ b/src/tests/ftest/daos_test/suite.yaml @@ -191,3 +191,5 @@ daos_tests: test_daos_extend_simple: 5 test_daos_rebuild_ec: 43 test_daos_degraded_ec: 29 + crt_timeout: + test_daos_oid_allocator: 60 diff --git a/src/tests/ftest/util/daos_core_base.py b/src/tests/ftest/util/daos_core_base.py index 9bf0ff4c501..1baa93b91b4 100644 --- a/src/tests/ftest/util/daos_core_base.py +++ b/src/tests/ftest/util/daos_core_base.py @@ -1,5 +1,5 @@ """ - (C) Copyright 2018-2023 Intel Corporation. + (C) Copyright 2018-2024 Intel Corporation. SPDX-License-Identifier: BSD-2-Clause-Patent """ @@ -96,6 +96,14 @@ def start_server_managers(self, force=False): ["=".join(items) for items in list(env_dict.items())] ) + # Update any other server settings unique to this test method + for setting in ["crt_timeout"]: + value = self.get_test_param(setting) + if value: + for server_mgr in self.server_managers: + for engine_params in server_mgr.manager.job.yaml.engine_params: + engine_params.set_value(setting, value) + # Start the servers return super().start_server_managers(force=force) From a0503a0d1267b014ded36ef8217ada87686346ef Mon Sep 17 00:00:00 2001 From: wangdi Date: Wed, 17 Jan 2024 09:11:12 -0800 Subject: [PATCH 08/18] DAOS-14965 tests: using correct rd_fac in online_rebuild_single (#13600) Use rd_fac:2 in online_rebuild_single Use svcn: 5 Signed-off-by: Di Wang --- src/tests/ftest/erasurecode/online_rebuild_single.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/tests/ftest/erasurecode/online_rebuild_single.yaml b/src/tests/ftest/erasurecode/online_rebuild_single.yaml index bda14dbb9e5..8b0b3f4baf0 100644 --- 
a/src/tests/ftest/erasurecode/online_rebuild_single.yaml +++ b/src/tests/ftest/erasurecode/online_rebuild_single.yaml @@ -30,9 +30,10 @@ server_config: storage: auto pool: size: 93% - svcn: 1 + svcn: 5 control_method: dmg pool_query_timeout: 30 + properties: rd_fac:2 container: type: POSIX control_method: API From da658ad7c603f768bc60d1206e55e0bced68bb8f Mon Sep 17 00:00:00 2001 From: "Brian J. Murrell" Date: Wed, 17 Jan 2024 16:36:52 -0500 Subject: [PATCH 09/18] DAOS-14440 build: Update distro versions in GHA (#13608) The supported distro versions got missed in 56e3228d36 for the GitHub Actions workflow. Signed-off-by: Brian J. Murrell --- .github/workflows/rpm-build-and-test.yml | 4 ++-- ci/functional/test_main.sh | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/rpm-build-and-test.yml b/.github/workflows/rpm-build-and-test.yml index bbad2b715b4..232b3cbd971 100644 --- a/.github/workflows/rpm-build-and-test.yml +++ b/.github/workflows/rpm-build-and-test.yml @@ -5,10 +5,10 @@ env: # build is done on the lowest version and test on the highest with a "sanity test" # stage done on all versions in the list ecept the highest EL8_BUILD_VERSION: 8.6 - EL8_VERSION: 8 + EL8_VERSION: 8.8 EL9_BUILD_VERSION: 9 EL9_VERSION: 9 - LEAP15_VERSION: 15.4 + LEAP15_VERSION: 15.5 on: workflow_dispatch: diff --git a/ci/functional/test_main.sh b/ci/functional/test_main.sh index d318b3601e3..56fe36f8571 100755 --- a/ci/functional/test_main.sh +++ b/ci/functional/test_main.sh @@ -45,7 +45,7 @@ test_cluster() { FIRST_NODE=${first_node} \ TEST_RPMS=${TEST_RPMS} \ NODELIST=${tnodes} \ - BUILD_URL=\"$BUILD_URL\" \ + BUILD_URL=\"${BUILD_URL:-Unknown in GHA}\" \ STAGE_NAME=\"$STAGE_NAME\" \ $(cat ci/functional/test_main_prep_node.sh)" } From cf10b98fcddc30b8278b837cc9c258333512da45 Mon Sep 17 00:00:00 2001 From: Dalton Bohning Date: Thu, 18 Jan 2024 12:24:30 -0800 Subject: [PATCH 10/18] DAOS-14831 test: use scm and nvme for ior/hard_rebuild (#13551) - Use 
both SCM and NVMe when creating a pool to accommodate MD on SSD. - Use GX so space is spread across targets. Signed-off-by: Dalton Bohning --- src/tests/ftest/ior/hard_rebuild.yaml | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/src/tests/ftest/ior/hard_rebuild.yaml b/src/tests/ftest/ior/hard_rebuild.yaml index a3a5f5f5444..97137c97b46 100644 --- a/src/tests/ftest/ior/hard_rebuild.yaml +++ b/src/tests/ftest/ior/hard_rebuild.yaml @@ -31,34 +31,28 @@ server_config: log_file: daos_server1.log log_mask: ERR storage: auto -create_pool_max_size: - scm: true - percentage: 90 pool: - control_method: dmg + size: 90% container: type: POSIX control_method: daos - properties: dedup:memcmp ior: api: "DFS" client_processes: np: 32 - dfs_destroy: false iorflags: flags: "-C -k -e -w -g -G 27 -D 120 -Q 1 -vv" read_flags: "-C -k -e -r -R -g -G 27 -D 120 -Q 1 -vv" test_file: daos:testFile segment_count: 2000000 - repetitions: 1 chunk_block_transfer_sizes: # [ChunkSize, BlocksSize, TransferSize] - [47008, 47008, 47008] objectclass: dfs_oclass_list: # - [EC_Object_Class, Minimum number of servers] - - ["EC_2P2G1", 6] - - ["EC_4P2G1", 8] - - ["EC_8P2G1", 12] + - ["EC_2P2GX", 6] + - ["EC_4P2GX", 8] + - ["EC_8P2GX", 12] sw_wearout: 1 sw_status_file: "/var/tmp/daos_testing/stoneWallingStatusFile" From 91b93c84e8159b275baa42b0a4507229bb3a91a6 Mon Sep 17 00:00:00 2001 From: Liu Xuezhao Date: Fri, 19 Jan 2024 07:22:11 +0800 Subject: [PATCH 11/18] DAOS-13252 tests: set svcn for multiple_failure test (#13619) The original setting of 1 cannot work if the killed rank happens to be the only server replica.
Signed-off-by: Xuezhao Liu --- src/tests/ftest/erasurecode/multiple_failure.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tests/ftest/erasurecode/multiple_failure.yaml b/src/tests/ftest/erasurecode/multiple_failure.yaml index dbd63f69bbe..73ceb3bfdc0 100644 --- a/src/tests/ftest/erasurecode/multiple_failure.yaml +++ b/src/tests/ftest/erasurecode/multiple_failure.yaml @@ -25,7 +25,7 @@ server_config: storage: auto pool: size: 93% - svcn: 1 + svcn: 5 control_method: dmg container: type: POSIX From 1ed8f1da82d0dabddbfa171da634d4a89ae752e1 Mon Sep 17 00:00:00 2001 From: Cedric Koch-Hofer <94527853+knard-intel@users.noreply.github.com> Date: Fri, 19 Jan 2024 14:06:46 +0100 Subject: [PATCH 12/18] DAOS-14981 gurt: restore d_getenv_int undefined symbol (#13622) * DAOS-14981 gurt: restore d_getenv_int undefined symbol Restore missing plain function d_getenv_int() to fix missing symbol with libdaos. Signed-off-by: Cedric Koch-Hofer --- src/gurt/misc.c | 15 +++++++++++++++ src/include/gurt/common.h | 10 +++------- 2 files changed, 18 insertions(+), 7 deletions(-) diff --git a/src/gurt/misc.c b/src/gurt/misc.c index ffb1a85bb0d..3b287ca73ff 100644 --- a/src/gurt/misc.c +++ b/src/gurt/misc.c @@ -1302,6 +1302,21 @@ d_getenv_uint(const char *name, unsigned *uint_val) return -DER_SUCCESS; } +/** + * get an unsigned integer type environment variables. + * + * \param[in] name name of the environment variable. + * \param[in,out] uint_val returned value of the ENV. Will not change the original + * value if ENV is not set or set as a non-integer value. + * \return 0 on success, a negative value on error. + * \deprecated d_getenv_int() is deprecated, please use d_getenv_uint(). 
+ */ +int +d_getenv_int(const char *name, unsigned *uint_val) +{ + return d_getenv_uint(name, uint_val); +} + /** * get a 32bits unsigned integer type environment variables * diff --git a/src/include/gurt/common.h b/src/include/gurt/common.h index 779a547768b..1cf40fc3292 100644 --- a/src/include/gurt/common.h +++ b/src/include/gurt/common.h @@ -587,6 +587,9 @@ d_getenv_bool(const char *name, bool *bool_val); int d_getenv_char(const char *name, char *char_val); int +d_getenv_int(const char *name, unsigned int *uint_val) + __attribute__((deprecated("use d_getenv_uint"))); +int d_getenv_uint(const char *name, unsigned int *uint_val); int d_getenv_uint32_t(const char *name, uint32_t *uint32_val); @@ -601,13 +604,6 @@ d_unsetenv(const char *name); int d_clearenv(void); -static inline int -d_getenv_int(const char *name, unsigned int *uint_val) -{ - D_WARN("d_getenv_int() is deprecated, please use d_getenv_uint()"); - return d_getenv_uint(name, uint_val); -} - int d_write_string_buffer(struct d_string_buffer_t *buf, const char *fmt, ...); void From be4402b995e27a488295e7ab322a1a35ec19be59 Mon Sep 17 00:00:00 2001 From: wangdi Date: Fri, 19 Jan 2024 07:33:30 -0800 Subject: [PATCH 13/18] DAOS-14884 pool: set the pool server handle before update (#13618) Set the pool server handler before IV update, to make sure IV server checking accurate on the PS leader once step up finish. 
Signed-off-by: Di Wang --- src/pool/srv_internal.h | 4 --- src/pool/srv_iv.c | 57 ----------------------------------------- src/pool/srv_pool.c | 10 +++++--- 3 files changed, 7 insertions(+), 64 deletions(-) diff --git a/src/pool/srv_internal.h b/src/pool/srv_internal.h index c95f15cc715..f997d299f80 100644 --- a/src/pool/srv_internal.h +++ b/src/pool/srv_internal.h @@ -255,10 +255,6 @@ int ds_pool_iv_srv_hdl_invalidate(struct ds_pool *pool); int ds_pool_iv_conn_hdl_fetch(struct ds_pool *pool); int ds_pool_iv_conn_hdl_invalidate(struct ds_pool *pool, uuid_t hdl_uuid); -int ds_pool_iv_srv_hdl_fetch_non_sys(struct ds_pool *pool, - uuid_t *srv_cont_hdl, - uuid_t *srv_pool_hdl); - /* * srv_metrics.c */ diff --git a/src/pool/srv_iv.c b/src/pool/srv_iv.c index 92970ff3d5f..a1969c67bd6 100644 --- a/src/pool/srv_iv.c +++ b/src/pool/srv_iv.c @@ -1496,63 +1496,6 @@ ds_pool_iv_srv_hdl_fetch(struct ds_pool *pool, uuid_t *pool_hdl_uuid, return rc; } -struct srv_hdl_ult_arg { - struct ds_pool *pool; - ABT_eventual eventual; -}; - -static void -pool_iv_srv_hdl_fetch_ult(void *data) -{ - struct srv_hdl_ult_arg *arg = data; - int rc; - - rc = ds_pool_iv_srv_hdl_fetch(arg->pool, NULL, NULL); - - ABT_eventual_set(arg->eventual, (void *)&rc, sizeof(rc)); -} - -int -ds_pool_iv_srv_hdl_fetch_non_sys(struct ds_pool *pool, uuid_t *srv_cont_hdl, - uuid_t *srv_pool_hdl) -{ - struct srv_hdl_ult_arg arg; - ABT_eventual eventual; - int *status; - int rc; - - /* Fetch the capability from the leader. To avoid extra locks, - * all metadatas are maintained by xstream 0, so let's create - * an ULT on xstream 0 to let xstream 0 to handle capa fetch - * and update. 
- */ - rc = ABT_eventual_create(sizeof(*status), &eventual); - if (rc != ABT_SUCCESS) - return dss_abterr2der(rc); - - arg.pool = pool; - arg.eventual = eventual; - rc = dss_ult_create(pool_iv_srv_hdl_fetch_ult, &arg, DSS_XS_SYS, - 0, 0, NULL); - if (rc) - D_GOTO(out_eventual, rc); - - rc = ABT_eventual_wait(eventual, (void **)&status); - if (rc != ABT_SUCCESS) - D_GOTO(out_eventual, rc = dss_abterr2der(rc)); - if (*status != 0) - D_GOTO(out_eventual, rc = *status); - - if (srv_cont_hdl) - uuid_copy(*srv_cont_hdl, pool->sp_srv_cont_hdl); - if (srv_pool_hdl) - uuid_copy(*srv_pool_hdl, pool->sp_srv_pool_hdl); - -out_eventual: - ABT_eventual_free(&eventual); - return rc; -} - int ds_pool_iv_prop_update(struct ds_pool *pool, daos_prop_t *prop) { diff --git a/src/pool/srv_pool.c b/src/pool/srv_pool.c index 6396e6bea59..8f857ea1e12 100644 --- a/src/pool/srv_pool.c +++ b/src/pool/srv_pool.c @@ -1,5 +1,5 @@ /* - * (C) Copyright 2016-2023 Intel Corporation. + * (C) Copyright 2016-2024 Intel Corporation. * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -1810,6 +1810,11 @@ pool_svc_step_up_cb(struct ds_rsvc *rsvc) } else { uuid_generate(pool_hdl_uuid); uuid_generate(cont_hdl_uuid); + /* Only copy server handle to make is_from_srv() check correctly, and + * container server handle will not be copied here, otherwise + * ds_pool_iv_refresh_hdl will not open the server container handle. 
+ */ + uuid_copy(svc->ps_pool->sp_srv_pool_hdl, pool_hdl_uuid); } rc = ds_pool_iv_srv_hdl_update(svc->ps_pool, pool_hdl_uuid, @@ -4296,8 +4301,7 @@ ds_pool_query_handler(crt_rpc_t *rpc, int handler_version) metrics = svc->ps_pool->sp_metrics[DAOS_POOL_MODULE]; /* See comment above, rebuild doesn't connect the pool */ - if ((query_bits & DAOS_PO_QUERY_SPACE) && - !is_pool_from_srv(in->pqi_op.pi_uuid, in->pqi_op.pi_hdl)) { + if (query_bits & DAOS_PO_QUERY_SPACE) { rc = pool_space_query_bcast(rpc->cr_ctx, svc, in->pqi_op.pi_hdl, &out->pqo_space); if (unlikely(rc)) From 1946ef3cd4a09488747fd79d47b32fdcd58ac534 Mon Sep 17 00:00:00 2001 From: wangdi Date: Sun, 21 Jan 2024 08:21:39 -0800 Subject: [PATCH 14/18] DAOS-14969 container: retry IV might cause deadlock (#13632) OID IV entry lock might be required again for retry case. Signed-off-by: Di Wang --- src/cart/crt_iv.c | 4 ++-- src/container/oid_iv.c | 10 +++++++++- src/engine/server_iv.c | 6 +++--- 3 files changed, 14 insertions(+), 6 deletions(-) diff --git a/src/cart/crt_iv.c b/src/cart/crt_iv.c index af3226facd8..59ad504993f 100644 --- a/src/cart/crt_iv.c +++ b/src/cart/crt_iv.c @@ -3508,8 +3508,8 @@ crt_iv_update_internal(crt_iv_namespace_t ivns, uint32_t class_id, D_GOTO(exit, rc); } else { - DL_CDEBUG(rc == -DER_NONEXIST || rc == -DER_NOTLEADER, DLOG_INFO, DLOG_ERR, rc, - "ivo_on_update failed"); + DL_CDEBUG(rc == -DER_NONEXIST || rc == -DER_NOTLEADER || rc == -DER_BUSY, + DLOG_INFO, DLOG_ERR, rc, "ivo_on_update failed"); update_comp_cb(ivns, class_id, iv_key, NULL, iv_value, rc, cb_arg); diff --git a/src/container/oid_iv.c b/src/container/oid_iv.c index d1041184006..f10f5d34f7e 100644 --- a/src/container/oid_iv.c +++ b/src/container/oid_iv.c @@ -31,6 +31,7 @@ struct oid_iv_entry { struct oid_iv_range rg; /** protect the entry */ ABT_mutex lock; + void *current_req; }; /** Priv data in the iv layer */ @@ -130,7 +131,14 @@ oid_iv_ent_update(struct ds_iv_entry *ns_entry, struct ds_iv_key *iv_key, D_ASSERT(priv != 
NULL); entry = ns_entry->iv_value.sg_iovs[0].iov_buf; - ABT_mutex_lock(entry->lock); + rc = ABT_mutex_trylock(entry->lock); + /* For retry requests, from _iv_op(), the lock may not be released + * in some cases. + */ + if (rc == ABT_ERR_MUTEX_LOCKED && entry->current_req != src) + return -DER_BUSY; + + entry->current_req = src; avail = &entry->rg; oids = src->sg_iovs[0].iov_buf; diff --git a/src/engine/server_iv.c b/src/engine/server_iv.c index a7d258705a3..5f5d00722cc 100644 --- a/src/engine/server_iv.c +++ b/src/engine/server_iv.c @@ -1,5 +1,5 @@ /** - * (C) Copyright 2017-2023 Intel Corporation. + * (C) Copyright 2017-2024 Intel Corporation. * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -1053,7 +1053,7 @@ _iv_op(struct ds_iv_ns *ns, struct ds_iv_key *key, d_sg_list_t *value, retry: rc = iv_op_internal(ns, key, value, sync, shortcut, opc); if (retry && !ns->iv_stop && - (daos_rpc_retryable_rc(rc) || rc == -DER_NOTLEADER)) { + (daos_rpc_retryable_rc(rc) || rc == -DER_NOTLEADER || rc == -DER_BUSY)) { if (rc == -DER_NOTLEADER && key->rank != (d_rank_t)(-1) && sync && (sync->ivs_mode == CRT_IV_SYNC_LAZY || sync->ivs_mode == CRT_IV_SYNC_EAGER)) { @@ -1070,7 +1070,7 @@ _iv_op(struct ds_iv_ns *ns, struct ds_iv_key *key, d_sg_list_t *value, * but in-flight fetch request return IVCB_FORWARD, then queued RPC will * reply IVCB_FORWARD. */ - D_WARN("ns %u retry for class %d opc %d rank %u/%u: " DF_RC "\n", ns->iv_ns_id, + D_INFO("ns %u retry for class %d opc %d rank %u/%u: " DF_RC "\n", ns->iv_ns_id, key->class_id, opc, key->rank, ns->iv_master_rank, DP_RC(rc)); /* sleep 1sec and retry */ dss_sleep(1000); From 22687b767a8705213aa765673c2944b32e589119 Mon Sep 17 00:00:00 2001 From: Mohamad Chaarawi Date: Mon, 22 Jan 2024 09:44:34 -0600 Subject: [PATCH 15/18] DAOS-14474 dfs: remove unused --evict option from checker (#13616) The dfs fs check command already allows evicting containers using the --flags=evict option. 
The --evict option was added by mistake and does not do anything today, so remove it. Signed-off-by: Mohamad Chaarawi --- src/control/cmd/daos/filesystem.go | 1 - 1 file changed, 1 deletion(-) diff --git a/src/control/cmd/daos/filesystem.go b/src/control/cmd/daos/filesystem.go index e7fcf828a45..8baa1c93535 100644 --- a/src/control/cmd/daos/filesystem.go +++ b/src/control/cmd/daos/filesystem.go @@ -291,7 +291,6 @@ type fsCheckCmd struct { FsckFlags FsCheckFlag `long:"flags" short:"f" description:"comma-separated flags: print, remove, relink, verify, evict"` DirName string `long:"dir-name" short:"n" description:"directory name under lost+found to store leaked oids (a timestamp dir would be created if this is not specified)"` - Evict bool `long:"evict" short:"e" description:"evict all open handles on the container"` } func (cmd *fsCheckCmd) Execute(_ []string) error { From 9152ed02ad0726543e9238fb494f8e8ebb9453cd Mon Sep 17 00:00:00 2001 From: Mohamad Chaarawi Date: Mon, 22 Jan 2024 09:45:51 -0600 Subject: [PATCH 16/18] DAOS-14219 dfs: checker should not follow symlinks (#13625) The DFS checker should mark the symlink oid and not dereference the symlink value. The value can be invalid anyway, and if it is a valid path in the container, it would be reachable from the hardlink path. 
Signed-off-by: Mohamad Chaarawi --- src/client/dfs/dfs.c | 2 +- src/tests/suite/dfs_unit_test.c | 8 +++++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/src/client/dfs/dfs.c b/src/client/dfs/dfs.c index a95acd9d31b..fe0b226cb90 100644 --- a/src/client/dfs/dfs.c +++ b/src/client/dfs/dfs.c @@ -6686,7 +6686,7 @@ oit_mark_cb(dfs_t *dfs, dfs_obj_t *parent, const char name[], void *args) } /** open the entry name and get the oid */ - rc = dfs_lookup_rel(dfs, parent, name, O_RDONLY, &obj, NULL, NULL); + rc = dfs_lookup_rel(dfs, parent, name, O_RDONLY | O_NOFOLLOW, &obj, NULL, NULL); if (rc) { D_ERROR("dfs_lookup_rel() of %s failed: %d\n", name, rc); return rc; diff --git a/src/tests/suite/dfs_unit_test.c b/src/tests/suite/dfs_unit_test.c index 6c2bf8fe1bb..34d49220f58 100644 --- a/src/tests/suite/dfs_unit_test.c +++ b/src/tests/suite/dfs_unit_test.c @@ -2503,7 +2503,7 @@ dfs_test_checker(void **state) test_arg_t *arg = *state; dfs_t *dfs; int nr = 100, i; - dfs_obj_t *root, *lf; + dfs_obj_t *root, *lf, *sym; daos_obj_id_t root_oid; daos_handle_t root_oh; daos_handle_t coh; @@ -2574,6 +2574,12 @@ dfs_test_checker(void **state) assert_int_equal(rc, 0); } + /** create a symlink with a non-existent target in the container */ + rc = dfs_open(dfs, NULL, "SL1", S_IFLNK | S_IWUSR | S_IRUSR, O_RDWR | O_CREAT | O_EXCL, 0, + 0, "/usr/local", &sym); + assert_int_equal(rc, 0); + rc = dfs_release(sym); + rc = dfs_disconnect(dfs); assert_int_equal(rc, 0); /** have to call fini to release the cached container handle for the checker to work */ From 20ae230f19d756cdb219c3ac02d86e1e87e778bb Mon Sep 17 00:00:00 2001 From: Cedric Koch-Hofer <94527853+knard-intel@users.noreply.github.com> Date: Mon, 22 Jan 2024 16:47:19 +0100 Subject: [PATCH 17/18] DAOS-15036 control: Fix error messages (#13627) Fix error messages with invalid number of parameters. 
Signed-off-by: Cedric Koch-Hofer --- src/control/lib/control/pool.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/control/lib/control/pool.go b/src/control/lib/control/pool.go index 79b1efaf1e2..24327914275 100644 --- a/src/control/lib/control/pool.go +++ b/src/control/lib/control/pool.go @@ -1365,7 +1365,7 @@ func processNVMeSpaceStats(log debugLogger, filterRank filterRankFn, nvmeControl for _, smdDevice := range controller.SmdDevices { if !smdDevice.Roles.IsEmpty() && (smdDevice.Roles.OptionBits&storage.BdevRoleData) == 0 { log.Debugf("Skipping SMD device %s (rank %d, ctrlr %s) not used for storing data", - smdDevice.UUID, smdDevice.Rank, controller.PciAddr, smdDevice.Rank) + smdDevice.UUID, smdDevice.Rank, controller.PciAddr) continue } @@ -1377,7 +1377,7 @@ func processNVMeSpaceStats(log debugLogger, filterRank filterRankFn, nvmeControl if !filterRank(smdDevice.Rank) { log.Debugf("Skipping SMD device %s (rank %d, ctrlr %s) not in ranklist", - smdDevice.UUID, smdDevice.Rank, controller.PciAddr, smdDevice.Rank) + smdDevice.UUID, smdDevice.Rank, controller.PciAddr) continue } From a2356d889ec09d1fc76d43ecdf5963e12925c97d Mon Sep 17 00:00:00 2001 From: Li Wei Date: Tue, 23 Jan 2024 00:48:14 +0900 Subject: [PATCH 18/18] DAOS-14443 rdb: Improve rdb_campaign error (#13180) The recent unplanned raft update has introduced a new raft error, RAFT_ERR_MIGHT_VIOLATE_LEASE, which is mapped to the default -DER_MISC. This patch maps the new raft error to the slightly more meaningful -DER_NO_PERM, to complete the raft update. 
Signed-off-by: Li Wei --- src/rdb/rdb.c | 2 +- src/rdb/rdb_raft.c | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/rdb/rdb.c b/src/rdb/rdb.c index f5c3eb47629..a1d39c507ef 100644 --- a/src/rdb/rdb.c +++ b/src/rdb/rdb.c @@ -684,7 +684,7 @@ rdb_resign(struct rdb *db, uint64_t term) * * \param[in] db database * - * \retval -DER_INVAL not a voting replica + * \retval -DER_NO_PERM not a voting replica or might violate a lease */ int rdb_campaign(struct rdb *db) diff --git a/src/rdb/rdb_raft.c b/src/rdb/rdb_raft.c index 8be8dc6ed99..886a873729c 100644 --- a/src/rdb/rdb_raft.c +++ b/src/rdb/rdb_raft.c @@ -61,6 +61,7 @@ rdb_raft_rc(int raft_rc) case RAFT_ERR_NOMEM: return -DER_NOMEM; case RAFT_ERR_SNAPSHOT_ALREADY_LOADED: return -DER_ALREADY; case RAFT_ERR_INVALID_CFG_CHANGE: return -DER_INVAL; + case RAFT_ERR_MIGHT_VIOLATE_LEASE: return -DER_NO_PERM; default: return -DER_MISC; } } @@ -2854,7 +2855,7 @@ rdb_raft_campaign(struct rdb *db) node = raft_get_my_node(db->d_raft); if (node == NULL || !raft_node_is_voting(node)) { D_DEBUG(DB_MD, DF_DB": must be voting node\n", DP_DB(db)); - rc = -DER_INVAL; + rc = -DER_NO_PERM; goto out_mutex; }