From 10d47ec221f6ab72f48357056009d91021606b79 Mon Sep 17 00:00:00 2001 From: Niu Yawei Date: Fri, 6 Sep 2024 16:52:18 +0800 Subject: [PATCH] DAOS-16486 object: return proper error on stale pool map (#15064) A client with a stale pool map may try to send an RPC to a DOWN target. If the target was brought DOWN due to a faulty NVMe device, the ds_pool_child could have been stopped by the NVMe faulty reaction. Ensure a proper error code is returned in such a case. Allow-unstable-test: true Required-githooks: true Signed-off-by: Niu Yawei --- src/dtx/tests/srv_mock.c | 7 +++++++ src/include/daos_srv/pool.h | 2 ++ src/object/srv_obj.c | 31 ++++++++++++++++++++++++++++++- src/pool/srv_target.c | 15 +++++++++++++++ 4 files changed, 54 insertions(+), 1 deletion(-) diff --git a/src/dtx/tests/srv_mock.c b/src/dtx/tests/srv_mock.c index 245b3b11513..3d4ac70d773 100644 --- a/src/dtx/tests/srv_mock.c +++ b/src/dtx/tests/srv_mock.c @@ -71,6 +71,13 @@ ds_pool_child_put(struct ds_pool_child *child) assert_true(false); } +struct ds_pool_child * +ds_pool_child_find(const uuid_t uuid) +{ + assert_true(false); + return NULL; +} + struct ds_pool_child * ds_pool_child_lookup(const uuid_t uuid) { diff --git a/src/include/daos_srv/pool.h b/src/include/daos_srv/pool.h index 07ca3c0dbc1..6cbe3873f0a 100644 --- a/src/include/daos_srv/pool.h +++ b/src/include/daos_srv/pool.h @@ -249,6 +249,8 @@ ds_pool_svc_ops_save(struct rdb_tx *tx, void *pool_svc, uuid_t pool_uuid, uuid_t uint64_t cli_time, bool dup_op, int rc_in, struct ds_pool_svc_op_val *op_valp); /* Find ds_pool_child in cache, hold one reference */ +struct ds_pool_child *ds_pool_child_find(const uuid_t uuid); +/* Find ds_pool_child in STARTING or STARTED state, hold one reference */ struct ds_pool_child *ds_pool_child_lookup(const uuid_t uuid); /* Put the reference held by ds_pool_child_lookup() */ void ds_pool_child_put(struct ds_pool_child *child); diff --git a/src/object/srv_obj.c b/src/object/srv_obj.c index febd3d36ead..a51682b4785 100644 
--- a/src/object/srv_obj.c +++ b/src/object/srv_obj.c @@ -2170,8 +2170,37 @@ obj_ioc_begin_lite(uint32_t rpc_map_ver, uuid_t pool_uuid, int rc; rc = obj_ioc_init(pool_uuid, coh_uuid, cont_uuid, rpc, ioc); - if (rc) + if (rc) { + DL_ERROR(rc, "Failed to initialize object I/O context."); + + /* + * Client with stale pool map may try to send RPC to a DOWN target, if the + * target was brought DOWN due to faulty NVMe device, the ds_pool_child could + * have been stopped on the NVMe faulty reaction, then above obj_ioc_init() + * will fail with -DER_NO_HDL. + * + * We'd ensure proper error code is returned for such case. + */ + poc = ds_pool_child_find(pool_uuid); + if (poc == NULL) { + D_ERROR("Failed to find pool:"DF_UUID"\n", DP_UUID(pool_uuid)); + return rc; + } + + if (rpc_map_ver < poc->spc_pool->sp_map_version) { + D_ERROR("Stale pool map version %u < %u from client.\n", + rpc_map_ver, poc->spc_pool->sp_map_version); + + /* Restart the DTX if using stale pool map */ + if (opc_get(rpc->cr_opc) == DAOS_OBJ_RPC_CPD) + rc = -DER_TX_RESTART; + else + rc = -DER_STALE; + } + + ds_pool_child_put(poc); return rc; + } poc = ioc->ioc_coc->sc_pool; D_ASSERT(poc != NULL); diff --git a/src/pool/srv_target.c b/src/pool/srv_target.c index 0b195216cf5..cfa837e8b2a 100644 --- a/src/pool/srv_target.c +++ b/src/pool/srv_target.c @@ -87,6 +87,21 @@ pool_child_lookup_noref(const uuid_t uuid) return NULL; } +struct ds_pool_child * +ds_pool_child_find(const uuid_t uuid) +{ + struct ds_pool_child *child; + + child = pool_child_lookup_noref(uuid); + if (child == NULL) { + D_ERROR(DF_UUID": Pool child isn't found.\n", DP_UUID(uuid)); + return child; + } + + child->spc_ref++; + return child; +} + struct ds_pool_child * ds_pool_child_lookup(const uuid_t uuid) {