Skip to content

Commit

Permalink
DAOS-16486 object: return proper error on stale pool map (#15064)
Browse files Browse the repository at this point in the history
A client with a stale pool map may try to send an RPC to a DOWN target. If the
target was brought DOWN due to a faulty NVMe device, the ds_pool_child could
have been stopped by the NVMe faulty reaction. Ensure that the proper error
code is returned in such a case.

Signed-off-by: Niu Yawei <[email protected]>
  • Loading branch information
NiuYawei authored Sep 6, 2024
1 parent 0e52fa5 commit 1353284
Show file tree
Hide file tree
Showing 4 changed files with 54 additions and 1 deletion.
7 changes: 7 additions & 0 deletions src/dtx/tests/srv_mock.c
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,13 @@ ds_pool_child_put(struct ds_pool_child *child)
assert_true(false);
}

/*
 * Test stub for ds_pool_child_find(): any invocation fails the running test
 * through assert_true(false). The NULL result is there to satisfy the
 * pointer return type.
 */
struct ds_pool_child *
ds_pool_child_find(const uuid_t uuid)
{
	struct ds_pool_child	*none = NULL;

	assert_true(false);
	return none;
}

struct ds_pool_child *
ds_pool_child_lookup(const uuid_t uuid)
{
Expand Down
2 changes: 2 additions & 0 deletions src/include/daos_srv/pool.h
Original file line number Diff line number Diff line change
Expand Up @@ -249,6 +249,8 @@ ds_pool_svc_ops_save(struct rdb_tx *tx, void *pool_svc, uuid_t pool_uuid, uuid_t
uint64_t cli_time, bool dup_op, int rc_in, struct ds_pool_svc_op_val *op_valp);

/*
 * Find ds_pool_child in cache, hold one reference. Returns NULL when no
 * child is found for the given pool UUID.
 */
struct ds_pool_child *ds_pool_child_find(const uuid_t uuid);
/* Find ds_pool_child in STARTING or STARTED state, hold one reference */
struct ds_pool_child *ds_pool_child_lookup(const uuid_t uuid);
/* Put the reference held by ds_pool_child_find() or ds_pool_child_lookup() */
void ds_pool_child_put(struct ds_pool_child *child);
Expand Down
31 changes: 30 additions & 1 deletion src/object/srv_obj.c
Original file line number Diff line number Diff line change
Expand Up @@ -2170,8 +2170,37 @@ obj_ioc_begin_lite(uint32_t rpc_map_ver, uuid_t pool_uuid,
int rc;

rc = obj_ioc_init(pool_uuid, coh_uuid, cont_uuid, rpc, ioc);
if (rc)
if (rc) {
DL_ERROR(rc, "Failed to initialize object I/O context.");

/*
* Client with stale pool map may try to send RPC to a DOWN target, if the
* target was brought DOWN due to faulty NVMe device, the ds_pool_child could
* have been stopped on the NVMe faulty reaction, then above obj_ioc_init()
* will fail with -DER_NO_HDL.
*
* We'd ensure proper error code is returned for such case.
*/
poc = ds_pool_child_find(pool_uuid);
if (poc == NULL) {
D_ERROR("Failed to find pool:"DF_UUID"\n", DP_UUID(pool_uuid));
return rc;
}

if (rpc_map_ver < poc->spc_pool->sp_map_version) {
D_ERROR("Stale pool map version %u < %u from client.\n",
rpc_map_ver, poc->spc_pool->sp_map_version);

/* Restart the DTX if using stale pool map */
if (opc_get(rpc->cr_opc) == DAOS_OBJ_RPC_CPD)
rc = -DER_TX_RESTART;
else
rc = -DER_STALE;
}

ds_pool_child_put(poc);
return rc;
}

poc = ioc->ioc_coc->sc_pool;
D_ASSERT(poc != NULL);
Expand Down
15 changes: 15 additions & 0 deletions src/pool/srv_target.c
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,21 @@ pool_child_lookup_noref(const uuid_t uuid)
return NULL;
}

/*
 * Look up the ds_pool_child for the given pool UUID and take one reference
 * on it for the caller.
 *
 * Returns the child with spc_ref incremented, or NULL (after logging an
 * error) when pool_child_lookup_noref() finds nothing for the UUID.
 */
struct ds_pool_child *
ds_pool_child_find(const uuid_t uuid)
{
	struct ds_pool_child	*found = pool_child_lookup_noref(uuid);

	if (found != NULL) {
		/* The lookup itself takes no reference; add the caller's one here. */
		found->spc_ref++;
	} else {
		D_ERROR(DF_UUID": Pool child isn't found.\n", DP_UUID(uuid));
	}

	return found;
}

struct ds_pool_child *
ds_pool_child_lookup(const uuid_t uuid)
{
Expand Down

0 comments on commit 1353284

Please sign in to comment.