Skip to content

Commit

Permalink
DAOS-16170 cart: do not release completed RPC reference repeatedly (#…
Browse files Browse the repository at this point in the history
…15476)

For collective RPC, when handle failure cases during crt_req_send(),
its reference may has been released via crt_rpc_complete_and_unlock()
that is triggered by crt_corpc_complete(). Under such case, we should
check whether the RPC is completed or not before calling RPC_DECREF()
to avoid releasing the RPC reference repeatedly.

The patch also initializes some local variable for CHK RPC to avoid
accessing invalid DRAM when handle failed collective CHK RPC.

Signed-off-by: Fan Yong <[email protected]>
  • Loading branch information
Nasf-Fan authored Dec 6, 2024
1 parent 647b216 commit ab5815d
Show file tree
Hide file tree
Showing 3 changed files with 85 additions and 15 deletions.
2 changes: 1 addition & 1 deletion src/cart/crt_rpc.c
Original file line number Diff line number Diff line change
Expand Up @@ -1532,7 +1532,7 @@ crt_req_send(crt_rpc_t *req, crt_cb_t complete_cb, void *arg)
/* failure already reported through complete cb */
if (complete_cb != NULL)
rc = 0;
} else {
} else if (!crt_rpc_completed(rpc_priv)) {
RPC_DECREF(rpc_priv);
}
}
Expand Down
12 changes: 6 additions & 6 deletions src/chk/chk_rpc.c
Original file line number Diff line number Diff line change
Expand Up @@ -694,7 +694,7 @@ chk_query_remote(d_rank_list_t *rank_list, uint64_t gen, int pool_nr, uuid_t poo
int
chk_mark_remote(d_rank_list_t *rank_list, uint64_t gen, d_rank_t rank, uint32_t version)
{
crt_rpc_t *req;
crt_rpc_t *req = NULL;
struct chk_mark_in *cmi;
struct chk_mark_out *cmo;
int rc;
Expand Down Expand Up @@ -730,7 +730,7 @@ int
chk_act_remote(d_rank_list_t *rank_list, uint64_t gen, uint64_t seq, uint32_t cla,
uint32_t act, d_rank_t rank, bool for_all)
{
crt_rpc_t *req;
crt_rpc_t *req = NULL;
struct chk_act_in *cai;
struct chk_act_out *cao;
int rc;
Expand Down Expand Up @@ -825,7 +825,7 @@ int
chk_pool_start_remote(d_rank_list_t *rank_list, uint64_t gen, uuid_t uuid, uint32_t phase,
uint32_t flags)
{
crt_rpc_t *req;
crt_rpc_t *req = NULL;
struct chk_pool_start_in *cpsi;
struct chk_pool_start_out *cpso;
int rc;
Expand Down Expand Up @@ -863,7 +863,7 @@ chk_pool_mbs_remote(d_rank_t rank, uint32_t phase, uint64_t gen, uuid_t uuid, ch
uint64_t seq, uint32_t flags, uint32_t mbs_nr, struct chk_pool_mbs *mbs_array,
int *svc_rc, struct rsvc_hint *svc_hint)
{
crt_rpc_t *req;
crt_rpc_t *req = NULL;
struct chk_pool_mbs_in *cpmi;
struct chk_pool_mbs_out *cpmo;
int rc;
Expand Down Expand Up @@ -908,7 +908,7 @@ int chk_report_remote(d_rank_t leader, uint64_t gen, uint32_t cla, uint32_t act,
char *msg, uint32_t option_nr, uint32_t *options, uint32_t detail_nr,
d_sg_list_t *details, uint64_t seq)
{
crt_rpc_t *req;
crt_rpc_t *req = NULL;
struct chk_report_in *cri;
struct chk_report_out *cro;
int rc;
Expand Down Expand Up @@ -984,7 +984,7 @@ int
chk_rejoin_remote(d_rank_t leader, uint64_t gen, d_rank_t rank, uuid_t iv_uuid, uint32_t *flags,
uint32_t *pool_nr, uuid_t **pools)
{
crt_rpc_t *req;
crt_rpc_t *req = NULL;
struct chk_rejoin_in *cri;
struct chk_rejoin_out *cro;
uuid_t *tmp;
Expand Down
86 changes: 78 additions & 8 deletions src/tests/suite/daos_cr.c
Original file line number Diff line number Diff line change
Expand Up @@ -382,6 +382,78 @@ cr_locate_dcri(struct daos_check_info *dci, struct daos_check_report_info *base,
return dcri;
}

static int
cr_pool_fail_result(struct daos_check_info *dci, uuid_t uuid, uint32_t cla, uint32_t act, int *ret)
{
struct daos_check_pool_info *dcpi;
struct daos_check_report_info *dcri;
int i;

if (dci->dci_pool_nr != 1) {
print_message("CR pool count %d (pool " DF_UUID ") is not 1\n",
dci->dci_pool_nr, DP_UUID(uuid));
return -DER_INVAL;
}

dcpi = &dci->dci_pools[0];
D_ASSERTF(uuid_compare(dcpi->dcpi_uuid, uuid) == 0,
"Unmatched pool UUID (1): " DF_UUID " vs " DF_UUID "\n",
DP_UUID(dcpi->dcpi_uuid), DP_UUID(uuid));

if (!cr_pool_status_failed(dcpi->dcpi_status)) {
print_message("CR pool " DF_UUID " status %s is not failed\n",
DP_UUID(uuid), dcpi->dcpi_status);
return -DER_INVAL;
}

if (cr_pool_phase_is_done(dcpi->dcpi_phase)) {
print_message("CR pool " DF_UUID " phase should not be done\n",
DP_UUID(uuid));
return -DER_INVAL;
}

#ifdef CR_ACCURATE_QUERY_RESULT
if (dci->dci_report_nr != 1) {
print_message("CR (failed) pool " DF_UUID " has unexpected reports: %d\n",
DP_UUID(uuid), dci->dci_report_nr);
return -DER_INVAL;
}
#endif

for (i = 0; i < dci->dci_report_nr; i++) {
dcri = &dci->dci_reports[i];
if (uuid_compare(dcri->dcri_uuid, uuid) != 0) {
#ifdef CR_ACCURATE_QUERY_RESULT
print_message("Detect unrelated inconsistency report: "
DF_UUID " vs " DF_UUID "\n",
DP_UUID(dcpi->dcpi_uuid), DP_UUID(uuid));
return -DER_INVAL;
#else
continue;
#endif
}

if (dcri->dcri_class != cla) {
print_message("CR (failed) pool " DF_UUID " reports unexpected "
"inconsistency at %d: %u vs %u\n",
DP_UUID(uuid), i, dcri->dcri_class, cla);
return -DER_INVAL;
}

if (dcri->dcri_act != act) {
print_message("CR (failed) pool " DF_UUID " reports unexpected "
"solution at %d: %u vs %u\n",
DP_UUID(uuid), i, dcri->dcri_act, act);
return -DER_INVAL;
}

*ret = dcri->dcri_result;
return 0;
}

return -DER_INVAL;
}

static void
cr_dci_fini(struct daos_check_info *dci)
{
Expand Down Expand Up @@ -2953,7 +3025,7 @@ cr_engine_rejoin_fail(void **state)
uint32_t class = TCC_POOL_NONEXIST_ON_MS;
uint32_t action;
int rank = -1;
int result;
int result = 0;
int rc;
int i;

Expand Down Expand Up @@ -3006,15 +3078,13 @@ cr_engine_rejoin_fail(void **state)
rc = cr_ins_verify(&dci, TCIS_COMPLETED);
assert_rc_equal(rc, 0);

/* The check on the pool will fail as -DER_HG or -DER_TIMEDOUT. */
result = -DER_HG;
rc = cr_pool_verify(&dci, pool.pool_uuid, TCPS_FAILED, 1, &class, &action, &result);
if (rc == -DER_INVAL) {
result = -DER_TIMEDOUT;
rc = cr_pool_verify(&dci, pool.pool_uuid, TCPS_FAILED, 1, &class, &action, &result);
}
rc = cr_pool_fail_result(&dci, pool.pool_uuid, class, action, &result);
assert_rc_equal(rc, 0);

/* The check on the pool will fail because of -DER_HG or -DER_TIMEDOUT or -DER_OOG. */
D_ASSERTF(result == -DER_HG || result == -DER_TIMEDOUT || result == -DER_OOG,
"Unexpected pool " DF_UUID " fail result: %d\n", DP_UUID(pool.pool_uuid), result);

/* Reint the rank, rejoin will fail but not affect the rank start. */
rc = cr_rank_reint(rank, true);
assert_rc_equal(rc, 0);
Expand Down

0 comments on commit ab5815d

Please sign in to comment.