Skip to content

Commit

Permalink
DAOS-16170 cart: do not release completed RPC reference repeatedly - b26
Browse files Browse the repository at this point in the history
For collective RPC, when handle failure cases during crt_req_send(),
its reference may has been released via crt_rpc_complete_and_unlock()
that is triggered by crt_corpc_complete(). Under such case, we should
check whether the RPC is completed or not before calling RPC_DECREF()
to avoid releasing the RPC reference repeatedly.

The patch also initializes some local variable for CHK RPC to avoid
accessing invalid DRAM when handle failed collective CHK RPC.

Some enhancement for CR test logic.

Test-tag: test_daos_cat_recov_core
Allow-unstable-test: true

Signed-off-by: Fan Yong <[email protected]>
  • Loading branch information
Nasf-Fan committed Nov 18, 2024
1 parent d1af13a commit 0df56ec
Show file tree
Hide file tree
Showing 3 changed files with 85 additions and 15 deletions.
2 changes: 1 addition & 1 deletion src/cart/crt_rpc.c
Original file line number Diff line number Diff line change
Expand Up @@ -1537,7 +1537,7 @@ crt_req_send(crt_rpc_t *req, crt_cb_t complete_cb, void *arg)
/* failure already reported through complete cb */
if (complete_cb != NULL)
rc = 0;
} else {
} else if (!crt_rpc_completed(rpc_priv)) {
RPC_DECREF(rpc_priv);
}
}
Expand Down
12 changes: 6 additions & 6 deletions src/chk/chk_rpc.c
Original file line number Diff line number Diff line change
Expand Up @@ -694,7 +694,7 @@ chk_query_remote(d_rank_list_t *rank_list, uint64_t gen, int pool_nr, uuid_t poo
int
chk_mark_remote(d_rank_list_t *rank_list, uint64_t gen, d_rank_t rank, uint32_t version)
{
crt_rpc_t *req;
crt_rpc_t *req = NULL;
struct chk_mark_in *cmi;
struct chk_mark_out *cmo;
int rc;
Expand Down Expand Up @@ -730,7 +730,7 @@ int
chk_act_remote(d_rank_list_t *rank_list, uint64_t gen, uint64_t seq, uint32_t cla,
uint32_t act, d_rank_t rank, bool for_all)
{
crt_rpc_t *req;
crt_rpc_t *req = NULL;
struct chk_act_in *cai;
struct chk_act_out *cao;
int rc;
Expand Down Expand Up @@ -825,7 +825,7 @@ int
chk_pool_start_remote(d_rank_list_t *rank_list, uint64_t gen, uuid_t uuid, uint32_t phase,
uint32_t flags)
{
crt_rpc_t *req;
crt_rpc_t *req = NULL;
struct chk_pool_start_in *cpsi;
struct chk_pool_start_out *cpso;
int rc;
Expand Down Expand Up @@ -863,7 +863,7 @@ chk_pool_mbs_remote(d_rank_t rank, uint32_t phase, uint64_t gen, uuid_t uuid, ch
uint64_t seq, uint32_t flags, uint32_t mbs_nr, struct chk_pool_mbs *mbs_array,
int *svc_rc, struct rsvc_hint *svc_hint)
{
crt_rpc_t *req;
crt_rpc_t *req = NULL;
struct chk_pool_mbs_in *cpmi;
struct chk_pool_mbs_out *cpmo;
int rc;
Expand Down Expand Up @@ -908,7 +908,7 @@ int chk_report_remote(d_rank_t leader, uint64_t gen, uint32_t cla, uint32_t act,
char *msg, uint32_t option_nr, uint32_t *options, uint32_t detail_nr,
d_sg_list_t *details, uint64_t seq)
{
crt_rpc_t *req;
crt_rpc_t *req = NULL;
struct chk_report_in *cri;
struct chk_report_out *cro;
int rc;
Expand Down Expand Up @@ -984,7 +984,7 @@ int
chk_rejoin_remote(d_rank_t leader, uint64_t gen, d_rank_t rank, uuid_t iv_uuid, uint32_t *flags,
uint32_t *pool_nr, uuid_t **pools)
{
crt_rpc_t *req;
crt_rpc_t *req = NULL;
struct chk_rejoin_in *cri;
struct chk_rejoin_out *cro;
uuid_t *tmp;
Expand Down
86 changes: 78 additions & 8 deletions src/tests/suite/daos_cr.c
Original file line number Diff line number Diff line change
Expand Up @@ -382,6 +382,78 @@ cr_locate_dcri(struct daos_check_info *dci, struct daos_check_report_info *base,
return dcri;
}

static int
cr_pool_fail_result(struct daos_check_info *dci, uuid_t uuid, uint32_t cla, uint32_t act, int *ret)
{
struct daos_check_pool_info *dcpi;
struct daos_check_report_info *dcri;
int i;

if (dci->dci_pool_nr != 1) {
print_message("CR pool count %d (pool " DF_UUID ") is not 1\n",
dci->dci_pool_nr, DP_UUID(uuid));
return -DER_INVAL;
}

dcpi = &dci->dci_pools[0];
D_ASSERTF(uuid_compare(dcpi->dcpi_uuid, uuid) == 0,
"Unmatched pool UUID (1): " DF_UUID " vs " DF_UUID "\n",
DP_UUID(dcpi->dcpi_uuid), DP_UUID(uuid));

if (!cr_pool_status_failed(dcpi->dcpi_status)) {
print_message("CR pool " DF_UUID " status %s is not failed\n",
DP_UUID(uuid), dcpi->dcpi_status);
return -DER_INVAL;
}

if (cr_pool_phase_is_done(dcpi->dcpi_phase)) {
print_message("CR pool " DF_UUID " phase should not be done\n",
DP_UUID(uuid));
return -DER_INVAL;
}

#ifdef CR_ACCURATE_QUERY_RESULT
if (dci->dci_report_nr != 1) {
print_message("CR (failed) pool " DF_UUID " has unexpected reports: %d\n",
DP_UUID(uuid), dci->dci_report_nr);
return -DER_INVAL;
}
#endif

for (i = 0; i < dci->dci_report_nr; i++) {
dcri = &dci->dci_reports[i];
if (uuid_compare(dcri->dcri_uuid, uuid) != 0) {
#ifdef CR_ACCURATE_QUERY_RESULT
print_message("Detect unrelated inconsistency report: "
DF_UUID " vs " DF_UUID "\n",
DP_UUID(dcpi->dcpi_uuid), DP_UUID(uuid));
return -DER_INVAL;
#else
continue;
#endif
}

if (dcri->dcri_class != cla) {
print_message("CR (failed) pool " DF_UUID " reports unexpected "
"inconsistency at %d: %u vs %u\n",
DP_UUID(uuid), i, dcri->dcri_class, cla);
return -DER_INVAL;
}

if (dcri->dcri_act != act) {
print_message("CR (failed) pool " DF_UUID " reports unexpected "
"solution at %d: %u vs %u\n",
DP_UUID(uuid), i, dcri->dcri_act, act);
return -DER_INVAL;
}

*ret = dcri->dcri_result;
return 0;
}

return -DER_INVAL;
}

static void
cr_dci_fini(struct daos_check_info *dci)
{
Expand Down Expand Up @@ -2953,7 +3025,7 @@ cr_engine_rejoin_fail(void **state)
uint32_t class = TCC_POOL_NONEXIST_ON_MS;
uint32_t action;
int rank = -1;
int result;
int result = 0;
int rc;
int i;

Expand Down Expand Up @@ -3006,15 +3078,13 @@ cr_engine_rejoin_fail(void **state)
rc = cr_ins_verify(&dci, TCIS_COMPLETED);
assert_rc_equal(rc, 0);

/* The check on the pool will fail as -DER_HG or -DER_TIMEDOUT. */
result = -DER_HG;
rc = cr_pool_verify(&dci, pool.pool_uuid, TCPS_FAILED, 1, &class, &action, &result);
if (rc == -DER_INVAL) {
result = -DER_TIMEDOUT;
rc = cr_pool_verify(&dci, pool.pool_uuid, TCPS_FAILED, 1, &class, &action, &result);
}
rc = cr_pool_fail_result(&dci, pool.pool_uuid, class, action, &result);
assert_rc_equal(rc, 0);

/* The check on the pool will fail because of -DER_HG or -DER_TIMEDOUT or -DER_OOG. */
D_ASSERTF(result == -DER_HG || result == -DER_TIMEDOUT || result == -DER_OOG,
"Unexpected pool " DF_UUID " fail result: %d\n", DP_UUID(pool.pool_uuid), result);

/* Reint the rank, rejoin will fail but not affect the rank start. */
rc = cr_rank_reint(rank, true);
assert_rc_equal(rc, 0);
Expand Down

0 comments on commit 0df56ec

Please sign in to comment.