From 0df56ecd2f0a7945019cda8030570ab17a0ba1f8 Mon Sep 17 00:00:00 2001
From: Fan Yong <fan.yong@intel.com>
Date: Sat, 9 Nov 2024 00:38:57 +0800
Subject: [PATCH] DAOS-16170 cart: do not release completed RPC reference
 repeatedly - b26

For collective RPC, when handle failure cases during crt_req_send(),
its reference may has been released via crt_rpc_complete_and_unlock()
that is triggered by crt_corpc_complete(). Under such case, we should
check whether the RPC is completed or not before calling RPC_DECREF()
to avoid releasing the RPC reference repeatedly.

The patch also initializes some local variable for CHK RPC to avoid
accessing invalid DRAM when handle failed collective CHK RPC.

Some enhancement for CR test logic.

Test-tag: test_daos_cat_recov_core
Allow-unstable-test: true

Signed-off-by: Fan Yong <fan.yong@intel.com>
---
 src/cart/crt_rpc.c        |  2 +-
 src/chk/chk_rpc.c         | 12 +++---
 src/tests/suite/daos_cr.c | 86 +++++++++++++++++++++++++++++++++++----
 3 files changed, 85 insertions(+), 15 deletions(-)

diff --git a/src/cart/crt_rpc.c b/src/cart/crt_rpc.c
index ead03d1bf29..c1011bfbd3a 100644
--- a/src/cart/crt_rpc.c
+++ b/src/cart/crt_rpc.c
@@ -1537,7 +1537,7 @@ crt_req_send(crt_rpc_t *req, crt_cb_t complete_cb, void *arg)
 			/* failure already reported through complete cb */
 			if (complete_cb != NULL)
 				rc = 0;
-		} else {
+		} else if (!crt_rpc_completed(rpc_priv)) {
 			RPC_DECREF(rpc_priv);
 		}
 	}
diff --git a/src/chk/chk_rpc.c b/src/chk/chk_rpc.c
index 84e6ae33857..e340d482cec 100644
--- a/src/chk/chk_rpc.c
+++ b/src/chk/chk_rpc.c
@@ -694,7 +694,7 @@ chk_query_remote(d_rank_list_t *rank_list, uint64_t gen, int pool_nr, uuid_t poo
 int
 chk_mark_remote(d_rank_list_t *rank_list, uint64_t gen, d_rank_t rank, uint32_t version)
 {
-	crt_rpc_t		*req;
+	crt_rpc_t		*req = NULL;
 	struct chk_mark_in	*cmi;
 	struct chk_mark_out	*cmo;
 	int			 rc;
@@ -730,7 +730,7 @@ int
 chk_act_remote(d_rank_list_t *rank_list, uint64_t gen, uint64_t seq, uint32_t cla,
 	       uint32_t act, d_rank_t rank, bool for_all)
 {
-	crt_rpc_t		*req;
+	crt_rpc_t		*req = NULL;
 	struct chk_act_in	*cai;
 	struct chk_act_out	*cao;
 	int			 rc;
@@ -825,7 +825,7 @@ int
 chk_pool_start_remote(d_rank_list_t *rank_list, uint64_t gen, uuid_t uuid, uint32_t phase,
 		      uint32_t flags)
 {
-	crt_rpc_t			*req;
+	crt_rpc_t			*req = NULL;
 	struct chk_pool_start_in	*cpsi;
 	struct chk_pool_start_out	*cpso;
 	int				 rc;
@@ -863,7 +863,7 @@ chk_pool_mbs_remote(d_rank_t rank, uint32_t phase, uint64_t gen, uuid_t uuid, ch
 		    uint64_t seq, uint32_t flags, uint32_t mbs_nr, struct chk_pool_mbs *mbs_array,
 		    int *svc_rc, struct rsvc_hint *svc_hint)
 {
-	crt_rpc_t		*req;
+	crt_rpc_t		*req = NULL;
 	struct chk_pool_mbs_in	*cpmi;
 	struct chk_pool_mbs_out	*cpmo;
 	int			 rc;
@@ -908,7 +908,7 @@ int chk_report_remote(d_rank_t leader, uint64_t gen, uint32_t cla, uint32_t act,
 		      char *msg, uint32_t option_nr, uint32_t *options, uint32_t detail_nr,
 		      d_sg_list_t *details, uint64_t seq)
 {
-	crt_rpc_t		*req;
+	crt_rpc_t		*req = NULL;
 	struct chk_report_in	*cri;
 	struct chk_report_out	*cro;
 	int			 rc;
@@ -984,7 +984,7 @@ int
 chk_rejoin_remote(d_rank_t leader, uint64_t gen, d_rank_t rank, uuid_t iv_uuid, uint32_t *flags,
 		  uint32_t *pool_nr, uuid_t **pools)
 {
-	crt_rpc_t		*req;
+	crt_rpc_t		*req = NULL;
 	struct chk_rejoin_in	*cri;
 	struct chk_rejoin_out	*cro;
 	uuid_t			*tmp;
diff --git a/src/tests/suite/daos_cr.c b/src/tests/suite/daos_cr.c
index 4c981f29645..85ade4754de 100644
--- a/src/tests/suite/daos_cr.c
+++ b/src/tests/suite/daos_cr.c
@@ -382,6 +382,78 @@ cr_locate_dcri(struct daos_check_info *dci, struct daos_check_report_info *base,
 	return dcri;
 }
 
+static int
+cr_pool_fail_result(struct daos_check_info *dci, uuid_t uuid, uint32_t cla, uint32_t act, int *ret)
+{
+	struct daos_check_pool_info	*dcpi;
+	struct daos_check_report_info	*dcri;
+	int				 i;
+
+	if (dci->dci_pool_nr != 1) {
+		print_message("CR pool count %d (pool " DF_UUID ") is not 1\n",
+			      dci->dci_pool_nr, DP_UUID(uuid));
+		return -DER_INVAL;
+	}
+
+	dcpi = &dci->dci_pools[0];
+	D_ASSERTF(uuid_compare(dcpi->dcpi_uuid, uuid) == 0,
+		  "Unmatched pool UUID (1): " DF_UUID " vs " DF_UUID "\n",
+		  DP_UUID(dcpi->dcpi_uuid), DP_UUID(uuid));
+
+	if (!cr_pool_status_failed(dcpi->dcpi_status)) {
+		print_message("CR pool " DF_UUID " status %s is not failed\n",
+			      DP_UUID(uuid), dcpi->dcpi_status);
+		return -DER_INVAL;
+	}
+
+	if (cr_pool_phase_is_done(dcpi->dcpi_phase)) {
+		print_message("CR pool " DF_UUID " phase should not be done\n",
+			      DP_UUID(uuid));
+		return -DER_INVAL;
+	}
+
+#ifdef CR_ACCURATE_QUERY_RESULT
+	if (dci->dci_report_nr != 1) {
+		print_message("CR (failed) pool " DF_UUID " has unexpected reports: %d\n",
+			      DP_UUID(uuid), dci->dci_report_nr);
+		return -DER_INVAL;
+	}
+#endif
+
+	for (i = 0; i < dci->dci_report_nr; i++) {
+		dcri = &dci->dci_reports[i];
+		if (uuid_compare(dcri->dcri_uuid, uuid) != 0) {
+#ifdef CR_ACCURATE_QUERY_RESULT
+			print_message("Detect unrelated inconsistency report: "
+				      DF_UUID " vs " DF_UUID "\n",
+				      DP_UUID(dcpi->dcpi_uuid), DP_UUID(uuid));
+			return -DER_INVAL;
+#else
+			continue;
+#endif
+		}
+
+		if (dcri->dcri_class != cla) {
+			print_message("CR (failed) pool " DF_UUID " reports unexpected "
+				      "inconsistency at %d: %u vs %u\n",
+				      DP_UUID(uuid), i, dcri->dcri_class, cla);
+			return -DER_INVAL;
+		}
+
+		if (dcri->dcri_act != act) {
+			print_message("CR (failed) pool " DF_UUID " reports unexpected "
+				      "solution at %d: %u vs %u\n",
+				      DP_UUID(uuid), i, dcri->dcri_act, act);
+			return -DER_INVAL;
+		}
+
+		*ret = dcri->dcri_result;
+		return 0;
+	}
+
+	return -DER_INVAL;
+}
+
 static void
 cr_dci_fini(struct daos_check_info *dci)
 {
@@ -2953,7 +3025,7 @@ cr_engine_rejoin_fail(void **state)
 	uint32_t			 class = TCC_POOL_NONEXIST_ON_MS;
 	uint32_t			 action;
 	int				 rank = -1;
-	int				 result;
+	int				 result = 0;
 	int				 rc;
 	int				 i;
 
@@ -3006,15 +3078,13 @@ cr_engine_rejoin_fail(void **state)
 	rc = cr_ins_verify(&dci, TCIS_COMPLETED);
 	assert_rc_equal(rc, 0);
 
-	/* The check on the pool will fail as -DER_HG or -DER_TIMEDOUT. */
-	result = -DER_HG;
-	rc = cr_pool_verify(&dci, pool.pool_uuid, TCPS_FAILED, 1, &class, &action, &result);
-	if (rc == -DER_INVAL) {
-		result = -DER_TIMEDOUT;
-		rc = cr_pool_verify(&dci, pool.pool_uuid, TCPS_FAILED, 1, &class, &action, &result);
-	}
+	rc = cr_pool_fail_result(&dci, pool.pool_uuid, class, action, &result);
 	assert_rc_equal(rc, 0);
 
+	/* The check on the pool will fail because of -DER_HG or -DER_TIMEDOUT or -DER_OOG. */
+	D_ASSERTF(result == -DER_HG || result == -DER_TIMEDOUT || result == -DER_OOG,
+		  "Unexpected pool " DF_UUID " fail result: %d\n", DP_UUID(pool.pool_uuid), result);
+
 	/* Reint the rank, rejoin will fail but not affect the rank start. */
 	rc = cr_rank_reint(rank, true);
 	assert_rc_equal(rc, 0);