daos-stack · daltonbohning · Dec 16, 2024 · Nov 8, 2024 · Nov 21, 2024 · frostedcmos
@@ -1537,7 +1537,7 @@ crt_req_send(crt_rpc_t *req, crt_cb_t complete_cb, void *arg)
 			/* failure already reported through complete cb */
 			if (complete_cb != NULL)
 				rc = 0;
-		} else {
+		} else if (!crt_rpc_completed(rpc_priv)) {
 			RPC_DECREF(rpc_priv);
 		}
 	}

diff --git a/src/chk/chk_rpc.c b/src/chk/chk_rpc.c
@@ -694,7 +694,7 @@ chk_query_remote(d_rank_list_t *rank_list, uint64_t gen, int pool_nr, uuid_t poo
 int
 chk_mark_remote(d_rank_list_t *rank_list, uint64_t gen, d_rank_t rank, uint32_t version)
 {
-	crt_rpc_t		*req;
+	crt_rpc_t		*req = NULL;
 	struct chk_mark_in	*cmi;
 	struct chk_mark_out	*cmo;
 	int			 rc;
@@ -730,7 +730,7 @@ int
 chk_act_remote(d_rank_list_t *rank_list, uint64_t gen, uint64_t seq, uint32_t cla,
 	       uint32_t act, d_rank_t rank, bool for_all)
 {
-	crt_rpc_t		*req;
+	crt_rpc_t		*req = NULL;
 	struct chk_act_in	*cai;
 	struct chk_act_out	*cao;
 	int			 rc;
@@ -825,7 +825,7 @@ int
 chk_pool_start_remote(d_rank_list_t *rank_list, uint64_t gen, uuid_t uuid, uint32_t phase,
 		      uint32_t flags)
 {
-	crt_rpc_t			*req;
+	crt_rpc_t			*req = NULL;
 	struct chk_pool_start_in	*cpsi;
 	struct chk_pool_start_out	*cpso;
 	int				 rc;
@@ -863,7 +863,7 @@ chk_pool_mbs_remote(d_rank_t rank, uint32_t phase, uint64_t gen, uuid_t uuid, ch
 		    uint64_t seq, uint32_t flags, uint32_t mbs_nr, struct chk_pool_mbs *mbs_array,
 		    int *svc_rc, struct rsvc_hint *svc_hint)
 {
-	crt_rpc_t		*req;
+	crt_rpc_t		*req = NULL;
 	struct chk_pool_mbs_in	*cpmi;
 	struct chk_pool_mbs_out	*cpmo;
 	int			 rc;
@@ -908,7 +908,7 @@ int chk_report_remote(d_rank_t leader, uint64_t gen, uint32_t cla, uint32_t act,
 		      char *msg, uint32_t option_nr, uint32_t *options, uint32_t detail_nr,
 		      d_sg_list_t *details, uint64_t seq)
 {
-	crt_rpc_t		*req;
+	crt_rpc_t		*req = NULL;
 	struct chk_report_in	*cri;
 	struct chk_report_out	*cro;
 	int			 rc;
@@ -984,7 +984,7 @@ int
 chk_rejoin_remote(d_rank_t leader, uint64_t gen, d_rank_t rank, uuid_t iv_uuid, uint32_t *flags,
 		  uint32_t *pool_nr, uuid_t **pools)
 {
-	crt_rpc_t		*req;
+	crt_rpc_t		*req = NULL;
 	struct chk_rejoin_in	*cri;
 	struct chk_rejoin_out	*cro;
 	uuid_t			*tmp;

diff --git a/src/tests/suite/daos_cr.c b/src/tests/suite/daos_cr.c
@@ -382,6 +382,78 @@ cr_locate_dcri(struct daos_check_info *dci, struct daos_check_report_info *base,
 	return dcri;
 }
 
+static int
+cr_pool_fail_result(struct daos_check_info *dci, uuid_t uuid, uint32_t cla, uint32_t act, int *ret)
+{
+	struct daos_check_pool_info	*dcpi;
+	struct daos_check_report_info	*dcri;
+	int				 i;
+
+	if (dci->dci_pool_nr != 1) {
+		print_message("CR pool count %d (pool " DF_UUID ") is not 1\n",
+			      dci->dci_pool_nr, DP_UUID(uuid));
+		return -DER_INVAL;
+	}
+
+	dcpi = &dci->dci_pools[0];
+	D_ASSERTF(uuid_compare(dcpi->dcpi_uuid, uuid) == 0,
+		  "Unmatched pool UUID (1): " DF_UUID " vs " DF_UUID "\n",
+		  DP_UUID(dcpi->dcpi_uuid), DP_UUID(uuid));
+
+	if (!cr_pool_status_failed(dcpi->dcpi_status)) {
+		print_message("CR pool " DF_UUID " status %s is not failed\n",
+			      DP_UUID(uuid), dcpi->dcpi_status);
+		return -DER_INVAL;
+	}
+
+	if (cr_pool_phase_is_done(dcpi->dcpi_phase)) {
+		print_message("CR pool " DF_UUID " phase should not be done\n",
+			      DP_UUID(uuid));
+		return -DER_INVAL;
+	}
+
+#ifdef CR_ACCURATE_QUERY_RESULT
+	if (dci->dci_report_nr != 1) {
+		print_message("CR (failed) pool " DF_UUID " has unexpected reports: %d\n",
+			      DP_UUID(uuid), dci->dci_report_nr);
+		return -DER_INVAL;
+	}
+#endif
+
+	for (i = 0; i < dci->dci_report_nr; i++) {
+		dcri = &dci->dci_reports[i];
+		if (uuid_compare(dcri->dcri_uuid, uuid) != 0) {
+#ifdef CR_ACCURATE_QUERY_RESULT
+			print_message("Detect unrelated inconsistency report: "
+				      DF_UUID " vs " DF_UUID "\n",
+				      DP_UUID(dcpi->dcpi_uuid), DP_UUID(uuid));
+			return -DER_INVAL;
+#else
+			continue;
+#endif
+		}
+
+		if (dcri->dcri_class != cla) {
+			print_message("CR (failed) pool " DF_UUID " reports unexpected "
+				      "inconsistency at %d: %u vs %u\n",
+				      DP_UUID(uuid), i, dcri->dcri_class, cla);
+			return -DER_INVAL;
+		}
+
+		if (dcri->dcri_act != act) {
+			print_message("CR (failed) pool " DF_UUID " reports unexpected "
+				      "solution at %d: %u vs %u\n",
+				      DP_UUID(uuid), i, dcri->dcri_act, act);
+			return -DER_INVAL;
+		}
+
+		*ret = dcri->dcri_result;
+		return 0;
+	}
+
+	return -DER_INVAL;
+}
+
 static void
 cr_dci_fini(struct daos_check_info *dci)
 {
@@ -2953,7 +3025,7 @@ cr_engine_rejoin_fail(void **state)
 	uint32_t			 class = TCC_POOL_NONEXIST_ON_MS;
 	uint32_t			 action;
 	int				 rank = -1;
-	int				 result;
+	int				 result = 0;
 	int				 rc;
 	int				 i;
 
@@ -3006,15 +3078,13 @@ cr_engine_rejoin_fail(void **state)
 	rc = cr_ins_verify(&dci, TCIS_COMPLETED);
 	assert_rc_equal(rc, 0);
 
-	/* The check on the pool will fail as -DER_HG or -DER_TIMEDOUT. */
-	result = -DER_HG;
-	rc = cr_pool_verify(&dci, pool.pool_uuid, TCPS_FAILED, 1, &class, &action, &result);
-	if (rc == -DER_INVAL) {
-		result = -DER_TIMEDOUT;
-		rc = cr_pool_verify(&dci, pool.pool_uuid, TCPS_FAILED, 1, &class, &action, &result);
-	}
+	rc = cr_pool_fail_result(&dci, pool.pool_uuid, class, action, &result);
 	assert_rc_equal(rc, 0);
 
+	/* The check on the pool will fail because of -DER_HG or -DER_TIMEDOUT or -DER_OOG. */
+	D_ASSERTF(result == -DER_HG || result == -DER_TIMEDOUT || result == -DER_OOG,
+		  "Unexpected pool " DF_UUID " fail result: %d\n", DP_UUID(pool.pool_uuid), result);
+
 	/* Reint the rank, rejoin will fail but not affect the rank start. */
 	rc = cr_rank_reint(rank, true);
 	assert_rc_equal(rc, 0);