diff --git a/src/chk/chk_engine.c b/src/chk/chk_engine.c index 17df99b13611..0b83aaeb8f6e 100644 --- a/src/chk/chk_engine.c +++ b/src/chk/chk_engine.c @@ -171,8 +171,6 @@ chk_engine_exit(struct chk_instance *ins, uint32_t ins_phase, uint32_t ins_statu struct chk_iv iv = { 0 }; int rc; - ins->ci_sched_exiting = 1; - while ((cpr = d_list_pop_entry(&ins->ci_pool_shutdown_list, struct chk_pool_rec, cpr_shutdown_link)) != NULL) { chk_pool_shutdown(cpr, false); @@ -199,7 +197,7 @@ chk_engine_exit(struct chk_instance *ins, uint32_t ins_phase, uint32_t ins_statu ins_status != CHK__CHECK_INST_STATUS__CIS_STOPPED && ins_status != CHK__CHECK_INST_STATUS__CIS_IMPLICATED && ins->ci_iv_ns != NULL) { if (DAOS_FAIL_CHECK(DAOS_CHK_PS_NOTIFY_LEADER)) - goto out; + return; iv.ci_gen = cbk->cb_gen; iv.ci_phase = cbk->cb_phase; @@ -213,9 +211,6 @@ chk_engine_exit(struct chk_instance *ins, uint32_t ins_phase, uint32_t ins_statu DF_ENGINE" on rank %u notify leader for its exit, status %u: rc = %d\n", DP_ENGINE(ins), dss_self_rank(), ins_status, rc); } - -out: - ins->ci_sched_exiting = 0; } static int @@ -1847,11 +1842,11 @@ chk_engine_sched(void *args) D_INFO(DF_ENGINE" scheduler on rank %u entry at phase %u\n", DP_ENGINE(ins), myrank, cbk->cb_phase); - while (ins->ci_sched_running) { + while (!ins->ci_sched_exiting) { dss_sleep(300); /* Someone wants to stop the check. */ - if (!ins->ci_sched_running) + if (ins->ci_sched_exiting) D_GOTO(out, rc = 0); ins_phase = chk_pools_find_slowest(ins, &done); @@ -1931,6 +1926,7 @@ chk_engine_sched(void *args) D_INFO(DF_ENGINE" scheduler on rank %u exit at phase %u with status %u: rc %d\n", DP_ENGINE(ins), myrank, cbk->cb_phase, ins_status, rc); + ins->ci_sched_exiting = 0; ins->ci_sched_running = 0; } @@ -2295,7 +2291,7 @@ chk_engine_stop(uint64_t gen, int pool_nr, uuid_t pools[], uint32_t *flags) if (ins->ci_starting) D_GOTO(log, rc = -DER_BUSY); - if (ins->ci_stopping) + if (ins->ci_stopping || ins->ci_sched_exiting) D_GOTO(log, rc = -DER_INPROGRESS); if (cbk->cb_ins_status != CHK__CHECK_INST_STATUS__CIS_RUNNING) @@ -3153,7 +3149,7 @@ chk_engine_report(struct chk_report_unit *cru, uint64_t *seq, int *decision) goto out; } - if (!ins->ci_sched_running || cpr->cpr_exiting) { + if (!ins->ci_sched_running || ins->ci_sched_exiting || cpr->cpr_exiting) { rc = 1; ABT_mutex_unlock(cpr->cpr_mutex); goto out; diff --git a/src/chk/chk_internal.h b/src/chk/chk_internal.h index 56c2a3ee1b9b..2b605d7ba3d3 100644 --- a/src/chk/chk_internal.h +++ b/src/chk/chk_internal.h @@ -1150,43 +1150,37 @@ chk_dup_string(char **tgt, const char *src, size_t len) static inline void chk_stop_sched(struct chk_instance *ins) { + uint64_t gen = ins->ci_bk.cb_gen; + ABT_mutex_lock(ins->ci_abt_mutex); - if (ins->ci_sched != ABT_THREAD_NULL && ins->ci_sched_running) { - ins->ci_sched_running = 0; + if (ins->ci_sched_running && !ins->ci_sched_exiting) { + D_INFO("Stopping %s instance on rank %u with gen "DF_U64"\n", + ins->ci_is_leader ? "leader" : "engine", dss_self_rank(), gen); + ins->ci_sched_exiting = 1; ABT_cond_broadcast(ins->ci_abt_cond); - ABT_mutex_unlock(ins->ci_abt_mutex); - ABT_thread_free(&ins->ci_sched); - } else { - ABT_mutex_unlock(ins->ci_abt_mutex); } + + /* Check the ci_bk.cb_gen for the case of others restarted the checker during the wait. */ + while (ins->ci_sched_running && gen == ins->ci_bk.cb_gen) + ABT_cond_wait(ins->ci_abt_cond, ins->ci_abt_mutex); + ABT_mutex_unlock(ins->ci_abt_mutex); } static inline int chk_ins_can_start(struct chk_instance *ins) { - struct chk_bookmark *cbk = &ins->ci_bk; - if (unlikely(!ins->ci_inited)) return -DER_AGAIN; if (ins->ci_starting) return -DER_INPROGRESS; - if (ins->ci_stopping) + if (ins->ci_stopping || ins->ci_sched_exiting) return -DER_BUSY; if (ins->ci_sched_running) return -DER_ALREADY; - /* - * If ci_sched_running is zero but check instance is still running, - * then someone is trying to stop it. - */ - if (((ins->ci_is_leader && cbk->cb_magic == CHK_BK_MAGIC_LEADER) || - (!ins->ci_is_leader && cbk->cb_magic == CHK_BK_MAGIC_ENGINE)) && - cbk->cb_ins_status == CHK__CHECK_INST_STATUS__CIS_RUNNING) - return -DER_BUSY; - return 0; } diff --git a/src/chk/chk_leader.c b/src/chk/chk_leader.c index 7d6d5d54cf14..33a63392ac17 100644 --- a/src/chk/chk_leader.c +++ b/src/chk/chk_leader.c @@ -229,8 +229,6 @@ chk_leader_exit(struct chk_instance *ins, uint32_t ins_phase, uint32_t ins_statu struct chk_iv iv = { 0 }; int rc = 0; - ins->ci_sched_exiting = 1; - D_ASSERT(d_list_empty(&ins->ci_pool_shutdown_list)); chk_pool_stop_all(ins, pool_status, NULL); @@ -262,8 +260,6 @@ chk_leader_exit(struct chk_instance *ins, uint32_t ins_phase, uint32_t ins_statu D_ERROR(DF_LEADER" exit with status %u: "DF_RC"\n", DP_LEADER(ins), ins_status, DP_RC(rc)); } - - ins->ci_sched_exiting = 0; } static void @@ -1306,7 +1302,7 @@ chk_leader_need_stop(struct chk_instance *ins, int *ret) } } - if (!ins->ci_sched_running) { + if (!ins->ci_sched_running || ins->ci_sched_exiting) { *ret = 0; return true; } @@ -1924,7 +1920,7 @@ chk_leader_pool_mbs_one(struct chk_pool_rec *cpr) if (rc1 == RSVC_CLIENT_RECHOOSE || (rc1 == RSVC_CLIENT_PROCEED && daos_rpc_retryable_rc(rc))) { dss_sleep(interval); - if (cpr->cpr_stop || !ins->ci_sched_running) { + if (cpr->cpr_stop || !ins->ci_sched_running || ins->ci_sched_exiting) { notify = false; D_GOTO(out_client, rc = 0); } @@ -2158,7 +2154,7 @@ chk_leader_sched(void *args) ABT_mutex_lock(ins->ci_abt_mutex); again: - if (!ins->ci_sched_running) { + if (ins->ci_sched_exiting) { ABT_mutex_unlock(ins->ci_abt_mutex); D_GOTO(out, rc = 0); } @@ -2297,6 +2293,7 @@ chk_leader_sched(void *args) D_INFO(DF_LEADER" scheduler exit at phase %u with status %u: rc %d\n", DP_LEADER(ins), cbk->cb_phase, ins_status, rc); + ins->ci_sched_exiting = 0; ins->ci_sched_running = 0; } @@ -3032,7 +3029,7 @@ chk_leader_stop(int pool_nr, uuid_t pools[]) if (ins->ci_starting) D_GOTO(log, rc = -DER_BUSY); - if (ins->ci_stopping) + if (ins->ci_stopping || ins->ci_sched_exiting) D_GOTO(log, rc = -DER_INPROGRESS); /* @@ -3613,7 +3610,7 @@ chk_leader_report(struct chk_report_unit *cru, uint64_t *seq, int *decision) goto out; } - if (!ins->ci_sched_running || cpr->cpr_exiting) { + if (!ins->ci_sched_running || ins->ci_sched_exiting || cpr->cpr_exiting) { rc = 1; ABT_mutex_unlock(cpr->cpr_mutex); goto out;