Skip to content

Commit

Permalink
DAOS-14467 chk: properly stop check instance
Browse files Browse the repository at this point in the history
When someone wants to stop the current check instance, it needs to set
ins->ci_sched_exiting to notify the related instance scheduler to exit.

Originally, we used "ci_sched_running" for this purpose. But that made it
confusing to distinguish whether the scheduler had already exited or
someone was still stopping the scheduler. Others might wrongly conclude that
the related check scheduler had already exited while it was actually still
in the stopping process, so that a subsequent checker restart would fail.

Signed-off-by: Fan Yong <[email protected]>
  • Loading branch information
Nasf-Fan committed Oct 15, 2023
1 parent 5bbd742 commit c61f30d
Show file tree
Hide file tree
Showing 3 changed files with 24 additions and 37 deletions.
16 changes: 6 additions & 10 deletions src/chk/chk_engine.c
Original file line number Diff line number Diff line change
Expand Up @@ -171,8 +171,6 @@ chk_engine_exit(struct chk_instance *ins, uint32_t ins_phase, uint32_t ins_statu
struct chk_iv iv = { 0 };
int rc;

ins->ci_sched_exiting = 1;

while ((cpr = d_list_pop_entry(&ins->ci_pool_shutdown_list, struct chk_pool_rec,
cpr_shutdown_link)) != NULL) {
chk_pool_shutdown(cpr, false);
Expand All @@ -199,7 +197,7 @@ chk_engine_exit(struct chk_instance *ins, uint32_t ins_phase, uint32_t ins_statu
ins_status != CHK__CHECK_INST_STATUS__CIS_STOPPED &&
ins_status != CHK__CHECK_INST_STATUS__CIS_IMPLICATED && ins->ci_iv_ns != NULL) {
if (DAOS_FAIL_CHECK(DAOS_CHK_PS_NOTIFY_LEADER))
goto out;
return;

iv.ci_gen = cbk->cb_gen;
iv.ci_phase = cbk->cb_phase;
Expand All @@ -213,9 +211,6 @@ chk_engine_exit(struct chk_instance *ins, uint32_t ins_phase, uint32_t ins_statu
DF_ENGINE" on rank %u notify leader for its exit, status %u: rc = %d\n",
DP_ENGINE(ins), dss_self_rank(), ins_status, rc);
}

out:
ins->ci_sched_exiting = 0;
}

static int
Expand Down Expand Up @@ -1847,11 +1842,11 @@ chk_engine_sched(void *args)
D_INFO(DF_ENGINE" scheduler on rank %u entry at phase %u\n",
DP_ENGINE(ins), myrank, cbk->cb_phase);

while (ins->ci_sched_running) {
while (!ins->ci_sched_exiting) {
dss_sleep(300);

/* Someone wants to stop the check. */
if (!ins->ci_sched_running)
if (ins->ci_sched_exiting)
D_GOTO(out, rc = 0);

ins_phase = chk_pools_find_slowest(ins, &done);
Expand Down Expand Up @@ -1931,6 +1926,7 @@ chk_engine_sched(void *args)
D_INFO(DF_ENGINE" scheduler on rank %u exit at phase %u with status %u: rc %d\n",
DP_ENGINE(ins), myrank, cbk->cb_phase, ins_status, rc);

ins->ci_sched_exiting = 0;
ins->ci_sched_running = 0;
}

Expand Down Expand Up @@ -2295,7 +2291,7 @@ chk_engine_stop(uint64_t gen, int pool_nr, uuid_t pools[], uint32_t *flags)
if (ins->ci_starting)
D_GOTO(log, rc = -DER_BUSY);

if (ins->ci_stopping)
if (ins->ci_stopping || ins->ci_sched_exiting)
D_GOTO(log, rc = -DER_INPROGRESS);

if (cbk->cb_ins_status != CHK__CHECK_INST_STATUS__CIS_RUNNING)
Expand Down Expand Up @@ -3153,7 +3149,7 @@ chk_engine_report(struct chk_report_unit *cru, uint64_t *seq, int *decision)
goto out;
}

if (!ins->ci_sched_running || cpr->cpr_exiting) {
if (!ins->ci_sched_running || ins->ci_sched_exiting || cpr->cpr_exiting) {
rc = 1;
ABT_mutex_unlock(cpr->cpr_mutex);
goto out;
Expand Down
30 changes: 12 additions & 18 deletions src/chk/chk_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -1150,43 +1150,37 @@ chk_dup_string(char **tgt, const char *src, size_t len)
static inline void
chk_stop_sched(struct chk_instance *ins)
{
uint64_t gen = ins->ci_bk.cb_gen;

ABT_mutex_lock(ins->ci_abt_mutex);
if (ins->ci_sched != ABT_THREAD_NULL && ins->ci_sched_running) {
ins->ci_sched_running = 0;
if (ins->ci_sched_running && !ins->ci_sched_exiting) {
D_INFO("Stopping %s instance on rank %u with gen "DF_U64"\n",
ins->ci_is_leader ? "leader" : "engine", dss_self_rank(), gen);
ins->ci_sched_exiting = 1;
ABT_cond_broadcast(ins->ci_abt_cond);
ABT_mutex_unlock(ins->ci_abt_mutex);
ABT_thread_free(&ins->ci_sched);
} else {
ABT_mutex_unlock(ins->ci_abt_mutex);
}

/* Check the ci_bk.cb_gen for the case of others restarted the checker during the wait. */
while (ins->ci_sched_running && gen == ins->ci_bk.cb_gen)
ABT_cond_wait(ins->ci_abt_cond, ins->ci_abt_mutex);
ABT_mutex_unlock(ins->ci_abt_mutex);
}

static inline int
chk_ins_can_start(struct chk_instance *ins)
{
struct chk_bookmark *cbk = &ins->ci_bk;

if (unlikely(!ins->ci_inited))
return -DER_AGAIN;

if (ins->ci_starting)
return -DER_INPROGRESS;

if (ins->ci_stopping)
if (ins->ci_stopping || ins->ci_sched_exiting)
return -DER_BUSY;

if (ins->ci_sched_running)
return -DER_ALREADY;

/*
* If ci_sched_running is zero but check instance is still running,
* then someone is trying to stop it.
*/
if (((ins->ci_is_leader && cbk->cb_magic == CHK_BK_MAGIC_LEADER) ||
(!ins->ci_is_leader && cbk->cb_magic == CHK_BK_MAGIC_ENGINE)) &&
cbk->cb_ins_status == CHK__CHECK_INST_STATUS__CIS_RUNNING)
return -DER_BUSY;

return 0;
}

Expand Down
15 changes: 6 additions & 9 deletions src/chk/chk_leader.c
Original file line number Diff line number Diff line change
Expand Up @@ -229,8 +229,6 @@ chk_leader_exit(struct chk_instance *ins, uint32_t ins_phase, uint32_t ins_statu
struct chk_iv iv = { 0 };
int rc = 0;

ins->ci_sched_exiting = 1;

D_ASSERT(d_list_empty(&ins->ci_pool_shutdown_list));

chk_pool_stop_all(ins, pool_status, NULL);
Expand Down Expand Up @@ -262,8 +260,6 @@ chk_leader_exit(struct chk_instance *ins, uint32_t ins_phase, uint32_t ins_statu
D_ERROR(DF_LEADER" exit with status %u: "DF_RC"\n",
DP_LEADER(ins), ins_status, DP_RC(rc));
}

ins->ci_sched_exiting = 0;
}

static void
Expand Down Expand Up @@ -1306,7 +1302,7 @@ chk_leader_need_stop(struct chk_instance *ins, int *ret)
}
}

if (!ins->ci_sched_running) {
if (!ins->ci_sched_running || ins->ci_sched_exiting) {
*ret = 0;
return true;
}
Expand Down Expand Up @@ -1924,7 +1920,7 @@ chk_leader_pool_mbs_one(struct chk_pool_rec *cpr)
if (rc1 == RSVC_CLIENT_RECHOOSE ||
(rc1 == RSVC_CLIENT_PROCEED && daos_rpc_retryable_rc(rc))) {
dss_sleep(interval);
if (cpr->cpr_stop || !ins->ci_sched_running) {
if (cpr->cpr_stop || !ins->ci_sched_running || ins->ci_sched_exiting) {
notify = false;
D_GOTO(out_client, rc = 0);
}
Expand Down Expand Up @@ -2158,7 +2154,7 @@ chk_leader_sched(void *args)
ABT_mutex_lock(ins->ci_abt_mutex);

again:
if (!ins->ci_sched_running) {
if (ins->ci_sched_exiting) {
ABT_mutex_unlock(ins->ci_abt_mutex);
D_GOTO(out, rc = 0);
}
Expand Down Expand Up @@ -2297,6 +2293,7 @@ chk_leader_sched(void *args)
D_INFO(DF_LEADER" scheduler exit at phase %u with status %u: rc %d\n",
DP_LEADER(ins), cbk->cb_phase, ins_status, rc);

ins->ci_sched_exiting = 0;
ins->ci_sched_running = 0;
}

Expand Down Expand Up @@ -3032,7 +3029,7 @@ chk_leader_stop(int pool_nr, uuid_t pools[])
if (ins->ci_starting)
D_GOTO(log, rc = -DER_BUSY);

if (ins->ci_stopping)
if (ins->ci_stopping || ins->ci_sched_exiting)
D_GOTO(log, rc = -DER_INPROGRESS);

/*
Expand Down Expand Up @@ -3613,7 +3610,7 @@ chk_leader_report(struct chk_report_unit *cru, uint64_t *seq, int *decision)
goto out;
}

if (!ins->ci_sched_running || cpr->cpr_exiting) {
if (!ins->ci_sched_running || ins->ci_sched_exiting || cpr->cpr_exiting) {
rc = 1;
ABT_mutex_unlock(cpr->cpr_mutex);
goto out;
Expand Down

0 comments on commit c61f30d

Please sign in to comment.