Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DAOS-13047 chk: properly handle start options #12242

Merged
merged 2 commits into from
Jun 12, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions src/chk/chk_common.c
Original file line number Diff line number Diff line change
Expand Up @@ -902,6 +902,8 @@ chk_prop_prepare(d_rank_t leader, uint32_t flags, int phase,
int i;

prop->cp_leader = leader;
if (!(flags & CHK__CHECK_FLAG__CF_DRYRUN))
prop->cp_flags &= ~CHK__CHECK_FLAG__CF_DRYRUN;
if (flags & CHK__CHECK_FLAG__CF_NO_FAILOUT)
prop->cp_flags &= ~CHK__CHECK_FLAG__CF_FAILOUT;
if (flags & CHK__CHECK_FLAG__CF_NO_AUTO)
Expand Down
15 changes: 10 additions & 5 deletions src/chk/chk_engine.c
Original file line number Diff line number Diff line change
Expand Up @@ -460,6 +460,10 @@ chk_engine_pm_dangling(struct chk_pool_rec *cpr, struct pool_map *map, struct po
case CHK__CHECK_INCONSIST_ACTION__CIA_IGNORE:
/* Report the inconsistency without repair. */
cbk->cb_statistics.cs_ignored++;
/*
* If a pool's dangling map entry is left unrepaired, the subsequent checks
* (which are based on the pool map) may fail, so skip the pool to avoid confusion.
*/
cpr->cpr_skip = 1;
break;
default:
Expand Down Expand Up @@ -1972,8 +1976,8 @@ chk_engine_start_prep(struct chk_instance *ins, uint32_t rank_nr, d_rank_t *rank

init:
if (!chk_is_on_leader(gen, leader, true)) {
rc = chk_prop_prepare(leader, api_flags | prop->cp_flags, phase,
policy_nr, policies, rank_list, prop);
rc = chk_prop_prepare(leader, api_flags, phase, policy_nr, policies, rank_list,
prop);
if (rc != 0)
goto out;

Expand Down Expand Up @@ -2209,9 +2213,10 @@ chk_engine_start(uint64_t gen, uint32_t rank_nr, d_rank_t *ranks, uint32_t polic
chk_destroy_pool_tree(ins);
out_log:
if (rc >= 0) {
D_INFO(DF_ENGINE" started on rank %u with api_flags %x, phase %d, leader %u, "
"flags %x: rc %d\n",
DP_ENGINE(ins), myrank, api_flags, phase, leader, flags, rc);
D_INFO(DF_ENGINE " %s on rank %u with api_flags %x, phase %d, leader %u, "
"flags %x: rc %d\n",
DP_ENGINE(ins), chk_is_ins_reset(ins, api_flags) ? "start" : "resume",
myrank, api_flags, phase, leader, flags, rc);

chk_ranks_dump(ins->ci_ranks->rl_nr, ins->ci_ranks->rl_ranks);
chk_pools_dump(&ins->ci_pool_list, pool_nr, pools);
Expand Down
6 changes: 6 additions & 0 deletions src/chk/chk_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -810,6 +810,12 @@ void chk_vos_init(void);

void chk_vos_fini(void);

/*
 * Report whether the check instance is being (re)started from a reset state.
 *
 * Returns true when either CHK__CHECK_FLAG__CF_RESET is set in \a flags or
 * CSF_RESET_ALL is set in the instance's ci_start_flags; otherwise false.
 * The instance is only consulted when \a flags does not already carry the
 * reset bit.
 */
static inline bool
chk_is_ins_reset(struct chk_instance *ins, uint32_t flags)
{
	/* The reset bit in the given flags is enough on its own. */
	if ((flags & CHK__CHECK_FLAG__CF_RESET) != 0)
		return true;

	/* Otherwise fall back to the start flags recorded on the instance. */
	return (ins->ci_start_flags & CSF_RESET_ALL) != 0;
}

static inline void
chk_ins_set_fail(struct chk_instance *ins, uint32_t phase)
{
Expand Down
31 changes: 12 additions & 19 deletions src/chk/chk_leader.c
Original file line number Diff line number Diff line change
Expand Up @@ -707,8 +707,6 @@ chk_leader_orphan_pool(struct chk_pool_rec *cpr)
clue->pc_svc_clue->psc_db_clue.bcl_replicas);
if (result != 0) {
cbk->cb_statistics.cs_failed++;
/* Skip the pool if failed to register to MS. */
cpr->cpr_skip = 1;
} else {
cbk->cb_statistics.cs_repaired++;
cpr->cpr_exist_on_ms = 1;
Expand Down Expand Up @@ -738,8 +736,6 @@ chk_leader_orphan_pool(struct chk_pool_rec *cpr)
case CHK__CHECK_INCONSIST_ACTION__CIA_IGNORE:
/* Report the inconsistency without repair. */
cbk->cb_statistics.cs_ignored++;
/* If ignore the orphan pool, then skip subsequent check. */
cpr->cpr_skip = 1;
break;
default:
/*
Expand All @@ -755,7 +751,6 @@ chk_leader_orphan_pool(struct chk_pool_rec *cpr)
/* Ignore the inconsistency if admin does not want interaction. */
act = CHK__CHECK_INCONSIST_ACTION__CIA_IGNORE;
cbk->cb_statistics.cs_ignored++;
cpr->cpr_skip = 1;
} else {
act = CHK__CHECK_INCONSIST_ACTION__CIA_INTERACT;

Expand Down Expand Up @@ -806,8 +801,6 @@ chk_leader_orphan_pool(struct chk_pool_rec *cpr)

if (rc < 0 && option_nr > 0) {
cbk->cb_statistics.cs_failed++;
/* Skip the orphan if failed to interact with admin for further action. */
cpr->cpr_skip = 1;
result = rc;
}

Expand All @@ -832,8 +825,6 @@ chk_leader_orphan_pool(struct chk_pool_rec *cpr)
case CHK__CHECK_INCONSIST_ACTION__CIA_IGNORE:
act = CHK__CHECK_INCONSIST_ACTION__CIA_IGNORE;
cbk->cb_statistics.cs_ignored++;
/* If ignore the orphan pool, then skip subsequent check. */
cpr->cpr_skip = 1;
break;
case CHK__CHECK_INCONSIST_ACTION__CIA_DISCARD:
act = CHK__CHECK_INCONSIST_ACTION__CIA_DISCARD;
Expand Down Expand Up @@ -866,8 +857,6 @@ chk_leader_orphan_pool(struct chk_pool_rec *cpr)
clue->pc_svc_clue->psc_db_clue.bcl_replicas);
if (result != 0) {
cbk->cb_statistics.cs_failed++;
/* Skip the pool if failed to register to MS. */
cpr->cpr_skip = 1;
} else {
cbk->cb_statistics.cs_repaired++;
cpr->cpr_exist_on_ms = 1;
Expand Down Expand Up @@ -1857,6 +1846,11 @@ chk_leader_need_stop(struct chk_instance *ins)
}

if (!dangling) {
/*
* "ci_stopping" means that the user has asked to stop the checker for some
* pools, but the specified pools may not be under check. "ci_pool_stopped"
* means that the checker has actually been stopped for some pools.
*/
if (ins->ci_pool_stopped) {
D_ASSERT(ins->ci_stopping);
return 0;
Expand Down Expand Up @@ -2325,7 +2319,7 @@ chk_leader_start_prep(struct chk_instance *ins, uint32_t rank_nr, d_rank_t *rank
if (rc != 0)
goto out;

ins->ci_start_flags |= CSF_RESET_ALL;
ins->ci_start_flags = CSF_RESET_ALL;
if (pool_nr <= 0)
ins->ci_start_flags |= CSF_ORPHAN_POOL;

Expand All @@ -2334,8 +2328,7 @@ chk_leader_start_prep(struct chk_instance *ins, uint32_t rank_nr, d_rank_t *rank
cbk->cb_version = DAOS_CHK_VERSION;

init:
rc = chk_prop_prepare(leader, flags | prop->cp_flags, phase,
policy_nr, policies, rank_list, prop);
rc = chk_prop_prepare(leader, flags, phase, policy_nr, policies, rank_list, prop);
if (rc != 0)
goto out;

Expand Down Expand Up @@ -2768,10 +2761,10 @@ chk_leader_start(uint32_t rank_nr, d_rank_t *ranks, uint32_t policy_nr, struct c
goto out_stop_pools;
}

D_INFO("Leader %s check with api_flags %x, phase %d, leader %u, flags %x, gen "
DF_X64": rc %d\n",
(flags & CHK__CHECK_FLAG__CF_RESET || ins->ci_start_flags & CSF_RESET_ALL) ?
"start" : "resume", api_flags, phase, myrank, ins->ci_start_flags, cbk->cb_gen, rc);
D_INFO("Leader %s check with api_flags %x, phase %d, leader %u, flags %x, gen " DF_X64
": rc %d\n",
chk_is_ins_reset(ins, flags) ? "start" : "resume", api_flags, phase, myrank,
ins->ci_start_flags, cbk->cb_gen, rc);

chk_ranks_dump(ins->ci_ranks->rl_nr, ins->ci_ranks->rl_ranks);
chk_pools_dump(&ins->ci_pool_list, c_pool_nr > 0 ? c_pool_nr : pool_nr,
Expand Down Expand Up @@ -2821,7 +2814,7 @@ chk_leader_start(uint32_t rank_nr, d_rank_t *ranks, uint32_t policy_nr, struct c
ins->ci_starting = 0;

/* Notify the control plane that the check (re-)starts from scratch. */
if (flags & CHK__CHECK_FLAG__CF_RESET || ins->ci_start_flags & CSF_RESET_ALL)
if (chk_is_ins_reset(ins, flags))
rc = 1;

if (c_pools != NULL && c_pools != pools)
Expand Down
Loading