Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DAOS-9599 chk: handle inconsistent pool label #9524

Merged
merged 1 commit into from
Jul 11, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 54 additions & 2 deletions src/chk/chk_common.c
Original file line number Diff line number Diff line change
Expand Up @@ -339,7 +339,7 @@ chk_ranks_dump(uint32_t rank_nr, d_rank_t *ranks)
}

void
chk_pools_dump(uint32_t pool_nr, uuid_t pools[])
chk_pools_dump(int pool_nr, uuid_t pools[])
{
char buf[256];
char *ptr = buf;
Expand Down Expand Up @@ -371,6 +371,58 @@ chk_pools_dump(uint32_t pool_nr, uuid_t pools[])
D_INFO("%s\n", buf);
}

/*
 * Pool filter callback: decide whether the pool identified by \a uuid is selected.
 *
 * Two selection modes are supported, chosen by the content of the filter arguments:
 * - If cpfa_pool_hdl is a valid handle, the UUID is looked up as a key via
 *   dbtree_lookup() on that handle. Only a zero return counts as a match; any
 *   non-zero return is treated as "not selected".
 * - Otherwise the explicit cpfa_pools[] list is scanned. A non-positive
 *   cpfa_pool_nr selects every pool.
 *
 * \param[in] uuid	UUID of the pool to be tested.
 * \param[in] arg	Pointer to a struct chk_pool_filter_args.
 *
 * \return		1 if the pool is selected, 0 otherwise.
 */
int
chk_pool_filter(uuid_t uuid, void *arg)
{
	struct chk_pool_filter_args	*filter = arg;
	d_iov_t				 key;
	d_iov_t				 val;
	int				 idx;

	if (daos_handle_is_valid(filter->cpfa_pool_hdl)) {
		/* Existence probe only: no value buffer is supplied. */
		d_iov_set(&val, NULL, 0);
		d_iov_set(&key, uuid, sizeof(uuid_t));

		return dbtree_lookup(filter->cpfa_pool_hdl, &key, &val) == 0 ? 1 : 0;
	}

	/* No explicit pool list: everything passes the filter. */
	if (filter->cpfa_pool_nr <= 0)
		return 1;

	for (idx = 0; idx < filter->cpfa_pool_nr; idx++) {
		if (uuid_compare(uuid, filter->cpfa_pools[idx]) == 0)
			return 1;
	}

	return 0;
}

/*
 * Duplicate a pool label into a newly allocated buffer.
 *
 * \param[out] tgt	Receives the new buffer, or NULL when \a src is NULL.
 *			Untouched semantics on failure follow D_ALLOC: *tgt is
 *			whatever D_ALLOC left there (checked against NULL below).
 * \param[in]  src	Label to be copied; may be NULL.
 * \param[in]  len	Count of bytes to copy from \a src; asserted to be
 *			non-zero when \a src is not NULL.
 *
 * \return		0 on success (including the NULL \a src case),
 *			-DER_NOMEM if the allocation failed.
 */
int
chk_dup_label(char **tgt, const char *src, size_t len)
{
	if (src == NULL) {
		*tgt = NULL;
		return 0;
	}

	D_ASSERT(len > 0);

	/*
	 * One byte more than the label is allocated but only 'len' bytes are
	 * written, so the trailing byte keeps whatever D_ALLOC put there.
	 * NOTE(review): presumably D_ALLOC zero-fills, which makes that byte
	 * the NUL terminator - confirm against the macro definition.
	 */
	D_ALLOC(*tgt, len + 1);
	if (*tgt == NULL)
		return -DER_NOMEM;

	memcpy(*tgt, src, len);

	return 0;
}

void
chk_stop_sched(struct chk_instance *ins)
{
Expand All @@ -385,7 +437,7 @@ chk_stop_sched(struct chk_instance *ins)

int
chk_prop_prepare(uint32_t rank_nr, d_rank_t *ranks, uint32_t policy_nr,
struct chk_policy *policies, uint32_t pool_nr, uuid_t pools[],
struct chk_policy *policies, int pool_nr, uuid_t pools[],
uint32_t flags, int phase, d_rank_t leader,
struct chk_property *prop, d_rank_list_t **rlist)
{
Expand Down
49 changes: 13 additions & 36 deletions src/chk/chk_engine.c
Original file line number Diff line number Diff line change
Expand Up @@ -34,11 +34,6 @@ struct chk_traverse_pools_args {
uint32_t ctpa_status;
};

/* Arguments handed to chk_engine_clues_filter() through its opaque 'arg' pointer. */
struct chk_engine_clues_args {
	/* Number of entries in ceca_pools; zero makes the filter return 0 for every pool. */
	uint32_t ceca_pool_nr;
	/* Array of pool UUIDs that the filter compares each candidate against. */
	uuid_t *ceca_pools;
};

struct chk_query_pool_args {
struct chk_instance *cqpa_ins;
uint32_t cqpa_cap;
Expand Down Expand Up @@ -444,7 +439,7 @@ chk_engine_sched(void *args)
static int
chk_engine_start_prepare(struct chk_instance *ins, uint32_t rank_nr, d_rank_t *ranks,
uint32_t policy_nr, struct chk_policy *policies,
uint32_t pool_nr, uuid_t pools[], uint64_t gen, int phase,
int pool_nr, uuid_t pools[], uint64_t gen, int phase,
uint32_t flags, d_rank_t leader, d_rank_list_t **rlist)
{
struct chk_bookmark *cbk = &ins->ci_bk;
Expand Down Expand Up @@ -681,27 +676,10 @@ chk_pools_add_from_db(struct sys_db *db, char *table, d_iov_t *key, void *args)
return rc;
}

/*
 * Filter callback that tests a pool UUID against the list carried in a
 * struct chk_engine_clues_args.
 *
 * \param[in] uuid	UUID of the pool to be tested.
 * \param[in] arg	Pointer to a struct chk_engine_clues_args.
 *
 * \return		0 if the list is empty or \a uuid is in the list,
 *			1 if the list is non-empty and \a uuid is not in it.
 *
 * NOTE(review): how the caller interprets 0 versus 1 is not visible here.
 */
static int
chk_engine_clues_filter(uuid_t uuid, void *arg)
{
	struct chk_engine_clues_args	*filter = arg;
	int				 verdict = 0;
	int				 idx;

	if (filter->ceca_pool_nr != 0) {
		/* A non-empty list: assume "not listed" until a match is seen. */
		verdict = 1;

		for (idx = 0; idx < filter->ceca_pool_nr; idx++) {
			if (uuid_compare(uuid, filter->ceca_pools[idx]) == 0) {
				verdict = 0;
				break;
			}
		}
	}

	return verdict;
}

int
chk_engine_start(uint64_t gen, uint32_t rank_nr, d_rank_t *ranks,
uint32_t policy_nr, struct chk_policy *policies, uint32_t pool_nr,
uuid_t pools[], uint32_t flags, int32_t exp_phase, d_rank_t leader,
uint32_t policy_nr, struct chk_policy *policies, int pool_nr,
uuid_t pools[], uint32_t flags, int exp_phase, d_rank_t leader,
uint32_t *cur_phase, struct ds_pool_clues *clues)
{
struct chk_instance *ins = chk_engine;
Expand All @@ -711,7 +689,7 @@ chk_engine_start(uint64_t gen, uint32_t rank_nr, d_rank_t *ranks,
struct chk_pool_rec *cpr;
struct chk_pool_rec *tmp;
struct chk_traverse_pools_args ctpa = { 0 };
struct chk_engine_clues_args ceca = { 0 };
struct chk_pool_filter_args cpfa = { 0 };
struct umem_attr uma = { 0 };
uuid_t dummy_pool;
d_rank_t myrank = dss_self_rank();
Expand Down Expand Up @@ -838,9 +816,8 @@ chk_engine_start(uint64_t gen, uint32_t rank_nr, d_rank_t *ranks,

if (cbk->cb_phase == CHK__CHECK_SCAN_PHASE__CSP_PREPARE ||
cbk->cb_phase == CHK__CHECK_SCAN_PHASE__CSP_POOL_LIST) {
ceca.ceca_pool_nr = pool_nr;
ceca.ceca_pools = pools;
rc = ds_pool_clues_init(chk_engine_clues_filter, &ceca, clues);
cpfa.cpfa_pool_hdl = ins->ci_pool_hdl;
rc = ds_pool_clues_init(chk_pool_filter, &cpfa, clues);
if (rc != 0)
goto out_bk;
}
Expand Down Expand Up @@ -880,7 +857,7 @@ chk_engine_start(uint64_t gen, uint32_t rank_nr, d_rank_t *ranks,
ins->ci_starting = 0;

if (rc == 0) {
D_INFO(DF_ENGINE" started on rank %u with %u ranks, %u pools, "
D_INFO(DF_ENGINE" started on rank %u with %u ranks, %d pools, "
"flags %x, phase %d, leader %u\n",
DP_ENGINE(ins), myrank, rank_nr, pool_nr, flags, exp_phase, leader);

Expand All @@ -893,7 +870,7 @@ chk_engine_start(uint64_t gen, uint32_t rank_nr, d_rank_t *ranks,
} else if (rc > 0) {
*cur_phase = CHK__CHECK_SCAN_PHASE__DSP_DONE;
} else if (rc != -DER_ALREADY) {
D_ERROR(DF_ENGINE" failed to start on rank %u with %u ranks, %u pools, flags %x, "
D_ERROR(DF_ENGINE" failed to start on rank %u with %u ranks, %d pools, flags %x, "
"phase %d, leader %u, gen "DF_X64": "DF_RC"\n", DP_ENGINE(ins), myrank,
rank_nr, pool_nr, flags, exp_phase, leader, gen, DP_RC(rc));
}
Expand All @@ -904,7 +881,7 @@ chk_engine_start(uint64_t gen, uint32_t rank_nr, d_rank_t *ranks,
}

int
chk_engine_stop(uint64_t gen, uint32_t pool_nr, uuid_t pools[])
chk_engine_stop(uint64_t gen, int pool_nr, uuid_t pools[])
{
struct chk_instance *ins = chk_engine;
struct chk_property *prop = &ins->ci_prop;
Expand Down Expand Up @@ -954,7 +931,7 @@ chk_engine_stop(uint64_t gen, uint32_t pool_nr, uuid_t pools[])
ins->ci_stopping = 0;

if (rc == 0) {
D_INFO(DF_ENGINE" stopped on rank %u with %u pools\n",
D_INFO(DF_ENGINE" stopped on rank %u with %d pools\n",
DP_ENGINE(ins), dss_self_rank(), pool_nr > 0 ? pool_nr : prop->cp_pool_nr);

if (pool_nr > 0)
Expand All @@ -964,7 +941,7 @@ chk_engine_stop(uint64_t gen, uint32_t pool_nr, uuid_t pools[])
} else if (rc == -DER_ALREADY) {
rc = 1;
} else if (rc < 0) {
D_ERROR(DF_ENGINE" failed to stop on rank %u with %u pools, "
D_ERROR(DF_ENGINE" failed to stop on rank %u with %d pools, "
"gen "DF_X64": "DF_RC"\n", DP_ENGINE(ins), dss_self_rank(),
pool_nr > 0 ? pool_nr : prop->cp_pool_nr, gen, DP_RC(rc));
}
Expand Down Expand Up @@ -1127,7 +1104,7 @@ chk_engine_query_pool(uuid_t uuid, void *args)
}

int
chk_engine_query(uint64_t gen, uint32_t pool_nr, uuid_t pools[],
chk_engine_query(uint64_t gen, int pool_nr, uuid_t pools[],
uint32_t *shard_nr, struct chk_query_pool_shard **shards)
{
struct chk_instance *ins = chk_engine;
Expand Down Expand Up @@ -1165,7 +1142,7 @@ chk_engine_query(uint64_t gen, uint32_t pool_nr, uuid_t pools[],
}

D_CDEBUG(rc != 0, DLOG_ERR, DLOG_DBG,
DF_ENGINE" on rank %u handle query for %u pools :"DF_RC"\n",
DF_ENGINE" on rank %u handle query for %d pools :"DF_RC"\n",
DP_ENGINE(ins), dss_self_rank(), pool_nr, DP_RC(rc));

out:
Expand Down
37 changes: 25 additions & 12 deletions src/chk/chk_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -378,6 +378,7 @@ struct chk_pool_rec {
cpr_done:1,
cpr_skip:1,
cpr_healthy:1,
cpr_delay_label:1,
cpr_exist_on_ms:1;
int cpr_advice;
uint32_t cpr_phase;
Expand All @@ -386,6 +387,7 @@ struct chk_pool_rec {
struct ds_pool_clues cpr_clues;
struct chk_bookmark cpr_bk;
struct chk_instance *cpr_ins;
char *cpr_label;
int cpr_refs;
};

Expand Down Expand Up @@ -424,6 +426,12 @@ struct chk_report_unit {
uint32_t cru_result;
};

/* Arguments handed to chk_pool_filter() through its opaque 'arg' pointer. */
struct chk_pool_filter_args {
	/*
	 * When this handle is valid, chk_pool_filter() looks the pool UUID up
	 * with dbtree_lookup() on it and does not consult the fields below.
	 */
	daos_handle_t cpfa_pool_hdl;
	/*
	 * Number of entries in cpfa_pools, used only when cpfa_pool_hdl is not
	 * valid. A value <= 0 makes the filter accept every pool.
	 */
	int32_t cpfa_pool_nr;
	/* Array of pool UUIDs that the filter compares each candidate against. */
	uuid_t *cpfa_pools;
};

extern struct crt_proto_format chk_proto_fmt;

extern struct crt_corpc_ops chk_start_co_ops;
Expand All @@ -440,12 +448,16 @@ extern btr_ops_t chk_rank_ops;

void chk_ranks_dump(uint32_t rank_nr, d_rank_t *ranks);

void chk_pools_dump(uint32_t pool_nr, uuid_t pools[]);
void chk_pools_dump(int pool_nr, uuid_t pools[]);

int chk_pool_filter(uuid_t uuid, void *arg);

int chk_dup_label(char **tgt, const char *src, size_t len);

void chk_stop_sched(struct chk_instance *ins);

int chk_prop_prepare(uint32_t rank_nr, d_rank_t *ranks, uint32_t policy_nr,
struct chk_policy *policies, uint32_t pool_nr, uuid_t pools[],
struct chk_policy *policies, int pool_nr, uuid_t pools[],
uint32_t flags, int phase, d_rank_t leader,
struct chk_property *prop, d_rank_list_t **rlist);

Expand All @@ -469,13 +481,13 @@ void chk_ins_fini(struct chk_instance *ins);
/* chk_engine.c */

int chk_engine_start(uint64_t gen, uint32_t rank_nr, d_rank_t *ranks,
uint32_t policy_nr, struct chk_policy *policies, uint32_t pool_nr,
uuid_t pools[], uint32_t flags, int32_t exp_phase, d_rank_t leader,
uint32_t policy_nr, struct chk_policy *policies, int pool_nr,
uuid_t pools[], uint32_t flags, int exp_phase, d_rank_t leader,
uint32_t *cur_phase, struct ds_pool_clues *clues);

int chk_engine_stop(uint64_t gen, uint32_t pool_nr, uuid_t pools[]);
int chk_engine_stop(uint64_t gen, int pool_nr, uuid_t pools[]);

int chk_engine_query(uint64_t gen, uint32_t pool_nr, uuid_t pools[],
int chk_engine_query(uint64_t gen, int pool_nr, uuid_t pools[],
uint32_t *shard_nr, struct chk_query_pool_shard **shards);

int chk_engine_mark_rank_dead(uint64_t gen, d_rank_t rank, uint32_t version);
Expand Down Expand Up @@ -524,22 +536,22 @@ void chk_leader_fini(void);
/* chk_rpc.c */

int chk_start_remote(d_rank_list_t *rank_list, uint64_t gen, uint32_t rank_nr, d_rank_t *ranks,
uint32_t policy_nr, struct chk_policy *policies, uint32_t pool_nr,
uuid_t pools[], uint32_t flags, int32_t phase, d_rank_t leader,
uint32_t policy_nr, struct chk_policy *policies, int pool_nr,
uuid_t pools[], uint32_t flags, int phase, d_rank_t leader,
chk_co_rpc_cb_t start_cb, void *args);

int chk_stop_remote(d_rank_list_t *rank_list, uint64_t gen, uint32_t pool_nr, uuid_t pools[],
int chk_stop_remote(d_rank_list_t *rank_list, uint64_t gen, int pool_nr, uuid_t pools[],
chk_co_rpc_cb_t stop_cb, void *args);

int chk_query_remote(d_rank_list_t *rank_list, uint64_t gen, uint32_t pool_nr, uuid_t pools[],
int chk_query_remote(d_rank_list_t *rank_list, uint64_t gen, int pool_nr, uuid_t pools[],
chk_co_rpc_cb_t query_cb, void *args);

int chk_mark_remote(d_rank_list_t *rank_list, uint64_t gen, d_rank_t rank, uint32_t version);

int chk_act_remote(d_rank_list_t *rank_list, uint64_t gen, uint64_t seq, uint32_t cla,
uint32_t act, d_rank_t rank, bool for_all);

int chk_report_remote(d_rank_t leader, uint64_t gen, uint32_t cla, uint32_t act, int32_t result,
int chk_report_remote(d_rank_t leader, uint64_t gen, uint32_t cla, uint32_t act, int result,
d_rank_t rank, uint32_t target, uuid_t *pool, uuid_t *cont,
daos_unit_oid_t *obj, daos_key_t *dkey, daos_key_t *akey, char *msg,
uint32_t option_nr, uint32_t *options, uint32_t detail_nr,
Expand All @@ -549,7 +561,7 @@ int chk_rejoin_remote(d_rank_t leader, uint64_t gen, d_rank_t rank, uint32_t pha

/* chk_upcall.c */

int chk_report_upcall(uint64_t gen, uint64_t seq, uint32_t cla, uint32_t act, int32_t result,
int chk_report_upcall(uint64_t gen, uint64_t seq, uint32_t cla, uint32_t act, int result,
d_rank_t rank, uint32_t target, uuid_t *pool, uuid_t *cont,
daos_unit_oid_t *obj, daos_key_t *dkey, daos_key_t *akey, char *msg,
uint32_t option_nr, uint32_t *options, uint32_t detail_nr,
Expand Down Expand Up @@ -711,6 +723,7 @@ chk_pool_put(struct chk_pool_rec *cpr)
d_list_del(&cpr->cpr_link);
D_ASSERT(cpr->cpr_thread == ABT_THREAD_NULL);
D_ASSERT(cpr->cpr_started == 0);
D_FREE(cpr->cpr_label);
D_FREE(cpr);
}
}
Expand Down
Loading