Skip to content

Commit

Permalink
DAOS-13117 chk: properly set pool label in check report upcall (#11896)
Browse files Browse the repository at this point in the history
cpr->cpr_label is empty for an orphan pool, which confuses the
related MS logic when it handles the check report upcall. In that
case, we should use the PS pool label in the check report upcall.

The patch also fixes another issue in handling a broken pool without
pool service quorum. In that case, it uses ds_chk_regpool_upcall()
to notify the MS to refresh the related pool service replicas. But at
that time, we do not know whether the pool label is consistent between
the MS and the PS. So we should use the MS pool label instead of the
PS label, to avoid mistakenly overwriting the MS pool label.

Signed-off-by: Fan Yong <[email protected]>
  • Loading branch information
Nasf-Fan authored Apr 13, 2023
1 parent 162b660 commit 75431f4
Showing 1 changed file with 12 additions and 14 deletions.
26 changes: 12 additions & 14 deletions src/chk/chk_leader.c
Original file line number Diff line number Diff line change
Expand Up @@ -682,7 +682,7 @@ chk_leader_orphan_pool(struct chk_pool_rec *cpr)
struct chk_instance *ins = cpr->cpr_ins;
struct chk_property *prop = &ins->ci_prop;
struct chk_bookmark *cbk = &ins->ci_bk;
struct ds_pool_clue *clue;
struct ds_pool_clue *clue = cpr->cpr_clue;
char *strs[3];
d_iov_t iovs[3];
d_sg_list_t sgl;
Expand Down Expand Up @@ -729,7 +729,6 @@ chk_leader_orphan_pool(struct chk_pool_rec *cpr)
cbk->cb_statistics.cs_repaired++;
cpr->cpr_exist_on_ms = 1;
} else {
clue = cpr->cpr_clue;
result = ds_chk_regpool_upcall(seq, cpr->cpr_uuid, clue->pc_label,
clue->pc_svc_clue->psc_db_clue.bcl_replicas);
if (result != 0) {
Expand Down Expand Up @@ -817,7 +816,7 @@ chk_leader_orphan_pool(struct chk_pool_rec *cpr)
cru.cru_option_nr = option_nr;
cru.cru_detail_nr = detail_nr;
cru.cru_pool = (uuid_t *)&cpr->cpr_uuid;
cru.cru_pool_label = cpr->cpr_label;
cru.cru_pool_label = clue->pc_label;
cru.cru_msg = "Check leader detects orphan pool.\n";
cru.cru_options = options;
cru.cru_details = details;
Expand Down Expand Up @@ -889,7 +888,6 @@ chk_leader_orphan_pool(struct chk_pool_rec *cpr)
cbk->cb_statistics.cs_repaired++;
cpr->cpr_exist_on_ms = 1;
} else {
clue = cpr->cpr_clue;
result = ds_chk_regpool_upcall(seq, cpr->cpr_uuid, clue->pc_label,
clue->pc_svc_clue->psc_db_clue.bcl_replicas);
if (result != 0) {
Expand Down Expand Up @@ -1046,7 +1044,7 @@ chk_leader_no_quorum_pool(struct chk_pool_rec *cpr)
goto report;

clue = cpr->cpr_clue;
result = ds_chk_regpool_upcall(seq, cpr->cpr_uuid, clue->pc_label,
result = ds_chk_regpool_upcall(seq, cpr->cpr_uuid, cpr->cpr_label,
clue->pc_svc_clue->psc_db_clue.bcl_replicas);
if (result != 0) {
cbk->cb_statistics.cs_failed++;
Expand Down Expand Up @@ -1216,7 +1214,7 @@ chk_leader_no_quorum_pool(struct chk_pool_rec *cpr)
break;

clue = cpr->cpr_clue;
result = ds_chk_regpool_upcall(seq, cpr->cpr_uuid, clue->pc_label,
result = ds_chk_regpool_upcall(seq, cpr->cpr_uuid, cpr->cpr_label,
clue->pc_svc_clue->psc_db_clue.bcl_replicas);
if (result != 0) {
cbk->cb_statistics.cs_failed++;
Expand Down Expand Up @@ -1629,6 +1627,14 @@ chk_leader_handle_pools_list(struct chk_instance *ins)
cpr = (struct chk_pool_rec *)riov.iov_buf;
cpr->cpr_exist_on_ms = 1;

rc = chk_dup_string(&cpr->cpr_label, clp[i].clp_label,
clp[i].clp_label != NULL ?
strlen(clp[i].clp_label) : 0);
if (rc != 0) {
cpr->cpr_skip = 1;
goto out;
}

/* No engine reported a shard for the pool, so it is a dangling pool. */
if (d_list_empty(&cpr->cpr_shard_list)) {
chk_pool_get(cpr);
Expand Down Expand Up @@ -1657,14 +1663,6 @@ chk_leader_handle_pools_list(struct chk_instance *ins)
continue;
}

rc = chk_dup_string(&cpr->cpr_label, clp[i].clp_label,
clp[i].clp_label != NULL ?
strlen(clp[i].clp_label) : 0);
if (rc != 0) {
cpr->cpr_skip = 1;
goto out;
}

chk_pool_get(cpr);
/*
* Each pool will have a dedicated ULT to handle the subsequent check,
Expand Down

0 comments on commit 75431f4

Please sign in to comment.