Skip to content

Commit

Permalink
DAOS-14010 rebuild: add delay rebuild (#13357)
Browse files Browse the repository at this point in the history
* DAOS-14010 rebuild: add delay rebuild

Add a "delay rebuild" self-healing mode. The delay rebuild process is:

1) SWIM detects dead ranks and reports them to the PS leader, which updates
the pool map, i.e. marks the related targets as DOWN.
2) The rebuild job is not scheduled, however, until there are further
manual pool operations, for example drain, extend, or reintegration.
3) All of these pool operations are then merged into one rebuild job,
which is then scheduled.

Update the placement algorithm to be able to calculate the layout with
merged pool operations.

Abort the rebuild job immediately if it finds a further pool map update,
so that the current job will be merged into the following rebuild job. As a
result, concurrent pool operations are allowed; there is no EBUSY check anymore.

Add various tests to verify the delay rebuild process.

Signed-off-by: Di Wang <[email protected]>
  • Loading branch information
wangdi authored Feb 26, 2024
1 parent c68ac48 commit 61e1334
Show file tree
Hide file tree
Showing 45 changed files with 1,273 additions and 1,046 deletions.
1 change: 1 addition & 0 deletions src/cart/crt_iv.c
Original file line number Diff line number Diff line change
Expand Up @@ -2545,6 +2545,7 @@ handle_ivupdate_response(const struct crt_cb_info *cb_info)
/* uci_bulk_hdl will not be set for invalidate call */
if (iv_info->uci_bulk_hdl != CRT_BULK_NULL)
crt_bulk_free(iv_info->uci_bulk_hdl);

iv_ops->ivo_on_put(iv_info->uci_ivns_internal, &iv_info->uci_iv_value,
iv_info->uci_user_priv);
child_output->rc = output->rc;
Expand Down
227 changes: 150 additions & 77 deletions src/common/pool_map.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/**
* (C) Copyright 2016-2023 Intel Corporation.
* (C) Copyright 2016-2024 Intel Corporation.
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand Down Expand Up @@ -1160,24 +1160,21 @@ pool_map_compat(struct pool_map *map, uint32_t version,
return -DER_NO_PERM;
}

} else if (dc->co_status == PO_COMP_ST_UPIN) {
} else if (dc->co_status & (PO_COMP_ST_UPIN | PO_COMP_ST_UP |
PO_COMP_ST_DOWN)) {
if (!existed) {
D_ERROR("status [UPIN] not valid for "
"new comp\n");
D_ERROR("status [%u] not valid for new comp\n",
dc->co_status);
return -DER_INVAL;
}

D_ASSERT(parent != NULL);
if (parent->do_comp.co_status ==
PO_COMP_ST_NEW) {
if (parent != NULL && parent->do_comp.co_status == PO_COMP_ST_NEW) {
D_ERROR("invalid parent status [NEW] "
"when component status "
"[UPIN]\n");
"when component status %u\n", dc->co_status);
return -DER_INVAL;
}
} else {
D_ERROR("bad comp status=0x%x\n",
dc->co_status);
D_ERROR("bad comp status=0x%x\n", dc->co_status);
return -DER_INVAL;
}

Expand Down Expand Up @@ -1321,8 +1318,8 @@ pool_map_merge(struct pool_map *map, uint32_t version,
ddom->do_targets = NULL;
ddom->do_child_nr = 0;
ddom->do_target_nr = 0;
D_DEBUG(DB_TRACE, "Add new domain %s %d\n",
pool_domain_name(cdom), dom_nr);
D_DEBUG(DB_TRACE, "Add new domain %s[%d] idx/nr %d/%u\n",
pool_domain_name(ddom), ddom->do_comp.co_id, i, dom_nr);
} else {
/* Domain existed, copy its children/targets
* from current pool map.
Expand Down Expand Up @@ -1382,9 +1379,9 @@ pool_map_merge(struct pool_map *map, uint32_t version,
if (dc->co_status != PO_COMP_ST_NEW)
continue;

D_DEBUG(DB_TRACE, "New %s[%d]\n",
D_DEBUG(DB_TRACE, "New %s[%d] to %u\n",
pool_comp_type2str(dc->co_type),
dc->co_id);
dc->co_id, (uint32_t)(child - dst_doms));

*child = sdom->do_children[j];
child++;
Expand Down Expand Up @@ -1808,6 +1805,144 @@ pool_map_create(struct pool_buf *buf, uint32_t version, struct pool_map **mapp)
return rc;
}

/**
 * Check whether every direct member of a domain matches a status mask.
 *
 * When the domain has no child domains (do_children is NULL) its targets are
 * inspected; otherwise only its immediate child domains are inspected.
 *
 * \param[in] domain	Domain whose direct members are checked.
 * \param[in] status	Bitmask of acceptable component status values.
 *
 * \return	true if every inspected component has at least one bit of
 *		\a status set (also true when there is nothing to inspect),
 *		false as soon as one component matches none of the bits.
 */
static bool
child_status_check(struct pool_domain *domain, uint32_t status)
{
	int	idx;

	if (domain->do_children != NULL) {
		/* Intermediate domain: look at the child domains only. */
		for (idx = 0; idx < domain->do_child_nr; idx++) {
			if ((domain->do_children[idx].do_comp.co_status & status) == 0)
				return false;
		}
		return true;
	}

	/* No child domains: look at the targets attached to this domain. */
	for (idx = 0; idx < domain->do_target_nr; idx++) {
		if ((domain->do_targets[idx].ta_comp.co_status & status) == 0)
			return false;
	}
	return true;
}

/**
 * Domain status update state machine.
 *
 * Recursively walk the child domains of \a domain looking for the target
 * whose component ID is \a id. The recursion returns bottom-up, so each child
 * domain on the path to that target is offered the transition to \a status
 * after its own descendants have been handled. \a domain itself and the
 * targets are never modified here; only child domains are.
 *
 * \param[in]  domain	Domain whose subtree is searched.
 * \param[in]  id	Component ID of the target being looked up.
 * \param[in]  status	Status the domains on the path should move to.
 * \param[in]  version	Version recorded in co_in_ver (UP/UPIN) or co_fseq
 *			(DOWN) when a transition is applied.
 * \param[out] updated	Set to true when at least one domain was changed; it
 *			is never reset to false by this function.
 *
 * \return	non-zero if the target was found under \a domain, 0 otherwise.
 */
static int
update_dom_status(struct pool_domain *domain, uint32_t id, uint32_t status, uint32_t version,
		  bool *updated)
{
	int	i;

	D_ASSERT(domain->do_targets != NULL);

	if (domain->do_children != NULL) {
		for (i = 0; i < domain->do_child_nr; i++) {
			struct pool_domain	*child = &domain->do_children[i];
			struct pool_component	*comp = &child->do_comp;
			int			 hit;

			/* Descend first so deeper domains are updated before this child. */
			hit = update_dom_status(child, id, status, version, updated);
			if (hit == 0)
				continue;

			switch (status) {
			case PO_COMP_ST_NEW:
				/*
				 * A domain only goes back to NEW from UP, i.e. when a
				 * rebuild is being reverted.
				 */
				if (comp->co_status != PO_COMP_ST_UP)
					break;

				D_DEBUG(DB_MD, "rank %u id %u status %u --> %u\n",
					comp->co_rank, comp->co_id, comp->co_status, status);
				comp->co_status = status;
				comp->co_in_ver = 0;
				*updated = true;
				break;
			case PO_COMP_ST_UP:
				/* A domain only goes to UP from NEW, DOWN or DOWNOUT. */
				if ((comp->co_status & (PO_COMP_ST_NEW | PO_COMP_ST_DOWN |
							PO_COMP_ST_DOWNOUT)) == 0)
					break;

				/* Remember that this domain came back from DOWN. */
				if (comp->co_status == PO_COMP_ST_DOWN)
					comp->co_flags = PO_COMPF_DOWN2UP;
				D_DEBUG(DB_MD, "rank %u id %u status %u --> %u\n",
					comp->co_rank, comp->co_id, comp->co_status, status);
				comp->co_status = status;
				comp->co_in_ver = version;
				*updated = true;
				break;
			case PO_COMP_ST_UPIN:
				/*
				 * A domain only goes to UPIN from UP. A domain that is
				 * already UPIN because only some of its targets went UP
				 * is left alone, since it could not be moved to UP.
				 */
				if (comp->co_status != PO_COMP_ST_UP)
					break;

				D_DEBUG(DB_MD, "rank %u id %u status %u --> %u\n",
					comp->co_rank, comp->co_id, comp->co_status, status);
				comp->co_status = status;
				comp->co_in_ver = version;
				*updated = true;
				break;
			case PO_COMP_ST_DOWNOUT:
			case PO_COMP_ST_DOWN:
				/*
				 * Only move to DOWN/DOWNOUT once every direct member of
				 * the child is DOWN or DOWNOUT, and only if that actually
				 * changes the child's status.
				 */
				if (!child_status_check(child,
							PO_COMP_ST_DOWN | PO_COMP_ST_DOWNOUT) ||
				    comp->co_status == status)
					break;

				D_DEBUG(DB_MD, "rank %u id %u status %u --> %u\n",
					comp->co_rank, comp->co_id, comp->co_status, status);
				/* Previous status was DOWN, so this is DOWN -> DOWNOUT. */
				if (comp->co_status == PO_COMP_ST_DOWN)
					comp->co_flags = PO_COMPF_DOWN2OUT;

				comp->co_status = status;
				if (status == PO_COMP_ST_DOWN)
					comp->co_fseq = version;
				*updated = true;
				break;
			default:
				/* Any other requested status leaves the domain untouched. */
				break;
			}

			/* The target lives under this child; stop searching siblings. */
			return hit;
		}
	}

	/* Not found through any child domain: scan this domain's own targets. */
	for (i = 0; i < domain->do_target_nr; i++) {
		if (domain->do_targets[i].ta_comp.co_id == id)
			return 1;
	}

	return 0;
}

/**
 * Update the status of the domains that contain a given target.
 *
 * Starts the domain status walk at the root of the pool map tree, which must
 * not be NULL.
 *
 * \param[in]  map	Pool map whose domain tree is updated.
 * \param[in]  tgt_id	Component ID of the target whose parent domains are
 *			considered.
 * \param[in]  status	Status the parent domains should move to.
 * \param[in]  version	Version handed to the walk for the updated domains.
 * \param[out] updated	Set to true if any domain status was changed.
 *
 * \return	0 whether or not the target was found, or the negative value
 *		returned by the walk, if any.
 */
int
update_dom_status_by_tgt_id(struct pool_map *map, uint32_t tgt_id, uint32_t status,
			    uint32_t version, bool *updated)
{
	int	found;

	D_ASSERT(map->po_tree != NULL);
	found = update_dom_status(map->po_tree, tgt_id, status, version, updated);

	/* A positive "found" indication is not an error; report success. */
	return found < 0 ? found : 0;
}

/**
* Destroy a pool map.
*/
Expand Down Expand Up @@ -2071,68 +2206,6 @@ pool_map_find_target_by_rank_idx(struct pool_map *map, uint32_t rank,
return 1;
}

/**
 * Move a target, and the domains above it, to UPIN.
 *
 * Child domains are searched recursively first. If the target is activated
 * somewhere below a child, \a domain is marked UPIN as well and the search
 * stops. Otherwise the targets attached to \a domain are scanned: a target
 * whose ID matches and whose status is NEW, UP or DRAIN is set to UPIN
 * together with \a domain.
 *
 * \param[in] domain	Domain whose subtree is searched.
 * \param[in] id	Component ID of the target to activate.
 *
 * \return	non-zero if a matching target was activated, 0 otherwise.
 */
static int
activate_new_target(struct pool_domain *domain, uint32_t id)
{
	int	idx;

	D_ASSERT(domain->do_targets != NULL);

	/* Try the child domains first, if this domain has any. */
	for (idx = 0; domain->do_children != NULL && idx < domain->do_child_nr; idx++) {
		int	hit = activate_new_target(&domain->do_children[idx], id);

		if (hit == 0)
			continue;

		/* Activated below this child: this domain becomes UPIN too. */
		domain->do_comp.co_status = PO_COMP_ST_UPIN;
		return hit;
	}

	/* Nothing activated via children; look at this domain's own targets. */
	for (idx = 0; idx < domain->do_target_nr; idx++) {
		struct pool_component	*tgt = &domain->do_targets[idx].ta_comp;

		if (tgt->co_id != id)
			continue;

		/* Only NEW, UP and DRAIN targets may be activated. */
		if (tgt->co_status != PO_COMP_ST_NEW &&
		    tgt->co_status != PO_COMP_ST_UP &&
		    tgt->co_status != PO_COMP_ST_DRAIN)
			continue;

		tgt->co_status = PO_COMP_ST_UPIN;
		domain->do_comp.co_status = PO_COMP_ST_UPIN;
		return 1;
	}

	return 0;
}

/**
 * Activate (move to UPIN) a NEW, UP or DRAIN target and all of its parent
 * domains.
 *
 * \param map	[IN]	The pool map to search
 * \param id	[IN]	Target ID to search
 *
 * \return		0 if the map has no domain tree, or the target was not
 *			found or was not in one of the states above
 *			1 if target was found and activated
 */
int
pool_map_activate_new_target(struct pool_map *map, uint32_t id)
{
	/* A map without a domain tree has nothing to activate. */
	if (map->po_tree == NULL)
		return 0;

	return activate_new_target(map->po_tree, id);
}


/**
* Check if all targets under one node matching the status.
* \params [IN] dom node domain to be checked.
Expand Down
4 changes: 2 additions & 2 deletions src/container/container_iv.c
Original file line number Diff line number Diff line change
Expand Up @@ -195,8 +195,8 @@ cont_iv_ent_copy(struct ds_iv_entry *entry, struct cont_iv_key *key,
dst->iv_agg_eph.eph = src->iv_agg_eph.eph;
break;
default:
D_ERROR("bad iv_class_id %d: "DF_RC"\n", entry->iv_class->iv_class_id,
DP_RC(-DER_INVAL));
rc = -DER_INVAL;
DL_ERROR(rc, "bad iv_class_id %d: ", entry->iv_class->iv_class_id);
return -DER_INVAL;
};

Expand Down
2 changes: 2 additions & 0 deletions src/control/lib/daos/pool_cont_prop.go
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,8 @@ const (
PoolSelfHealingAutoExclude = C.DAOS_SELF_HEAL_AUTO_EXCLUDE
// PoolSelfHealingAutoRebuild sets the self-healing strategy to auto-rebuild.
PoolSelfHealingAutoRebuild = C.DAOS_SELF_HEAL_AUTO_REBUILD
// PoolSelfHealingDelayRebuild sets the self-healing strategy to delay-rebuild.
PoolSelfHealingDelayRebuild = C.DAOS_SELF_HEAL_DELAY_REBUILD
)

const (
Expand Down
15 changes: 11 additions & 4 deletions src/control/lib/daos/pool_property.go
Original file line number Diff line number Diff line change
Expand Up @@ -56,18 +56,25 @@ func PoolProperties() PoolPropertyMap {
return "exclude"
case PoolSelfHealingAutoRebuild:
return "rebuild"
case PoolSelfHealingDelayRebuild:
return "delay_rebuild"
case PoolSelfHealingAutoExclude | PoolSelfHealingAutoRebuild:
return "exclude,rebuild"
case PoolSelfHealingAutoExclude | PoolSelfHealingDelayRebuild:
return "exclude,delay_rebuild"
default:
return "unknown"
}
},
},
values: map[string]uint64{
"exclude": PoolSelfHealingAutoExclude,
"rebuild": PoolSelfHealingAutoRebuild,
"exclude,rebuild": PoolSelfHealingAutoExclude | PoolSelfHealingAutoRebuild,
"rebuild,exclude": PoolSelfHealingAutoExclude | PoolSelfHealingAutoRebuild,
"exclude": PoolSelfHealingAutoExclude,
"rebuild": PoolSelfHealingAutoRebuild,
"delay_rebuild": PoolSelfHealingDelayRebuild,
"exclude,rebuild": PoolSelfHealingAutoExclude | PoolSelfHealingAutoRebuild,
"rebuild,exclude": PoolSelfHealingAutoExclude | PoolSelfHealingAutoRebuild,
"delay_rebuild,exclude": PoolSelfHealingAutoExclude | PoolSelfHealingDelayRebuild,
"exclude,delay_rebuild": PoolSelfHealingAutoExclude | PoolSelfHealingDelayRebuild,
},
},
"space_rb": {
Expand Down
12 changes: 12 additions & 0 deletions src/control/lib/daos/pool_property_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,12 @@ func TestControl_PoolProperties(t *testing.T) {
expStr: "self_heal:rebuild",
expJson: []byte(`{"name":"self_heal","description":"Self-healing policy","value":"rebuild"}`),
},
"self_heal-delay_rebuild": {
name: "self_heal",
value: "delay_rebuild",
expStr: "self_heal:delay_rebuild",
expJson: []byte(`{"name":"self_heal","description":"Self-healing policy","value":"delay_rebuild"}`),
},
"self_heal-exclude,rebuild": {
name: "self_heal",
value: "exclude,rebuild",
Expand All @@ -180,6 +186,12 @@ func TestControl_PoolProperties(t *testing.T) {
expStr: "self_heal:exclude,rebuild",
expJson: []byte(`{"name":"self_heal","description":"Self-healing policy","value":"exclude,rebuild"}`),
},
"self_heal-exclude,delay_rebuild": {
name: "self_heal",
value: "exclude,delay_rebuild",
expStr: "self_heal:exclude,delay_rebuild",
expJson: []byte(`{"name":"self_heal","description":"Self-healing policy","value":"exclude,delay_rebuild"}`),
},
"self_heal-invalid": {
name: "self_heal",
value: "wat",
Expand Down
2 changes: 1 addition & 1 deletion src/control/server/instance_storage_test.go
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
//
// (C) Copyright 2020-2022 Intel Corporation.
// (C) Copyright 2020-2023 Intel Corporation.
//
// SPDX-License-Identifier: BSD-2-Clause-Patent
//
Expand Down
2 changes: 1 addition & 1 deletion src/control/server/instance_superblock.go
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
//
// (C) Copyright 2019-2022 Intel Corporation.
// (C) Copyright 2019-2023 Intel Corporation.
//
// SPDX-License-Identifier: BSD-2-Clause-Patent
//
Expand Down
2 changes: 1 addition & 1 deletion src/control/server/instance_superblock_test.go
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
//
// (C) Copyright 2020-2022 Intel Corporation.
// (C) Copyright 2020-2023 Intel Corporation.
//
// SPDX-License-Identifier: BSD-2-Clause-Patent
//
Expand Down
2 changes: 1 addition & 1 deletion src/include/daos/btree.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/**
* (C) Copyright 2016-2022 Intel Corporation.
* (C) Copyright 2016-2023 Intel Corporation.
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand Down
Loading

0 comments on commit 61e1334

Please sign in to comment.