diff --git a/src/include/daos/pool_map.h b/src/include/daos/pool_map.h index 3c961b4369c..4f1add7471b 100644 --- a/src/include/daos/pool_map.h +++ b/src/include/daos/pool_map.h @@ -265,9 +265,19 @@ pool_component_unavail(struct pool_component *comp, bool for_reint) { uint8_t status = comp->co_status; - return (status == PO_COMP_ST_DOWN) || - (status == PO_COMP_ST_DOWNOUT) || - (status == PO_COMP_ST_UP && !(for_reint)); + /* If it's down or down-out it is definitely unavailable */ + if ((status == PO_COMP_ST_DOWN) || (status == PO_COMP_ST_DOWNOUT)) + return true; + + /* + * The component is unavailable if it's currently being reintegrated. + * However, when calculating the data movement for reintegration, + * we treat these nodes as being available for the placement map. + */ + if ((status == PO_COMP_ST_UP) && (for_reint == false)) + return true; + + return false; } static inline bool diff --git a/src/object/cli_obj.c b/src/object/cli_obj.c index 01eb5d299ae..9070789a406 100644 --- a/src/object/cli_obj.c +++ b/src/object/cli_obj.c @@ -139,9 +139,6 @@ obj_shard_open(struct dc_object *obj, unsigned int shard, unsigned int map_ver, D_GOTO(unlock, rc = -DER_NONEXIST); } - /* XXX could be otherwise for some object classes? 
*/ - D_ASSERT(obj_shard->do_shard == shard); - D_DEBUG(DB_IO, "Open object shard %d\n", shard); if (obj_shard->do_obj == NULL) { @@ -345,7 +342,7 @@ obj_layout_create(struct dc_object *obj, bool refresh) struct dc_obj_shard *obj_shard; obj_shard = &obj->cob_shards->do_shards[i]; - obj_shard->do_shard = i; + obj_shard->do_shard = layout->ol_shards[i].po_shard; obj_shard->do_target_id = layout->ol_shards[i].po_target; obj_shard->do_fseq = layout->ol_shards[i].po_fseq; obj_shard->do_rebuilding = layout->ol_shards[i].po_rebuilding; diff --git a/src/placement/jump_map.c b/src/placement/jump_map.c index 5ca0dff6dd8..db41a75bed0 100644 --- a/src/placement/jump_map.c +++ b/src/placement/jump_map.c @@ -31,6 +31,27 @@ #include #include + +/* + * These ops determine whether extra information is calculated during + * placement. + * + * PL_PLACE_EXTENDED calculates an extended layout for use when there + * is a reintegration operation currently ongoing. + * + * PL_REINT calculates the post-reintegration layout for use during + * reintegration, it treats the UP status targets as UP_IN. + * + * Currently the other OP types calculate a normal layout without extra info. + */ +enum PL_OP_TYPE { + PL_PLACE, + PL_PLACE_EXTENDED, + PL_REBUILD, + PL_REINT, + PL_ADD, +}; + /** * Contains information related to object layout size. 
*/ @@ -374,8 +395,9 @@ count_available_spares(struct pl_jump_map *jmap, struct pl_obj_layout *layout, static int obj_remap_shards(struct pl_jump_map *jmap, struct daos_obj_md *md, struct pl_obj_layout *layout, struct jm_obj_placement *jmop, - d_list_t *remap_list, bool for_reint, uint8_t *tgts_used, - uint8_t *dom_used, uint32_t failed_in_layout) + d_list_t *remap_list, enum PL_OP_TYPE op_type, + uint8_t *tgts_used, uint8_t *dom_used, + uint32_t failed_in_layout, d_list_t *extend_list) { struct failed_shard *f_shard; struct pl_obj_shard *l_shard; @@ -384,6 +406,7 @@ obj_remap_shards(struct pl_jump_map *jmap, struct daos_obj_md *md, d_list_t *current; daos_obj_id_t oid; bool spare_avail = true; + bool for_reint; uint64_t key; uint32_t spares_left; int rc; @@ -391,6 +414,7 @@ obj_remap_shards(struct pl_jump_map *jmap, struct daos_obj_md *md, remap_dump(remap_list, md, "before remap:"); + for_reint = (op_type == PL_REINT); current = remap_list->next; spare_tgt = NULL; oid = md->omd_id; @@ -405,10 +429,8 @@ obj_remap_shards(struct pl_jump_map *jmap, struct daos_obj_md *md, uint64_t rebuild_key; uint32_t shard_id; - f_shard = d_list_entry(current, struct failed_shard, - fs_list); + f_shard = d_list_entry(current, struct failed_shard, fs_list); shard_id = f_shard->fs_shard_idx; - l_shard = &layout->ol_shards[f_shard->fs_shard_idx]; spare_avail = jump_map_remap_next_spare(jmap, jmop, @@ -422,11 +444,23 @@ obj_remap_shards(struct pl_jump_map *jmap, struct daos_obj_md *md, spares_left--; } - determine_valid_spares(spare_tgt, md, spare_avail, &current, - remap_list, for_reint, f_shard, l_shard); - + if (op_type == PL_PLACE_EXTENDED && spare_avail && + spare_tgt->ta_comp.co_status == PO_COMP_ST_UP) { + rc = remap_alloc_one(extend_list, shard_id, spare_tgt, + true); + if (rc) + return rc; + } + determine_valid_spares(spare_tgt, md, spare_avail, + &current, remap_list, for_reint, f_shard, + l_shard); } + if (op_type == PL_PLACE_EXTENDED) { + rc = pl_map_extend(layout, extend_list); + if 
(rc != 0) + return rc; + } remap_dump(remap_list, md, "after remap:"); return 0; } @@ -506,16 +540,18 @@ jump_map_obj_spec_place_get(struct pl_jump_map *jmap, daos_obj_id_t oid, static int get_object_layout(struct pl_jump_map *jmap, struct pl_obj_layout *layout, struct jm_obj_placement *jmop, d_list_t *remap_list, - bool for_reint, struct daos_obj_md *md) + enum PL_OP_TYPE op_type, struct daos_obj_md *md) { struct pool_target *target; struct pool_domain *root; daos_obj_id_t oid; + d_list_t extend_list; uint8_t *dom_used; uint8_t *tgts_used; uint32_t dom_used_length; uint64_t key; uint32_t fail_tgt_cnt; + bool for_reint; int i, j, k, rc; /* Set the pool map version */ @@ -527,6 +563,7 @@ get_object_layout(struct pl_jump_map *jmap, struct pl_obj_layout *layout, oid = md->omd_id; key = oid.hi ^ oid.lo; target = NULL; + for_reint = (op_type == PL_REINT); rc = pool_map_find_domain(jmap->jmp_map.pl_poolmap, PO_COMP_TP_ROOT, PO_COMP_ID_ALL, &root); @@ -539,6 +576,7 @@ get_object_layout(struct pl_jump_map *jmap, struct pl_obj_layout *layout, D_ALLOC_ARRAY(dom_used, (dom_used_length / 8) + 1); D_ALLOC_ARRAY(tgts_used, (root->do_target_nr / 8) + 1); + D_INIT_LIST_HEAD(&extend_list); if (dom_used == NULL || tgts_used == NULL) D_GOTO(out, rc = -DER_NOMEM); @@ -572,6 +610,13 @@ get_object_layout(struct pl_jump_map *jmap, struct pl_obj_layout *layout, rc = remap_alloc_one(remap_list, 0, target, false); if (rc) D_GOTO(out, rc); + if (op_type == PL_PLACE_EXTENDED && + target->ta_comp.co_status == PO_COMP_ST_UP) { + rc = remap_alloc_one(&extend_list, k, target, + true); + if (rc != 0) + D_GOTO(out, rc); + } } /** skip the first shard because it's been @@ -600,17 +645,30 @@ get_object_layout(struct pl_jump_map *jmap, struct pl_obj_layout *layout, /** If target is failed queue it for remap*/ if (pool_target_unavail(target, for_reint)) { fail_tgt_cnt++; + rc = remap_alloc_one(remap_list, k, target, false); if (rc) D_GOTO(out, rc); + + if (op_type == PL_PLACE_EXTENDED && + 
target->ta_comp.co_status == PO_COMP_ST_UP) { + remap_alloc_one(&extend_list, k, + target, true); + } + } } + j = 0; } - rc = obj_remap_shards(jmap, md, layout, jmop, remap_list, - for_reint, tgts_used, dom_used, fail_tgt_cnt); + rc = 0; + if (fail_tgt_cnt > 0) + rc = obj_remap_shards(jmap, md, layout, jmop, remap_list, + op_type, tgts_used, dom_used, fail_tgt_cnt, + &extend_list); + out: if (rc) { D_ERROR("jump_map_obj_layout_fill failed, rc "DF_RC"\n", @@ -698,8 +756,6 @@ jump_map_create(struct pool_map *poolmap, struct pl_map_init_attr *mia, return rc; } - - static void jump_map_print(struct pl_map *map) { @@ -753,7 +809,8 @@ jump_map_obj_place(struct pl_map *map, struct daos_obj_md *md, /* Get root node of pool map */ D_INIT_LIST_HEAD(&remap_list); - rc = get_object_layout(jmap, layout, &jmop, &remap_list, false, md); + rc = get_object_layout(jmap, layout, &jmop, &remap_list, + PL_PLACE_EXTENDED, md); if (rc < 0) { D_ERROR("Could not generate placement layout, rc "DF_RC"\n", DP_RC(rc)); @@ -841,7 +898,8 @@ jump_map_obj_find_rebuild(struct pl_map *map, struct daos_obj_md *md, } D_INIT_LIST_HEAD(&remap_list); - rc = get_object_layout(jmap, layout, &jmop, &remap_list, false, md); + rc = get_object_layout(jmap, layout, &jmop, &remap_list, PL_REBUILD, + md); if (rc < 0) { D_ERROR("Could not generate placement layout, rc "DF_RC"\n", @@ -916,7 +974,7 @@ jump_map_obj_find_reint(struct pl_map *map, struct daos_obj_md *md, D_INIT_LIST_HEAD(&reint_list); /* Get original placement */ - rc = get_object_layout(jmap, layout, &jop, &remap_list, false, md); + rc = get_object_layout(jmap, layout, &jop, &remap_list, PL_PLACE, md); if (rc) goto out; @@ -925,7 +983,8 @@ jump_map_obj_find_reint(struct pl_map *map, struct daos_obj_md *md, D_INIT_LIST_HEAD(&remap_list); /* Get placement after reintegration. 
*/ - rc = get_object_layout(jmap, reint_layout, &jop, &remap_list, true, md); + rc = get_object_layout(jmap, reint_layout, &jop, &remap_list, PL_REINT, + md); if (rc) goto out; diff --git a/src/placement/pl_map.c b/src/placement/pl_map.c index e90960bb0ef..2de35f1f88b 100644 --- a/src/placement/pl_map.c +++ b/src/placement/pl_map.c @@ -599,18 +599,23 @@ pl_select_leader(daos_obj_id_t oid, uint32_t shard_idx, uint32_t grp_size, start = rdg_idx * replicas; replica_idx = (oid.lo + rdg_idx) % replicas; preferred = start + replica_idx; + for (i = 0, off = preferred, pos = -1; i < replicas; i++, replica_idx = (replica_idx + 1) % replicas, off = start + replica_idx) { shard = pl_get_shard(data, off); - if (shard->po_target == -1 || shard->po_rebuilding) + /* + * shard->po_shard != off is necessary because during + * reintegration we may have an extended layout and we don't + * want the extended target to be the leader. + */ + if (shard->po_target == -1 || shard->po_rebuilding + || shard->po_shard != off) continue; - if (pos == -1 || pl_get_shard(data, pos)->po_fseq > shard->po_fseq) pos = off; } - if (pos != -1) { D_ASSERT(pl_get_shard(data, pos)->po_shard == pos); diff --git a/src/placement/pl_map.h b/src/placement/pl_map.h index cfb687c7f54..94355ef55b1 100644 --- a/src/placement/pl_map.h +++ b/src/placement/pl_map.h @@ -98,6 +98,9 @@ int remap_alloc_one(d_list_t *remap_list, unsigned int shard_idx, struct pool_target *tgt, bool for_reint); +int +remap_insert_copy_one(d_list_t *remap_list, struct failed_shard *original); + void remap_list_free_all(d_list_t *remap_list); @@ -127,4 +130,7 @@ int spec_place_rank_get(unsigned int *pos, daos_obj_id_t oid, struct pool_map *pl_poolmap); +int +pl_map_extend(struct pl_obj_layout *layout, d_list_t *extended_list); + #endif /* __PL_MAP_H__ */ diff --git a/src/placement/pl_map_common.c b/src/placement/pl_map_common.c index 9188f195158..589b6cb94d4 100644 --- a/src/placement/pl_map_common.c +++ b/src/placement/pl_map_common.c @@ 
-93,7 +93,7 @@ remap_alloc_one(d_list_t *remap_list, unsigned int shard_idx, remap_add_one(remap_list, f_new); } else { f_new->fs_tgt_id = tgt->ta_comp.co_id; - d_list_add(&f_new->fs_list, remap_list); + d_list_add_tail(&f_new->fs_list, remap_list); } return 0; @@ -399,4 +399,94 @@ determine_valid_spares(struct pool_target *spare_tgt, struct daos_obj_md *md, (*current) = (*current)->next; } +int +pl_map_extend(struct pl_obj_layout *layout, d_list_t *extended_list) +{ + struct pl_obj_shard *new_shards; + struct failed_shard *f_shard; + d_list_t *current; + uint8_t *grp_map; + uint32_t *grp_count; + uint32_t max_fail_grp; + uint32_t new_group_size; + uint32_t grp; + uint32_t grp_idx; + int i, j, k = 0; + int rc = 0; + + grp_map = NULL; + grp_count = NULL; + + /* Empty list, no extension needed */ + if (extended_list == extended_list->next || layout->ol_grp_size == 1) + goto out; + + D_ALLOC_ARRAY(grp_map, (layout->ol_nr / 8) + 1); + D_ALLOC_ARRAY(grp_count, layout->ol_grp_nr); + if (grp_count == NULL || grp_map == NULL) + D_GOTO(out, rc = -DER_NOMEM); + + i = 0; + max_fail_grp = 0; + + current = extended_list->next; + while (current != extended_list) { + f_shard = d_list_entry(current, struct failed_shard, fs_list); + grp = f_shard->fs_shard_idx / layout->ol_grp_size; + + if (isset(grp_map, f_shard->fs_tgt_id) == false) { + setbit(grp_map, f_shard->fs_tgt_id); + grp_count[grp]++; + + if (max_fail_grp < grp_count[grp]) + max_fail_grp = grp_count[grp]; + } else + d_list_del_init(&f_shard->fs_list); + + current = current->next; + } + + + new_group_size = layout->ol_grp_size + max_fail_grp; + D_ALLOC_ARRAY(new_shards, new_group_size * layout->ol_grp_nr); + if (new_shards == NULL) + return -DER_NOMEM; + + while (k < layout->ol_nr) { + for (j = 0; j < layout->ol_grp_size; ++j, ++k, ++i) + new_shards[i] = layout->ol_shards[k]; + for (; j < new_group_size; ++j, ++i) { + new_shards[i].po_shard = -1; + new_shards[i].po_target = -1; + } + } + current = extended_list->next; + 
while (current != extended_list) { + f_shard = d_list_entry(current, struct failed_shard, fs_list); + + grp = f_shard->fs_shard_idx / layout->ol_grp_size; + grp_idx = ((grp + 1) * layout->ol_grp_size) + grp; + grp_count[grp]--; + grp_idx += grp_count[grp]; + + new_shards[grp_idx].po_fseq = f_shard->fs_fseq; + new_shards[grp_idx].po_shard = f_shard->fs_shard_idx; + new_shards[grp_idx].po_target = f_shard->fs_tgt_id; + new_shards[grp_idx].po_rebuilding = 1; + + current = current->next; + } + + layout->ol_grp_size += max_fail_grp; + layout->ol_nr = layout->ol_grp_size * layout->ol_grp_nr; + + D_FREE(layout->ol_shards); + layout->ol_shards = new_shards; + +out: + D_FREE(grp_map); + D_FREE(grp_count); + remap_list_free_all(extended_list); + return rc; +} diff --git a/src/placement/tests/place_obj_common.c b/src/placement/tests/place_obj_common.c index 5d76a3cf475..9412ddcd929 100644 --- a/src/placement/tests/place_obj_common.c +++ b/src/placement/tests/place_obj_common.c @@ -83,30 +83,71 @@ reint_check(struct pl_obj_layout *layout, struct pl_obj_layout *temp_layout, uint32_t *spare_tgt_ranks, uint32_t *shard_ids, int num_reint, uint32_t curr_fail_tgt) { - int i; + int i, j; + int temp_i; + int rebuilding; + int num_reint_found; + uint32_t shard_idx; + uint32_t target; uint32_t original_target; uint32_t reint_target; + struct pl_obj_shard curr_shard; D_ASSERT(num_reint >= 0 && num_reint < 2); + num_reint_found = 0; + temp_i = 0; + i = 0; /* can't rebuild non replicated date */ if (temp_layout->ol_grp_size == 1) { D_ASSERT(num_reint == 0); - if (layout->ol_shards[0].po_target == curr_fail_tgt) - D_ASSERT(temp_layout->ol_shards[0].po_target == -1); + for (i = 0; i < layout->ol_nr; ++i) { + original_target = layout->ol_shards[i].po_target; + reint_target = temp_layout->ol_shards[i].po_target; + + if (original_target == curr_fail_tgt) + D_ASSERT(reint_target == -1); + } return; } - for (i = 0; i < temp_layout->ol_nr; ++i) { - original_target = layout->ol_shards[i].po_target; 
- reint_target = temp_layout->ol_shards[i].po_target; + if (layout->ol_nr != temp_layout->ol_nr) + D_ASSERT(num_reint > 0); + i = 0; + while (i < layout->ol_nr) { + + for (j = 0; j < layout->ol_grp_size; ++j) { + original_target = layout->ol_shards[i].po_target; + reint_target = temp_layout->ol_shards[temp_i].po_target; + + if (original_target == curr_fail_tgt) { + D_ASSERT(num_reint == 1); + D_ASSERT(original_target == spare_tgt_ranks[0]); + D_ASSERT(reint_target != original_target); + } + + i++; + temp_i++; + } - if (original_target == curr_fail_tgt) { - D_ASSERT(num_reint == 1); - D_ASSERT(original_target == spare_tgt_ranks[0]); - D_ASSERT(reint_target != original_target); + while (temp_i < temp_layout->ol_grp_size) { + curr_shard = temp_layout->ol_shards[temp_i]; + shard_idx = curr_shard.po_shard; + target = curr_shard.po_target; + rebuilding = curr_shard.po_rebuilding; + + if (shard_idx != -1) { + D_ASSERT(shard_idx == shard_ids[0]); + D_ASSERT(target == spare_tgt_ranks[0]); + D_ASSERT(rebuilding == 1); + D_ASSERT(num_reint_found < num_reint); + } + + num_reint_found++; + temp_i++; } } + D_ASSERT(num_reint_found == num_reint); } void diff --git a/src/tests/suite/daos_obj.c b/src/tests/suite/daos_obj.c index b3efcafa21b..7fc3b11e362 100644 --- a/src/tests/suite/daos_obj.c +++ b/src/tests/suite/daos_obj.c @@ -3148,6 +3148,9 @@ fetch_replica_unavail(void **state) daos_add_server(arg->pool.pool_uuid, arg->group, &arg->pool.svc, rank); + /* wait until reintegration is done */ + test_rebuild_wait(&arg, 1); + assert_int_equal(rc, 0); } D_FREE(buf);