diff --git a/src/container/container_iv.c b/src/container/container_iv.c index 58703176ed5..9090a25af50 100644 --- a/src/container/container_iv.c +++ b/src/container/container_iv.c @@ -1,5 +1,5 @@ /** - * (C) Copyright 2019-2023 Intel Corporation. + * (C) Copyright 2019-2024 Intel Corporation. * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -1018,6 +1018,7 @@ cont_iv_hdl_fetch(uuid_t cont_hdl_uuid, uuid_t pool_uuid, D_DEBUG(DB_TRACE, "Can not find "DF_UUID" hdl\n", DP_UUID(cont_hdl_uuid)); +invalidate_retry: /* Fetch the capability from the leader. To avoid extra locks, * all metadatas are maintained by xstream 0, so let's create * an ULT on xstream 0 to let xstream 0 to handle capa fetch @@ -1046,6 +1047,19 @@ cont_iv_hdl_fetch(uuid_t cont_hdl_uuid, uuid_t pool_uuid, if (*cont_hdl == NULL) { D_DEBUG(DB_TRACE, "Can not find "DF_UUID" hdl\n", DP_UUID(cont_hdl_uuid)); + /* In the reintegration case the IC_CONT_CAPA cache may be valid locally + * while the cont open handle is invalid (not in dt_cont_hdl_hash). In that + * case invalidate the local IV cache first and retry, to avoid failing + * in-flight UPDATEs. (While the IV is locally valid, the IV fetch will not + * trigger the cont_iv_ent_update() callback.) + */ + if (!invalidate_current) { + invalidate_current = true; + ABT_eventual_free(&eventual); + D_DEBUG(DB_TRACE, DF_UUID" invalidate_current and retry\n", + DP_UUID(cont_hdl_uuid)); + goto invalidate_retry; + } D_GOTO(out_eventual, rc = -DER_NONEXIST); } diff --git a/src/include/daos_srv/rebuild.h b/src/include/daos_srv/rebuild.h index 8a856c70bb1..3e88fca35ab 100644 --- a/src/include/daos_srv/rebuild.h +++ b/src/include/daos_srv/rebuild.h @@ -1,5 +1,5 @@ /** - * (C) Copyright 2017-2023 Intel Corporation. + * (C) Copyright 2017-2024 Intel Corporation. * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -35,10 +35,32 @@ typedef enum { (rb_op) == RB_OP_NONE ? 
"None" : \ "Unknown") +/* Common rebuild identifying information for INFO/DEBUG logging: + * rb=POOL_UUID/REBUILD_VER/REBUILD_GEN/OP_STRING + */ +#define DF_RB "rb=" DF_UUID "/%u/%u/%s" + +/* Full rebuild identifying information, also includes ld=LEADER_RANK/LEADER_TERM. + * Instead of this, use DF_RB most of the time (use this for leader change scenarios, etc.) + */ +#define DF_RBF DF_RB " ld=%u/" DF_U64 + +/* arguments for log rebuild identifier given a struct rebuild_global_pool_tracker *rgt */ +#define DP_RB_RGT(rgt) \ + DP_UUID((rgt)->rgt_pool_uuid), (rgt)->rgt_rebuild_ver, (rgt)->rgt_rebuild_gen, \ + RB_OP_STR((rgt)->rgt_opc) + +/* arguments for log rebuild identifier given a struct rebuild_tgt_pool_tracker *rpt */ +#define DP_RB_RPT(rpt) \ + DP_UUID((rpt)->rt_pool_uuid), (rpt)->rt_rebuild_ver, (rpt)->rt_rebuild_gen, \ + RB_OP_STR((rpt)->rt_rebuild_op) +#define DP_RBF_RPT(rpt) DP_RB_RPT(rpt), (rpt)->rt_leader_rank, (rpt)->rt_leader_term + int ds_rebuild_schedule(struct ds_pool *pool, uint32_t map_ver, daos_epoch_t stable_eph, uint32_t layout_version, struct pool_target_id_list *tgts, daos_rebuild_opc_t rebuild_op, uint64_t delay_sec); +void ds_rebuild_restart_if_rank_wip(uuid_t pool_uuid, d_rank_t rank); int ds_rebuild_query(uuid_t pool_uuid, struct daos_rebuild_status *status); void ds_rebuild_running_query(uuid_t pool_uuid, uint32_t opc, uint32_t *rebuild_ver, diff --git a/src/object/cli_obj.c b/src/object/cli_obj.c index 46161bf7a2d..75d661d0665 100644 --- a/src/object/cli_obj.c +++ b/src/object/cli_obj.c @@ -6319,7 +6319,9 @@ obj_ec_get_parity_or_alldata_shard(struct obj_auxi_args *obj_auxi, unsigned int shard_idx = grp_start + i; if (obj_shard_is_invalid(obj, shard_idx, DAOS_OBJ_RPC_ENUMERATE)) { if (++fail_cnt > obj_ec_parity_tgt_nr(oca)) { - D_ERROR(DF_OID" reach max failure "DF_RC"\n", + D_ERROR(DF_CONT", obj "DF_OID" reach max failure "DF_RC"\n", + DP_CONT(obj->cob_pool->dp_pool, + obj->cob_co->dc_uuid), DP_OID(obj->cob_md.omd_id), DP_RC(-DER_DATA_LOSS)); D_GOTO(out, shard = -DER_DATA_LOSS); } @@ -6466,7 +6468,8 @@ 
obj_list_shards_get(struct obj_auxi_args *obj_auxi, unsigned int map_ver, } if (rc < 0) { - D_ERROR(DF_OID" Can not find shard grp %d: "DF_RC"\n", + D_ERROR(DF_CONT", obj "DF_OID" Can not find shard grp %d: "DF_RC"\n", + DP_CONT(obj->cob_pool->dp_pool, obj->cob_co->dc_uuid), DP_OID(obj->cob_md.omd_id), grp_idx, DP_RC(rc)); D_GOTO(out, rc); } diff --git a/src/pool/srv_pool.c b/src/pool/srv_pool.c index 6e3a01379fa..7fa0f33ea5f 100644 --- a/src/pool/srv_pool.c +++ b/src/pool/srv_pool.c @@ -1458,6 +1458,43 @@ resume_event_handling(struct pool_svc *svc) ABT_mutex_unlock(events->pse_mutex); } +/* + * Restart the rebuild if the rank is UPIN in the pool map and is still rebuilding. + * + * This function is only used when the PS leader gets a CRT_EVT_ALIVE event for engine \a rank. + * If that rank is UPIN in the pool map with an unfinished rebuild, it should be the massive + * failure case - + * 1. Some engines went down and triggered a rebuild. + * 2. The engine \a rank participated in the rebuild and, before finishing, went down again; + * the #failures exceeds the pool RF, so the pool map is not changed. + * 3. That engine was restarted by the administrator. + * + * In that case the rebuild task on engine \a rank should be recovered; for simplicity, just + * abort and retry the global rebuild task for now. 
+ */ +static void +pool_restart_rebuild_if_rank_wip(struct ds_pool *pool, d_rank_t rank) +{ + struct pool_domain *dom; + + dom = pool_map_find_dom_by_rank(pool->sp_map, rank); + if (dom == NULL) { + D_DEBUG(DB_MD, DF_UUID": rank %d non-exist on pool map.\n", + DP_UUID(pool->sp_uuid), rank); + return; + } + + if (dom->do_comp.co_status != PO_COMP_ST_UPIN) { + D_INFO(DF_UUID": rank %d status %d in pool map, got CRT_EVT_ALIVE.\n", + DP_UUID(pool->sp_uuid), rank, dom->do_comp.co_status); + return; + } + + ds_rebuild_restart_if_rank_wip(pool->sp_uuid, rank); + + return; +} + static int pool_svc_exclude_ranks(struct pool_svc *svc, struct pool_svc_event_set *event_set); static int @@ -1487,8 +1524,13 @@ handle_event(struct pool_svc *svc, struct pool_svc_event_set *event_set) for (i = 0; i < event_set->pss_len; i++) { struct pool_svc_event *event = &event_set->pss_buf[i]; - if (event->psv_src != CRT_EVS_SWIM || event->psv_type != CRT_EVT_ALIVE) + if (event->psv_type != CRT_EVT_ALIVE) continue; + + D_DEBUG(DB_MD, DF_UUID ": got CRT_EVT_ALIVE event, psv_src %d, psv_rank %d\n", + DP_UUID(svc->ps_uuid), event->psv_src, event->psv_rank); + pool_restart_rebuild_if_rank_wip(svc->ps_pool, event->psv_rank); + if (ds_pool_map_rank_up(svc->ps_pool->sp_map, event->psv_rank)) { /* * The rank is up in the pool map. Request a pool map diff --git a/src/rebuild/scan.c b/src/rebuild/scan.c index 4b9a9108fe1..423e9010d01 100644 --- a/src/rebuild/scan.c +++ b/src/rebuild/scan.c @@ -1169,6 +1169,24 @@ rebuild_tgt_scan_handler(crt_rpc_t *rpc) "rsi_rebuild_ver %d != rt_rebuild_ver %d\n", rsi->rsi_rebuild_ver, rpt->rt_rebuild_ver); + /* The same PS leader requests a rebuild with a higher rsi_rebuild_gen. + * This is the massive failure case, see pool_restart_rebuild_if_rank_wip(). 
+ */ + if (rpt->rt_leader_rank == rsi->rsi_master_rank && + rpt->rt_leader_term == rsi->rsi_leader_term && + rpt->rt_rebuild_gen < rsi->rsi_rebuild_gen) { + /* rebuild_leader_status_notify(LAZY rebuild_iv_update) + * will set rpt->rt_global_done to abort rpt; + * set rt_abort here as well, just to be safe. + */ + rpt->rt_abort = 1; + D_INFO(DF_RBF ", start new rebuild, gen %d -> %d.\n", + DP_RBF_RPT(rpt), rpt->rt_rebuild_gen, rsi->rsi_rebuild_gen); + rpt_put(rpt); + rpt = NULL; + goto tls_lookup; + } + D_DEBUG(DB_REBUILD, DF_UUID" already started, req "DF_U64" master %u/"DF_U64"\n", DP_UUID(rsi->rsi_pool_uuid), rsi->rsi_leader_term, rsi->rsi_master_rank, rpt->rt_leader_term); @@ -1203,6 +1221,7 @@ rebuild_tgt_scan_handler(crt_rpc_t *rpc) rpt = NULL; } +tls_lookup: tls = rebuild_pool_tls_lookup(rsi->rsi_pool_uuid, rsi->rsi_rebuild_ver, rsi->rsi_rebuild_gen); if (tls != NULL) { diff --git a/src/rebuild/srv.c b/src/rebuild/srv.c index 5c463af9e90..8f0696a22f0 100644 --- a/src/rebuild/srv.c +++ b/src/rebuild/srv.c @@ -490,6 +490,45 @@ ds_rebuild_running_query(uuid_t pool_uuid, uint32_t opc, uint32_t *upper_ver, rpt_put(rpt); } +/* + * Restart the rebuild if \a rank's rebuild has not finished. + * Only used for the massive failure recovery case, see pool_restart_rebuild_if_rank_wip(). 
+ */ +void +ds_rebuild_restart_if_rank_wip(uuid_t pool_uuid, d_rank_t rank) +{ + struct rebuild_global_pool_tracker *rgt; + int i; + + rgt = rebuild_global_pool_tracker_lookup(pool_uuid, -1, -1); + if (rgt == NULL) + return; + + if (rgt->rgt_status.rs_state != DRS_IN_PROGRESS) { + rgt_put(rgt); + return; + } + + for (i = 0; i < rgt->rgt_servers_number; i++) { + if (rgt->rgt_servers[i].rank == rank) { + if (!rgt->rgt_servers[i].pull_done) { + rgt->rgt_status.rs_errno = -DER_STALE; + rgt->rgt_abort = 1; + rgt->rgt_status.rs_fail_rank = rank; + D_INFO(DF_RB ": abort rebuild because rank %d WIP\n", + DP_RB_RGT(rgt), rank); + } + rgt_put(rgt); + return; + } + } + + D_INFO(DF_RB ": rank %d not in rgt_servers, rgt_servers_number %d\n", + DP_RB_RGT(rgt), rank, rgt->rgt_servers_number); + rgt_put(rgt); + return; +} + /* TODO: Add something about what the current operation is for output status */ int ds_rebuild_query(uuid_t pool_uuid, struct daos_rebuild_status *status)