diff --git a/src/include/daos_srv/rebuild.h b/src/include/daos_srv/rebuild.h index 30ad5d8a18f..5deed3f3a6a 100644 --- a/src/include/daos_srv/rebuild.h +++ b/src/include/daos_srv/rebuild.h @@ -70,6 +70,7 @@ int ds_rebuild_schedule(struct ds_pool *pool, uint32_t map_ver, daos_epoch_t stable_eph, uint32_t layout_version, struct pool_target_id_list *tgts, daos_rebuild_opc_t rebuild_op, uint64_t delay_sec); +void ds_rebuild_restart_if_rank_wip(uuid_t pool_uuid, d_rank_t rank); int ds_rebuild_query(uuid_t pool_uuid, struct daos_rebuild_status *status); void ds_rebuild_running_query(uuid_t pool_uuid, uint32_t opc, uint32_t *rebuild_ver, diff --git a/src/pool/srv_pool.c b/src/pool/srv_pool.c index 6e3a01379fa..7fa0f33ea5f 100644 --- a/src/pool/srv_pool.c +++ b/src/pool/srv_pool.c @@ -1458,6 +1458,43 @@ resume_event_handling(struct pool_svc *svc) ABT_mutex_unlock(events->pse_mutex); } +/* + * Restart rebuild if the rank is UPIN in the pool map and its rebuild is still in progress. + * + * This function is only used when the PS leader gets a CRT_EVT_ALIVE event for engine \a rank. + * If that rank is UPIN in the pool map with an unfinished rebuild, it should be the massive + * failure case: + * 1. Some engines went down and triggered rebuild. + * 2. The engine \a rank participated in the rebuild and went down again before finishing; + * the #failures exceeds the pool RF, so the pool map will not be changed. + * 3. That engine was restarted by the administrator. + * + * In that case the rebuild task on engine \a rank should be recovered. To simplify, for now + * just abort and retry the global rebuild task. 
+ */ +static void +pool_restart_rebuild_if_rank_wip(struct ds_pool *pool, d_rank_t rank) +{ + struct pool_domain *dom; + + dom = pool_map_find_dom_by_rank(pool->sp_map, rank); + if (dom == NULL) { + D_DEBUG(DB_MD, DF_UUID": rank %d non-exist on pool map.\n", + DP_UUID(pool->sp_uuid), rank); + return; + } + + if (dom->do_comp.co_status != PO_COMP_ST_UPIN) { + D_INFO(DF_UUID": rank %d status %d in pool map, got CRT_EVT_ALIVE.\n", + DP_UUID(pool->sp_uuid), rank, dom->do_comp.co_status); + return; + } + + ds_rebuild_restart_if_rank_wip(pool->sp_uuid, rank); + + return; +} + static int pool_svc_exclude_ranks(struct pool_svc *svc, struct pool_svc_event_set *event_set); static int @@ -1487,8 +1524,13 @@ handle_event(struct pool_svc *svc, struct pool_svc_event_set *event_set) for (i = 0; i < event_set->pss_len; i++) { struct pool_svc_event *event = &event_set->pss_buf[i]; - if (event->psv_src != CRT_EVS_SWIM || event->psv_type != CRT_EVT_ALIVE) + if (event->psv_type != CRT_EVT_ALIVE) continue; + + D_DEBUG(DB_MD, DF_UUID ": got CRT_EVT_ALIVE event, psv_src %d, psv_rank %d\n", + DP_UUID(svc->ps_uuid), event->psv_src, event->psv_rank); + pool_restart_rebuild_if_rank_wip(svc->ps_pool, event->psv_rank); + if (ds_pool_map_rank_up(svc->ps_pool->sp_map, event->psv_rank)) { /* * The rank is up in the pool map. Request a pool map diff --git a/src/rebuild/scan.c b/src/rebuild/scan.c index 4b9a9108fe1..09472e78d97 100644 --- a/src/rebuild/scan.c +++ b/src/rebuild/scan.c @@ -1169,9 +1169,26 @@ rebuild_tgt_scan_handler(crt_rpc_t *rpc) "rsi_rebuild_ver %d != rt_rebuild_ver %d\n", rsi->rsi_rebuild_ver, rpt->rt_rebuild_ver); - D_DEBUG(DB_REBUILD, DF_UUID" already started, req "DF_U64" master %u/"DF_U64"\n", - DP_UUID(rsi->rsi_pool_uuid), rsi->rsi_leader_term, rsi->rsi_master_rank, - rpt->rt_leader_term); + /* The same PS leader requests rebuild with a higher rsi_rebuild_gen. + * This is the massive failure case, see pool_restart_rebuild_if_rank_wip(). 
+ */ + if (rpt->rt_leader_rank == rsi->rsi_master_rank && + rpt->rt_leader_term == rsi->rsi_leader_term && + rpt->rt_rebuild_gen < rsi->rsi_rebuild_gen) { + /* rebuild_leader_status_notify(LAZY rebuild_iv_update) + * will set rpt->rt_global_done to abort rpt; + * set rt_abort here as well just to be safe. + */ + rpt->rt_abort = 1; + D_INFO(DF_RBF ", start new rebuild, gen %d -> %d.\n", + DP_RBF_RPT(rpt), rpt->rt_rebuild_gen, rsi->rsi_rebuild_gen); + rpt_put(rpt); + rpt = NULL; + goto tls_lookup; + } + + D_DEBUG(DB_REBUILD, "already started, existing " DF_RBF ", req " DF_RBF "\n", + DP_RBF_RPT(rpt), DP_RBF_RSI(rsi)); /* Ignore the rebuild trigger request if it comes from * an old or same leader. @@ -1203,6 +1220,7 @@ rebuild_tgt_scan_handler(crt_rpc_t *rpc) rpt = NULL; } +tls_lookup: tls = rebuild_pool_tls_lookup(rsi->rsi_pool_uuid, rsi->rsi_rebuild_ver, rsi->rsi_rebuild_gen); if (tls != NULL) { diff --git a/src/rebuild/srv.c b/src/rebuild/srv.c index 5c463af9e90..8f0696a22f0 100644 --- a/src/rebuild/srv.c +++ b/src/rebuild/srv.c @@ -490,6 +490,45 @@ ds_rebuild_running_query(uuid_t pool_uuid, uint32_t opc, uint32_t *upper_ver, rpt_put(rpt); } +/* + * Restart rebuild if \a rank's rebuild is not finished. + * Only used for the massive failure recovery case, see pool_restart_rebuild_if_rank_wip(). 
+ */ +void +ds_rebuild_restart_if_rank_wip(uuid_t pool_uuid, d_rank_t rank) +{ + struct rebuild_global_pool_tracker *rgt; + int i; + + rgt = rebuild_global_pool_tracker_lookup(pool_uuid, -1, -1); + if (rgt == NULL) + return; + + if (rgt->rgt_status.rs_state != DRS_IN_PROGRESS) { + rgt_put(rgt); + return; + } + + for (i = 0; i < rgt->rgt_servers_number; i++) { + if (rgt->rgt_servers[i].rank == rank) { + if (!rgt->rgt_servers[i].pull_done) { + rgt->rgt_status.rs_errno = -DER_STALE; + rgt->rgt_abort = 1; + rgt->rgt_status.rs_fail_rank = rank; + D_INFO(DF_RB ": abort rebuild because rank %d WIP\n", + DP_RB_RGT(rgt), rank); + } + rgt_put(rgt); + return; + } + } + + D_INFO(DF_RB ": rank %d not in rgt_servers, rgt_servers_number %d\n", + DP_RB_RGT(rgt), rank, rgt->rgt_servers_number); + rgt_put(rgt); + return; +} + /* TODO: Add something about what the current operation is for output status */ int ds_rebuild_query(uuid_t pool_uuid, struct daos_rebuild_status *status)