From d66525641e822450ad4dd63d1f8bab797fe5e65d Mon Sep 17 00:00:00 2001 From: Di Wang Date: Tue, 6 Feb 2024 00:45:59 +0000 Subject: [PATCH 1/3] DAOS-15056 rebuild: add rpt to the rgt list properly Add rpt to the rgt list before yield, in case the rpt is incorrectly created with incoming scan RPC. Features: rebuild Required-githooks: true Signed-off-by: Di Wang --- src/rebuild/scan.c | 8 ++++---- src/rebuild/srv.c | 18 ++++++++++++++---- 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/src/rebuild/scan.c b/src/rebuild/scan.c index 352459a2d84..251fcb79892 100644 --- a/src/rebuild/scan.c +++ b/src/rebuild/scan.c @@ -1117,10 +1117,10 @@ rebuild_tgt_scan_handler(crt_rpc_t *rpc) (rpt->rt_rebuild_op == rsi->rsi_rebuild_op && rpt->rt_rebuild_ver == rsi->rsi_rebuild_ver && rpt->rt_rebuild_gen < rsi->rsi_rebuild_gen))) { - D_INFO(DF_UUID" %s %u/"DF_U64" < incoming rebuild %u/"DF_U64"\n", - DP_UUID(rpt->rt_pool_uuid), RB_OP_STR(rpt->rt_rebuild_op), - rpt->rt_rebuild_ver, rpt->rt_leader_term, rsi->rsi_rebuild_ver, - rsi->rsi_leader_term); + D_INFO(DF_UUID" %p %s %u/"DF_U64"/%u < incoming rebuild %u/"DF_U64"/%u\n", + DP_UUID(rpt->rt_pool_uuid), rpt, RB_OP_STR(rpt->rt_rebuild_op), + rpt->rt_rebuild_ver, rpt->rt_leader_term, rpt->rt_rebuild_gen, + rsi->rsi_rebuild_ver, rsi->rsi_leader_term, rsi->rsi_rebuild_gen); rpt->rt_abort = 1; } } diff --git a/src/rebuild/srv.c b/src/rebuild/srv.c index fa519adabad..7427f536150 100644 --- a/src/rebuild/srv.c +++ b/src/rebuild/srv.c @@ -1730,6 +1730,9 @@ ds_rebuild_abort(uuid_t pool_uuid, unsigned int ver, unsigned int gen, uint64_t (ver == (unsigned int)(-1) || rpt->rt_rebuild_ver == ver) && (gen == (unsigned int)(-1) || rpt->rt_rebuild_gen == gen) && (term == (uint64_t)(-1) || rpt->rt_leader_term == term)) { + D_INFO(DF_UUID" try abort the rpt %p op %s ver %u gen %u\n", + DP_UUID(rpt->rt_pool_uuid), rpt, RB_OP_STR(rpt->rt_rebuild_op), + rpt->rt_rebuild_ver, rpt->rt_rebuild_gen); rpt->rt_abort = 1; aborted = false; } @@ -2467,6 +2470,11 @@ rebuild_tgt_prepare(crt_rpc_t *rpc, struct rebuild_tgt_pool_tracker **p_rpt) rpt->rt_rebuild_op = rsi->rsi_rebuild_op; + /* Let's add the rpt to the tracker list before IV fetch, which might yield, + * to make sure the new coming request can find the rpt in the list. + */ + rpt_get(rpt); + d_list_add(&rpt->rt_list, &rebuild_gst.rg_tgt_tracker_list); rc = ds_pool_iv_srv_hdl_fetch(pool, &rpt->rt_poh_uuid, &rpt->rt_coh_uuid); if (rc) @@ -2506,14 +2514,16 @@ rebuild_tgt_prepare(crt_rpc_t *rpc, struct rebuild_tgt_pool_tracker **p_rpt) rpt->rt_pool = pool; /* pin it */ ABT_mutex_unlock(rpt->rt_lock); - rpt_get(rpt); - d_list_add(&rpt->rt_list, &rebuild_gst.rg_tgt_tracker_list); *p_rpt = rpt; out: if (rc) { - if (rpt) + if (rpt) { rpt_put(rpt); - + if (!d_list_empty(&rpt->rt_list)) { + d_list_del_init(&rpt->rt_list); + rpt_put(rpt); + } + } ds_pool_put(pool); } daos_prop_fini(&prop); From 37d72f8361b6a780b9e783e94c939681c4074f31 Mon Sep 17 00:00:00 2001 From: Di Wang Date: Tue, 20 Feb 2024 04:17:20 +0000 Subject: [PATCH 2/3] DAOS-15056 rebuild: fix memor leak Fix memory leak found by Niu. Features: rebuild Required-githooks: true Signed-off-by: Di Wang --- src/rebuild/srv.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/src/rebuild/srv.c b/src/rebuild/srv.c index 7427f536150..4d2c0d4c8b0 100644 --- a/src/rebuild/srv.c +++ b/src/rebuild/srv.c @@ -1,5 +1,5 @@ /** - * (C) Copyright 2016-2023 Intel Corporation. + * (C) Copyright 2016-2024 Intel Corporation. * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -1069,6 +1069,17 @@ rpt_put(struct rebuild_tgt_pool_tracker *rpt) if (rpt->rt_refcount == 1 && rpt->rt_finishing) ABT_cond_signal(rpt->rt_fini_cond); ABT_mutex_unlock(rpt->rt_lock); + if (rpt->rt_refcount == 0) { + /* If rebuild tracker ULT is started successfully, then + * the rpt will be destroyed in rebuild_tgt_fini(). + * Otherwise the rpt might be destroyed if the tracking + * ULT is not started. Since it will happen in system + * XS, no need lock here. + */ + D_ASSERT(dss_get_module_info()->dmi_xs_id == 0); + d_list_del_init(&rpt->rt_list); + rpt_destroy(rpt); + } } static void @@ -2160,16 +2171,10 @@ rebuild_tgt_fini(struct rebuild_tgt_pool_tracker *rpt) DP_UUID(rpt->rt_pool_uuid), DP_RC(rc)); /* destroy the migrate_tls of 0-xstream */ ds_migrate_stop(rpt->rt_pool, rpt->rt_rebuild_ver, rpt->rt_rebuild_gen); - d_list_del_init(&rpt->rt_list); - rpt_put(rpt); - /* No one should access rpt after rebuild_fini_one. - */ - D_ASSERT(rpt->rt_refcount == 0); - + /* No one should access rpt after rebuild_fini_one. */ D_INFO("Finalized rebuild for "DF_UUID", map_ver=%u.\n", DP_UUID(rpt->rt_pool_uuid), rpt->rt_rebuild_ver); - - rpt_destroy(rpt); + rpt_put(rpt); } void From 52bf8623fd5b0f1223ef2b413e6a39ac72555a45 Mon Sep 17 00:00:00 2001 From: Di Wang Date: Tue, 20 Feb 2024 04:22:46 +0000 Subject: [PATCH 3/3] DAOS-15056 rebuild: fix typo address review from wangshilong Features: rebuild Required-githooks: true Signed-off-by: Di Wang --- src/rebuild/srv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/rebuild/srv.c b/src/rebuild/srv.c index 4d2c0d4c8b0..554ad0c1535 100644 --- a/src/rebuild/srv.c +++ b/src/rebuild/srv.c @@ -2523,11 +2523,11 @@ rebuild_tgt_prepare(crt_rpc_t *rpc, struct rebuild_tgt_pool_tracker **p_rpt) out: if (rc) { if (rpt) { - rpt_put(rpt); if (!d_list_empty(&rpt->rt_list)) { d_list_del_init(&rpt->rt_list); rpt_put(rpt); } + rpt_put(rpt); } ds_pool_put(pool); }