From ebccb758e4f6eb45941e18578896cf439006c691 Mon Sep 17 00:00:00 2001 From: Liu Xuezhao Date: Fri, 13 Dec 2024 21:44:05 +0800 Subject: [PATCH 1/3] DAOS-16572 object: refine sc_ec_agg_active flag setting (#15352) clear the sc_ec_agg_active flag more proactively. Signed-off-by: Xuezhao Liu --- src/container/srv_target.c | 20 +++++++++++++++++--- src/object/srv_ec_aggregate.c | 22 +++++++++++++++++++++- src/object/srv_obj.c | 5 +++-- src/rebuild/scan.c | 21 +++++++++++++++------ 4 files changed, 56 insertions(+), 12 deletions(-) diff --git a/src/container/srv_target.c b/src/container/srv_target.c index caf2a4b2ee74..653cb5f7d37a 100644 --- a/src/container/srv_target.c +++ b/src/container/srv_target.c @@ -181,7 +181,6 @@ cont_aggregate_runnable(struct ds_cont_child *cont, struct sched_request *req, } if (pool->sp_rebuilding && !vos_agg) { - cont->sc_ec_agg_active = 0; D_DEBUG(DB_EPC, DF_CONT": skip EC aggregation during rebuild %d.\n", DP_CONT(cont->sc_pool->spc_uuid, cont->sc_uuid), pool->sp_rebuilding); @@ -192,12 +191,10 @@ cont_aggregate_runnable(struct ds_cont_child *cont, struct sched_request *req, if (!cont->sc_vos_agg_active) D_DEBUG(DB_EPC, DF_CONT": resume VOS aggregation after reintegration.\n", DP_CONT(cont->sc_pool->spc_uuid, cont->sc_uuid)); - cont->sc_vos_agg_active = 1; } else { if (!cont->sc_ec_agg_active) D_DEBUG(DB_EPC, DF_CONT": resume EC aggregation after reintegration.\n", DP_CONT(cont->sc_pool->spc_uuid, cont->sc_uuid)); - cont->sc_ec_agg_active = 1; } if (!cont->sc_props_fetched) @@ -471,6 +468,11 @@ cont_aggregate_interval(struct ds_cont_child *cont, cont_aggregate_cb_t cb, if (!cont_aggregate_runnable(cont, req, param->ap_vos_agg)) goto next; + if (param->ap_vos_agg) + cont->sc_vos_agg_active = 1; + else + cont->sc_ec_agg_active = 1; + rc = cont_child_aggregate(cont, cb, param); if (rc == -DER_SHUTDOWN) { break; /* pool destroyed */ @@ -483,10 +485,22 @@ cont_aggregate_interval(struct ds_cont_child *cont, cont_aggregate_cb_t cb, /* Don't sleep too long when there is space pressure */ msecs = 2ULL * 100; } + + if (param->ap_vos_agg) + cont->sc_vos_agg_active = 0; + else + cont->sc_ec_agg_active = 0; + next: if (dss_ult_exiting(req)) break; + /* sleep 18 seconds for EC aggregation ULT if the pool is in rebuilding, + * if no space pressure. + */ + if (cont->sc_pool->spc_pool->sp_rebuilding && !param->ap_vos_agg && msecs != 200) + msecs = 18000; + sched_req_sleep(req, msecs); } out: diff --git a/src/object/srv_ec_aggregate.c b/src/object/srv_ec_aggregate.c index 50b513d16120..46d01bf2dc45 100644 --- a/src/object/srv_ec_aggregate.c +++ b/src/object/srv_ec_aggregate.c @@ -2270,6 +2270,13 @@ ec_aggregate_yield(struct ec_agg_param *agg_param) { int rc; + if (agg_param->ap_pool_info.api_pool->sp_rebuilding > 0) { + D_INFO(DF_UUID": abort ec aggregation, sp_rebuilding %d\n", + DP_UUID(agg_param->ap_pool_info.api_pool->sp_uuid), + agg_param->ap_pool_info.api_pool->sp_rebuilding); + return true; + } + D_ASSERT(agg_param->ap_yield_func != NULL); rc = agg_param->ap_yield_func(agg_param->ap_yield_arg); if (rc < 0) /* Abort */ @@ -2460,6 +2467,17 @@ agg_iterate_pre_cb(daos_handle_t ih, vos_iter_entry_t *entry, D_ASSERT(agg_param->ap_initialized); + /* If rebuild started, abort it to save conflict window with rebuild + * (see obj_inflight_io_check()). + */ + if (agg_param->ap_pool_info.api_pool->sp_rebuilding > 0) { + D_INFO(DF_CONT" abort as rebuild started, sp_rebuilding %d\n", + DP_CONT(agg_param->ap_pool_info.api_pool_uuid, + agg_param->ap_pool_info.api_cont_uuid), + agg_param->ap_pool_info.api_pool->sp_rebuilding); + return -1; + } + switch (type) { case VOS_ITER_OBJ: agg_param->ap_epr = param->ip_epr; @@ -2481,7 +2499,9 @@ agg_iterate_pre_cb(daos_handle_t ih, vos_iter_entry_t *entry, } if (rc < 0) { - D_ERROR("EC aggregation failed: "DF_RC"\n", DP_RC(rc)); + D_ERROR(DF_UUID" EC aggregation (rebuilding %d) failed: "DF_RC"\n", + DP_UUID(agg_param->ap_pool_info.api_pool->sp_uuid), + agg_param->ap_pool_info.api_pool->sp_rebuilding, DP_RC(rc)); return rc; } diff --git a/src/object/srv_obj.c b/src/object/srv_obj.c index 4762e04b898b..7ebca2e7346e 100644 --- a/src/object/srv_obj.c +++ b/src/object/srv_obj.c @@ -2362,8 +2362,9 @@ obj_inflight_io_check(struct ds_cont_child *child, uint32_t opc, { if (opc == DAOS_OBJ_RPC_ENUMERATE && flags & ORF_FOR_MIGRATION) { if (child->sc_ec_agg_active) { - D_ERROR(DF_UUID" ec aggregate still active\n", - DP_UUID(child->sc_pool->spc_uuid)); + D_ERROR(DF_CONT" ec aggregate still active, rebuilding %d\n", + DP_CONT(child->sc_pool->spc_uuid, child->sc_uuid), + child->sc_pool->spc_pool->sp_rebuilding); return -DER_UPDATE_AGAIN; } } diff --git a/src/rebuild/scan.c b/src/rebuild/scan.c index 423e9010d01a..e08e40eea382 100644 --- a/src/rebuild/scan.c +++ b/src/rebuild/scan.c @@ -873,8 +873,9 @@ rebuild_container_scan_cb(daos_handle_t ih, vos_iter_entry_t *entry, rpt->rt_rebuild_op != RB_OP_FAIL_RECLAIM) { D_ASSERTF(rpt->rt_pool->sp_rebuilding >= 0, DF_UUID" rebuilding %d\n", DP_UUID(rpt->rt_pool_uuid), rpt->rt_pool->sp_rebuilding); - /* Wait for EC aggregation to abort before discard the object */ - D_INFO(DF_UUID" wait for ec agg abort.\n", DP_UUID(entry->ie_couuid)); + /* Wait for EC aggregation to abort before discard the object */ + D_INFO(DF_UUID" wait for ec agg abort, rebuilding %d.\n", + DP_UUID(entry->ie_couuid), rpt->rt_pool->sp_rebuilding); dss_sleep(1000); if (rpt->rt_abort || rpt->rt_finishing) { D_DEBUG(DB_REBUILD, DF_CONT" rebuild op %s ver %u abort %u/%u.\n", @@ -1050,6 +1051,7 @@ rebuild_scan_leader(void *data) struct rebuild_tgt_pool_tracker *rpt = data; struct rebuild_pool_tls *tls; int rc; + bool wait = false; D_DEBUG(DB_REBUILD, DF_UUID "check resync %u/%u < %u\n", DP_UUID(rpt->rt_pool_uuid), rpt->rt_pool->sp_dtx_resync_version, @@ -1063,6 +1065,7 @@ rebuild_scan_leader(void *data) D_INFO(DF_UUID "wait for global dtx %u rebuild ver %u\n", DP_UUID(rpt->rt_pool_uuid), rpt->rt_global_dtx_resync_version, rpt->rt_rebuild_ver); + wait = true; ABT_cond_wait(rpt->rt_global_dtx_wait_cond, rpt->rt_lock); } ABT_mutex_unlock(rpt->rt_lock); @@ -1074,8 +1077,11 @@ rebuild_scan_leader(void *data) } } - D_DEBUG(DB_REBUILD, "rebuild scan collective "DF_UUID" begin.\n", - DP_UUID(rpt->rt_pool_uuid)); + if (wait) + D_INFO("rebuild scan collective "DF_UUID" begin.\n", DP_UUID(rpt->rt_pool_uuid)); + else + D_DEBUG(DB_REBUILD, "rebuild scan collective "DF_UUID" begin.\n", + DP_UUID(rpt->rt_pool_uuid)); rc = ds_pool_thread_collective(rpt->rt_pool_uuid, PO_COMP_ST_NEW | PO_COMP_ST_DOWN | PO_COMP_ST_DOWNOUT, rebuild_scanner, rpt, @@ -1083,8 +1089,11 @@ rebuild_scan_leader(void *data) if (rc) D_GOTO(out, rc); - D_DEBUG(DB_REBUILD, "rebuild scan collective "DF_UUID" done.\n", - DP_UUID(rpt->rt_pool_uuid)); + if (wait) + D_INFO("rebuild scan collective "DF_UUID" done.\n", DP_UUID(rpt->rt_pool_uuid)); + else + D_DEBUG(DB_REBUILD, "rebuild scan collective "DF_UUID" done.\n", + DP_UUID(rpt->rt_pool_uuid)); ABT_mutex_lock(rpt->rt_lock); rc = ds_pool_task_collective(rpt->rt_pool_uuid, PO_COMP_ST_NEW | PO_COMP_ST_DOWN | From bdea4aa15aa037f45580d8d79f0de10093eee3ec Mon Sep 17 00:00:00 2001 From: Alexander Oganezov Date: Fri, 13 Dec 2024 08:42:28 -0800 Subject: [PATCH 2/3] DAOS-16812 cart: read after free cid 2556737 (#15517) (#15600) - If failed to reply, skip rpc early buffer release Signed-off-by: Alexander A Oganezov --- src/cart/crt_hg.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/cart/crt_hg.c b/src/cart/crt_hg.c index 26b54b52ec3d..62aebe098ad2 100644 --- a/src/cart/crt_hg.c +++ b/src/cart/crt_hg.c @@ -1479,7 +1479,7 @@ crt_hg_reply_send(struct crt_rpc_priv *rpc_priv) DP_HG_RC(hg_ret)); /* should success as addref above */ RPC_DECREF(rpc_priv); - rc = crt_hgret_2_der(hg_ret); + D_GOTO(out, rc = crt_hgret_2_der(hg_ret)); } /* Release input buffer */ @@ -1492,6 +1492,7 @@ crt_hg_reply_send(struct crt_rpc_priv *rpc_priv) } } +out: return rc; } From d09376378c67f8d54bada62213ab47fcf979d357 Mon Sep 17 00:00:00 2001 From: Dalton Bohning Date: Fri, 13 Dec 2024 11:42:51 -0800 Subject: [PATCH 3/3] DAOS-16875 cq: fix flake8 xargs usage (#15608) (#15614) Use -r so if no scons or non-scons files are grep'ed, flake8 does not run. Signed-off-by: Dalton Bohning --- utils/githooks/pre-commit.d/71-flake.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/utils/githooks/pre-commit.d/71-flake.sh b/utils/githooks/pre-commit.d/71-flake.sh index 4ab24910262b..53c5ac995038 100755 --- a/utils/githooks/pre-commit.d/71-flake.sh +++ b/utils/githooks/pre-commit.d/71-flake.sh @@ -48,12 +48,12 @@ else rc=0 # non-scons - if ! echo "$py_files" | grep -vi scons | xargs flake8 --config .flake8; then + if ! echo "$py_files" | grep -vi scons | xargs -r flake8 --config .flake8; then rc=1 fi # scons - if ! echo "$py_files" | grep -i scons | xargs flake8 --config .flake8-scons; then + if ! echo "$py_files" | grep -i scons | xargs -r flake8 --config .flake8-scons; then rc=1; fi