diff --git a/src/container/srv_target.c b/src/container/srv_target.c index 653cb5f7d37..71d13f637bd 100644 --- a/src/container/srv_target.c +++ b/src/container/srv_target.c @@ -321,7 +321,7 @@ cont_child_aggregate(struct ds_cont_child *cont, cont_aggregate_cb_t agg_cb, DAOS_FAIL_CHECK(DAOS_FORCE_EC_AGG_PEER_FAIL))) interval = 0; else - interval = d_sec2hlc(DAOS_AGG_THRESHOLD); + interval = cont->sc_agg_eph_gap; D_ASSERT(hlc > (interval * 2)); /* @@ -409,6 +409,9 @@ cont_child_aggregate(struct ds_cont_child *cont, cont_aggregate_cb_t agg_cb, DP_CONT(cont->sc_pool->spc_uuid, cont->sc_uuid), tgt_id, epoch_range.epr_lo, epoch_range.epr_hi); + if (!param->ap_vos_agg) + vos_cont_set_mod_bound(cont->sc_hdl, epoch_range.epr_hi); + flags |= VOS_AGG_FL_FORCE_MERGE; rc = agg_cb(cont, &epoch_range, flags, param); if (rc) @@ -425,6 +428,9 @@ cont_child_aggregate(struct ds_cont_child *cont, cont_aggregate_cb_t agg_cb, DP_CONT(cont->sc_pool->spc_uuid, cont->sc_uuid), tgt_id, epoch_range.epr_lo, epoch_range.epr_hi); + if (!param->ap_vos_agg) + vos_cont_set_mod_bound(cont->sc_hdl, epoch_range.epr_hi); + if (dss_xstream_is_busy()) flags &= ~VOS_AGG_FL_FORCE_MERGE; rc = agg_cb(cont, &epoch_range, flags, param); diff --git a/src/dtx/dtx_common.c b/src/dtx/dtx_common.c index 1ee74ae11a4..96e8119d4ce 100644 --- a/src/dtx/dtx_common.c +++ b/src/dtx/dtx_common.c @@ -922,6 +922,7 @@ dtx_handle_init(struct dtx_id *dti, daos_handle_t xoh, struct dtx_epoch *epoch, dth->dth_for_migration = (flags & DTX_FOR_MIGRATION) ? 1 : 0; dth->dth_ignore_uncommitted = (flags & DTX_IGNORE_UNCOMMITTED) ? 1 : 0; dth->dth_prepared = (flags & DTX_PREPARED) ? 1 : 0; + dth->dth_epoch_owner = (flags & DTX_EPOCH_OWNER) ? 
1 : 0; dth->dth_aborted = 0; dth->dth_already = 0; dth->dth_need_validation = 0; @@ -1853,6 +1854,8 @@ dtx_cont_register(struct ds_cont_child *cont) D_GOTO(out, rc = -DER_NOMEM); } + cont->sc_agg_eph_gap = d_sec2hlc(vos_get_agg_gap()); + ds_cont_child_get(cont); dbca->dbca_refs = 0; dbca->dbca_cont = cont; diff --git a/src/dtx/tests/dts_structs.c b/src/dtx/tests/dts_structs.c index dc4347fed7c..a763546f824 100644 --- a/src/dtx/tests/dts_structs.c +++ b/src/dtx/tests/dts_structs.c @@ -70,8 +70,9 @@ struct_dtx_handle(void **state) SET_BITFIELD_1(dummy, dth_need_validation); SET_BITFIELD_1(dummy, dth_ignore_uncommitted); SET_BITFIELD_1(dummy, dth_local); + SET_BITFIELD_1(dummy, dth_epoch_owner); SET_BITFIELD_1(dummy, dth_local_complete); - SET_BITFIELD(dummy, padding1, 13); + SET_BITFIELD(dummy, padding1, 12); SET_FIELD(dummy, dth_dti_cos_count); SET_FIELD(dummy, dth_dti_cos); diff --git a/src/engine/sched.c b/src/engine/sched.c index 49a46ca3618..807f150839e 100644 --- a/src/engine/sched.c +++ b/src/engine/sched.c @@ -197,17 +197,6 @@ enum { static int sched_policy; -/* - * Time threshold for giving IO up throttling. If space pressure stays in the - * highest level for enough long time, we assume that no more space can be - * reclaimed and choose to give up IO throttling, so that ENOSPACE error could - * be returned to client earlier. - * - * To make time for aggregation reclaiming overwriteen space, this threshold - * should be longer than the DAOS_AGG_THRESHOLD. - */ -#define SCHED_DELAY_THRESH 40000 /* msecs */ - struct pressure_ratio { unsigned int pr_free; /* free space ratio */ unsigned int pr_gc_ratio; /* CPU percentage for GC & Aggregation */ @@ -943,12 +932,21 @@ is_gc_pending(struct sched_pool_info *spi) return spi->spi_gc_ults && (spi->spi_gc_ults > spi->spi_gc_sleeping); } -/* Just run into this space pressure situation recently? */ +/* + * Just run into this space pressure situation recently? 
+ * + * If space pressure stays in the highest level for enough long time, we assume + * that no more space can be reclaimed and choose to give up IO throttling, so + * that ENOSPACE error could be returned to client earlier. + * + * To make time for aggregation reclaiming overwritten space, this threshold + * should be longer than VOS aggregation epoch gap with current HLC. + */ static inline bool is_pressure_recent(struct sched_info *info, struct sched_pool_info *spi) { D_ASSERT(info->si_cur_ts >= spi->spi_pressure_ts); - return (info->si_cur_ts - spi->spi_pressure_ts) < SCHED_DELAY_THRESH; + return (info->si_cur_ts - spi->spi_pressure_ts) < info->si_agg_gap; } static inline uint64_t @@ -2256,6 +2254,8 @@ sched_run(ABT_sched sched) return; } + dx->dx_sched_info.si_agg_gap = (vos_get_agg_gap() + 10) * 1000; /* msecs */ + while (1) { /* Try to pick network poll ULT */ pool = pools[DSS_POOL_NET_POLL]; diff --git a/src/engine/srv_internal.h b/src/engine/srv_internal.h index 222f07e4906..d1f240270fb 100644 --- a/src/engine/srv_internal.h +++ b/src/engine/srv_internal.h @@ -61,6 +61,7 @@ struct sched_info { /* Number of kicked requests for each type in current cycle */ uint32_t si_kicked_req_cnt[SCHED_REQ_MAX]; unsigned int si_stop:1; + uint64_t si_agg_gap; }; struct mem_stats { diff --git a/src/include/daos/dtx.h b/src/include/daos/dtx.h index ca719077a14..f3aa2546850 100644 --- a/src/include/daos/dtx.h +++ b/src/include/daos/dtx.h @@ -1,5 +1,5 @@ /** - * (C) Copyright 2019-2023 Intel Corporation. + * (C) Copyright 2019-2024 Intel Corporation. * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -27,17 +27,6 @@ /* The time (in second) threshold for batched DTX commit. 
*/ #define DTX_COMMIT_THRESHOLD_AGE 10 -/* - * VOS aggregation should try to avoid aggregating in the epoch range where - * lots of data records are pending to commit, so the aggregation epoch upper - * bound is: current HLC - (DTX batched commit threshold + buffer period) - * - * To avoid conflicting of aggregation vs. transactions, any transactional - * update/fetch with epoch lower than the aggregation upper bound should be - * rejected and restarted. - */ -#define DAOS_AGG_THRESHOLD (DTX_COMMIT_THRESHOLD_AGE + 10) /* seconds */ - enum dtx_target_flags { /* The target only contains read-only operations for the DTX. */ DTF_RDONLY = (1 << 0), diff --git a/src/include/daos_srv/container.h b/src/include/daos_srv/container.h index 9fc615c2a8b..14953a1972a 100644 --- a/src/include/daos_srv/container.h +++ b/src/include/daos_srv/container.h @@ -129,6 +129,12 @@ struct ds_cont_child { */ uint64_t sc_ec_update_timestamp; + /* + * The gap between the max allowed aggregation epoch and current HLC. The modification + * with older epoch out of range may cause conflict with aggregation as to be rejected. + */ + uint64_t sc_agg_eph_gap; + /* The objects with committable DTXs in DRAM. */ daos_handle_t sc_dtx_cos_hdl; /* The DTX COS-btree. */ diff --git a/src/include/daos_srv/dtx_srv.h b/src/include/daos_srv/dtx_srv.h index 7c60d2deaa0..f648b42d7be 100644 --- a/src/include/daos_srv/dtx_srv.h +++ b/src/include/daos_srv/dtx_srv.h @@ -113,8 +113,10 @@ struct dtx_handle { dth_ignore_uncommitted : 1, /* Local transaction */ dth_local : 1, + /* Locally generate the epoch. */ + dth_epoch_owner : 1, /* Flag to commit the local transaction */ - dth_local_complete : 1, padding1 : 13; + dth_local_complete : 1, padding1 : 12; /* The count the DTXs in the dth_dti_cos array. */ uint32_t dth_dti_cos_count; @@ -287,6 +289,8 @@ enum dtx_flags { DTX_RELAY = (1 << 10), /** Local transaction */ DTX_LOCAL = (1 << 11), + /** Locally generate the epoch. 
*/ + DTX_EPOCH_OWNER = (1 << 12), }; void diff --git a/src/include/daos_srv/vos.h b/src/include/daos_srv/vos.h index b6287c2986e..e34ade044dd 100644 --- a/src/include/daos_srv/vos.h +++ b/src/include/daos_srv/vos.h @@ -938,6 +938,56 @@ vos_update_renew_epoch(daos_handle_t ioh, struct dtx_handle *dth); void vos_dtx_renew_epoch(struct dtx_handle *dth); +/** + * Calculate current locally known stable epoch for the given container. + * + * \param coh [IN] Container open handle + * + * \return The epoch on success, negative value if error. + */ +daos_epoch_t +vos_cont_get_local_stable_epoch(daos_handle_t coh); + +/** + * Get global stable epoch for the given container. + * + * \param coh [IN] Container open handle + * + * \return The epoch on success, negative value if error. + */ +daos_epoch_t +vos_cont_get_global_stable_epoch(daos_handle_t coh); + +/** + * Set global stable epoch for the given container. + * + * \param coh [IN] Container open handle + * \param epoch [IN] The epoch to be used as the new global stable epoch. + * + * \return Zero on success, negative value if error. + */ +int +vos_cont_set_global_stable_epoch(daos_handle_t coh, daos_epoch_t epoch); + +/** + * Set the lowest allowed modification epoch for the given container. + * + * \param coh [IN] Container open handle + * \param epoch [IN] The lowest allowed epoch for modification. + * + * \return Zero on success, negative value if error. + */ +int +vos_cont_set_mod_bound(daos_handle_t coh, uint64_t epoch); + +/** + * Query the gap between the max allowed aggregation epoch and current HLC. + * + * \return The gap value in seconds. + */ +uint32_t +vos_get_agg_gap(void); + /** * Get the recx/epoch list. * diff --git a/src/include/daos_srv/vos_types.h b/src/include/daos_srv/vos_types.h index 0a52851c390..fa173cf1f63 100644 --- a/src/include/daos_srv/vos_types.h +++ b/src/include/daos_srv/vos_types.h @@ -58,6 +58,8 @@ enum dtx_entry_flags { * on all yet, need to be re-committed. 
*/ DTE_PARTIAL_COMMITTED = (1 << 5), + /* The DTX epoch is sorted locally. */ + DTE_EPOCH_SORTED = (1 << 6), }; struct dtx_entry { diff --git a/src/object/srv_obj.c b/src/object/srv_obj.c index a79cec03f6f..2654df4d5f9 100644 --- a/src/object/srv_obj.c +++ b/src/object/srv_obj.c @@ -2896,8 +2896,10 @@ ds_obj_rw_handler(crt_rpc_t *rpc) rc = process_epoch(&orw->orw_epoch, &orw->orw_epoch_first, &orw->orw_flags); - if (rc == PE_OK_LOCAL) + if (rc == PE_OK_LOCAL) { orw->orw_flags &= ~ORF_EPOCH_UNCERTAIN; + dtx_flags |= DTX_EPOCH_OWNER; + } if (obj_rpc_is_fetch(rpc)) { struct dtx_handle *dth; @@ -3856,8 +3858,10 @@ ds_obj_punch_handler(crt_rpc_t *rpc) rc = process_epoch(&opi->opi_epoch, NULL /* epoch_first */, &opi->opi_flags); - if (rc == PE_OK_LOCAL) + if (rc == PE_OK_LOCAL) { opi->opi_flags &= ~ORF_EPOCH_UNCERTAIN; + dtx_flags |= DTX_EPOCH_OWNER; + } version = opi->opi_map_ver; max_ver = opi->opi_map_ver; @@ -5110,6 +5114,7 @@ ds_obj_dtx_leader(struct daos_cpd_args *dca) &dcsh->dcsh_epoch.oe_first, &dcsh->dcsh_epoch.oe_rpc_flags); if (rc == PE_OK_LOCAL) { + dtx_flags |= DTX_EPOCH_OWNER; /* * In this case, writes to local RDGs can use the chosen epoch * without any uncertainty. 
This optimization is left to future @@ -5701,8 +5706,10 @@ ds_obj_coll_punch_handler(crt_rpc_t *rpc) if (ocpi->ocpi_flags & ORF_LEADER) { rc = process_epoch(&ocpi->ocpi_epoch, NULL /* epoch_first */, &ocpi->ocpi_flags); - if (rc == PE_OK_LOCAL) + if (rc == PE_OK_LOCAL) { ocpi->ocpi_flags &= ~ORF_EPOCH_UNCERTAIN; + dtx_flags |= DTX_EPOCH_OWNER; + } } else if (dct_nr == 1) { rc = obj_coll_local(rpc, dcts[0].dct_shards, dce, &version, &ioc, NULL, odm->odm_mbs, obj_coll_tgt_punch); diff --git a/src/tests/ftest/util/server_utils_params.py b/src/tests/ftest/util/server_utils_params.py index 46db4891220..a113956ce73 100644 --- a/src/tests/ftest/util/server_utils_params.py +++ b/src/tests/ftest/util/server_utils_params.py @@ -436,6 +436,7 @@ class EngineYamlParameters(YamlParameters): "D_LOG_FILE_APPEND_PID=1", "DAOS_POOL_RF=4", "CRT_EVENT_DELAY=1", + "DAOS_VOS_AGG_GAP=20", "COVFILE=/tmp/test.cov"], "ofi+tcp": [], "ofi+tcp;ofi_rxm": [], diff --git a/src/vos/tests/vts_dtx.c b/src/vos/tests/vts_dtx.c index bd54dd52838..8859405686d 100644 --- a/src/vos/tests/vts_dtx.c +++ b/src/vos/tests/vts_dtx.c @@ -66,6 +66,7 @@ vts_dtx_begin(const daos_unit_oid_t *oid, daos_handle_t coh, daos_epoch_t epoch, dth->dth_for_migration = 0; dth->dth_ignore_uncommitted = 0; dth->dth_prepared = 0; + dth->dth_epoch_owner = 0; dth->dth_aborted = 0; dth->dth_already = 0; dth->dth_need_validation = 0; diff --git a/src/vos/vos_common.c b/src/vos/vos_common.c index e19768d4c03..1e1bb34cfe8 100644 --- a/src/vos/vos_common.c +++ b/src/vos/vos_common.c @@ -620,6 +620,42 @@ struct dss_module_key vos_module_key = { daos_epoch_t vos_start_epoch = DAOS_EPOCH_MAX; +/* + * For standalone transaction, when it is started on the DTX leader, its epoch + * is generated by the leader, then the modification RPC will be forwarded to + * other related non-leader(s). 
If the forwarded RPC is delayed for some reason, + * such as network congestion or system busy on the non-leader, as to the epoch + * for such transaction becomes very old (exceed related threshold), as to VOS + * aggregation may have already aggregated related epoch range. Under such case, + * the non-leader will reject such modification to avoid data loss/corruption. + * + * For distributed transaction, if there is no read (fetch, query, enumerate, + * and so on) before client tx_commit, then related DTX leader will generate + * epoch for the transaction after client commit_tx. Then it will be the same + * as above standalone transaction for epoch handling. + * + * If the distributed transaction involves some read before client commit_tx, + * its epoch will be generated by the first accessed engine for read. If the + * transaction takes too long time after that, then when client commit_tx, its + * epoch may become very old as to related DTX leader will have to reject the + * transaction to avoid above mentioned conflict. And even if the DTX leader + * did not reject the transaction, some non-leader may also reject it because + * of the very old epoch. So it means that under such framework, the life for + * a distributed transaction cannot be too long. That can be adjusted via the + * server side environment variable DAOS_VOS_AGG_GAP. + * + * NOTE: EC/VOS aggregation should avoid aggregating in the epoch range where + * lots of data records are pending to commit, so the aggregation epoch + * upper bound is 'current HLC - vos_agg_gap'. + */ +uint32_t vos_agg_gap; + +uint32_t +vos_get_agg_gap(void) +{ + return vos_agg_gap; +} + static int vos_mod_init(void) { @@ -679,6 +715,15 @@ vos_mod_init(void) d_getenv_bool("DAOS_DKEY_PUNCH_PROPAGATE", &vos_dkey_punch_propagate); D_INFO("DKEY punch propagation is %s\n", vos_dkey_punch_propagate ? 
"enabled" : "disabled"); + vos_agg_gap = VOS_AGG_GAP_DEF; + d_getenv_uint("DAOS_VOS_AGG_GAP", &vos_agg_gap); + if (vos_agg_gap < VOS_AGG_GAP_MIN || vos_agg_gap > VOS_AGG_GAP_MAX) { + D_WARN("Invalid DAOS_VOS_AGG_GAP value, " + "valid range [%u, %u], set it as default %u (second)\n", + VOS_AGG_GAP_MIN, VOS_AGG_GAP_MAX, VOS_AGG_GAP_DEF); + vos_agg_gap = VOS_AGG_GAP_DEF; + } + D_INFO("Set DAOS VOS aggregation gap as %u (second)\n", vos_agg_gap); return rc; } diff --git a/src/vos/vos_container.c b/src/vos/vos_container.c index a5a55a902b9..38fc2ef7d32 100644 --- a/src/vos/vos_container.c +++ b/src/vos/vos_container.c @@ -198,6 +198,9 @@ cont_free_internal(struct vos_container *cont) lrua_array_free(cont->vc_dtx_array); D_ASSERT(d_list_empty(&cont->vc_dtx_act_list)); + D_ASSERT(d_list_empty(&cont->vc_dtx_sorted_list)); + D_ASSERT(d_list_empty(&cont->vc_dtx_unsorted_list)); + D_ASSERT(d_list_empty(&cont->vc_dtx_reindex_list)); dbtree_close(cont->vc_btr_hdl); @@ -395,6 +398,9 @@ vos_cont_open(daos_handle_t poh, uuid_t co_uuid, daos_handle_t *coh) cont->vc_cmt_dtx_indexed = 0; cont->vc_cmt_dtx_reindex_pos = cont->vc_cont_df->cd_dtx_committed_head; D_INIT_LIST_HEAD(&cont->vc_dtx_act_list); + D_INIT_LIST_HEAD(&cont->vc_dtx_sorted_list); + D_INIT_LIST_HEAD(&cont->vc_dtx_unsorted_list); + D_INIT_LIST_HEAD(&cont->vc_dtx_reindex_list); cont->vc_dtx_committed_count = 0; cont->vc_solo_dtx_epoch = d_hlc_get(); rc = gc_open_cont(cont); @@ -815,3 +821,174 @@ struct vos_iter_ops vos_cont_iter_ops = { .iop_fetch = cont_iter_fetch, .iop_process = cont_iter_process, }; + +/* + * The local stable epoch can be used to calculate global stable epoch: all the container + * shards report each own local stable epoch to some leader who will find out the smallest + * one as the global stable epoch and dispatch it to all related container shards. 
+ */ +daos_epoch_t +vos_cont_get_local_stable_epoch(daos_handle_t coh) +{ + struct vos_container *cont; + struct vos_cont_ext_df *cont_ext; + struct vos_dtx_act_ent *dae; + uint64_t gap = d_sec2hlc(vos_agg_gap); + daos_epoch_t epoch = d_hlc_get() - gap; + + cont = vos_hdl2cont(coh); + D_ASSERT(cont != NULL); + + /* + * If the oldest (that is at the head of the sorted list) sorted DTX's + * epoch is out of the boundary, then use it as the local stable epoch. + */ + if (!d_list_empty(&cont->vc_dtx_sorted_list)) { + dae = d_list_entry(cont->vc_dtx_sorted_list.next, + struct vos_dtx_act_ent, dae_order_link); + if (epoch >= DAE_EPOCH(dae)) + epoch = DAE_EPOCH(dae) - 1; + } + + /* + * It is not easy to know which DTX is the oldest one in the unsorted list. + * The one after the header in the list maybe older than the header. But the + * epoch difference will NOT exceed 'vos_agg_gap' since any DTX with older + * epoch will be rejected (and restart with newer epoch). + * + * So "DAE_EPOCH(header) - vos_agg_gap" can be used to estimate the local + * stable epoch for unsorted DTX entries. + */ + if (!d_list_empty(&cont->vc_dtx_unsorted_list)) { + dae = d_list_entry(cont->vc_dtx_unsorted_list.next, + struct vos_dtx_act_ent, dae_order_link); + if (epoch > DAE_EPOCH(dae) - gap) + epoch = DAE_EPOCH(dae) - gap; + } + + /* + * The historical vos_agg_gap for the DTX entries in the reindex list is unknown. + * We use cont->vc_dtx_reindex_eph_diff to estimate the local stable epoch. That + * may be over-estimated. Usually, the count of re-indexed DTX entries is quite + * limited, and will be purged soon after the container opened (via DTX resync). + * So it will not much affect the local stable epoch calculation. 
+ */ + if (unlikely(!d_list_empty(&cont->vc_dtx_reindex_list))) { + dae = d_list_entry(cont->vc_dtx_reindex_list.next, + struct vos_dtx_act_ent, dae_order_link); + if (epoch > DAE_EPOCH(dae) - cont->vc_dtx_reindex_eph_diff) + epoch = DAE_EPOCH(dae) - cont->vc_dtx_reindex_eph_diff; + } + + /* The new local stable epoch cannot be smaller than the old global stable epoch. */ + cont_ext = umem_off2ptr(vos_cont2umm(cont), cont->vc_cont_df->cd_ext); + if (cont_ext != NULL && epoch < cont_ext->ced_global_stable_epoch) + epoch = cont_ext->ced_global_stable_epoch; + + D_ASSERT(cont->vc_local_stable_epoch <= epoch); + cont->vc_local_stable_epoch = epoch; + + return epoch; +} + +/* + * The global stable epoch can be used for incremental reintegration: all the modifications + * involved in current target (container shard) under the global stable epoch have already + * been persistently stored globally, only need to care about the modification with newer + * epoch when reintegrate into the system. + */ +daos_epoch_t +vos_cont_get_global_stable_epoch(daos_handle_t coh) +{ + struct vos_container *cont; + struct vos_cont_ext_df *cont_ext; + daos_epoch_t epoch = 0; + + cont = vos_hdl2cont(coh); + D_ASSERT(cont != NULL); + + cont_ext = umem_off2ptr(vos_cont2umm(cont), cont->vc_cont_df->cd_ext); + if (cont_ext != NULL) + epoch = cont_ext->ced_global_stable_epoch; + + return epoch; +} + +int +vos_cont_set_global_stable_epoch(daos_handle_t coh, daos_epoch_t epoch) +{ + struct umem_instance *umm; + struct vos_container *cont; + struct vos_cont_ext_df *cont_ext; + daos_epoch_t old = 0; + int rc = 0; + + cont = vos_hdl2cont(coh); + D_ASSERT(cont != NULL); + + umm = vos_cont2umm(cont); + cont_ext = umem_off2ptr(umm, cont->vc_cont_df->cd_ext); + + /* Do not allow to set global stable epoch against old container without extension. 
*/ + if (cont_ext == NULL) + D_GOTO(out, rc = -DER_NOTSUPPORTED); + + /* + * Either the leader gives wrong global stable epoch or current target does not participate + * in calculating the new global stable epoch. Then do not allow to set the global stable epoch. + */ + if (unlikely(cont->vc_local_stable_epoch < epoch)) { + D_WARN("Invalid global stable epoch: " DF_X64" vs " DF_X64 " for container " + DF_UUID "\n", cont->vc_local_stable_epoch, epoch, DP_UUID(cont->vc_id)); + D_GOTO(out, rc = -DER_NO_PERM); + } + + if (unlikely(cont_ext->ced_global_stable_epoch > epoch)) { + D_WARN("Do not allow to rollback global stable epoch from " + DF_X64" to " DF_X64 " for container " DF_UUID "\n", + cont_ext->ced_global_stable_epoch, epoch, DP_UUID(cont->vc_id)); + D_GOTO(out, rc = -DER_NO_PERM); + } + + if (cont_ext->ced_global_stable_epoch == epoch) + D_GOTO(out, rc = 0); + + old = cont_ext->ced_global_stable_epoch; + rc = umem_tx_begin(umm, NULL); + if (rc == 0) { + rc = umem_tx_add_ptr(umm, &cont_ext->ced_global_stable_epoch, + sizeof(cont_ext->ced_global_stable_epoch)); + if (rc == 0) { + cont_ext->ced_global_stable_epoch = epoch; + rc = umem_tx_commit(vos_cont2umm(cont)); + } else { + rc = umem_tx_abort(umm, rc); + } + } + + DL_CDEBUG(rc != 0, DLOG_ERR, DB_MGMT, rc, + "Set global stable epoch from "DF_X64" to " DF_X64 " for container " DF_UUID, + old, epoch, DP_UUID(cont->vc_id)); + +out: + return rc; +} + +int +vos_cont_set_mod_bound(daos_handle_t coh, uint64_t epoch) +{ + struct vos_container *cont; + + cont = vos_hdl2cont(coh); + D_ASSERT(cont != NULL); + + /* Ascending. 
*/ + if (cont->vc_mod_epoch_bound < epoch) { + D_DEBUG(DB_TRACE, "Increase acceptable modification boundary from " + DF_X64 " to " DF_X64 " for container " DF_UUID "\n", + cont->vc_mod_epoch_bound, epoch, DP_UUID(cont->vc_id)); + cont->vc_mod_epoch_bound = epoch; + } + + return 0; +} diff --git a/src/vos/vos_dtx.c b/src/vos/vos_dtx.c index 86c100f4739..4d9e3d2d1a5 100644 --- a/src/vos/vos_dtx.c +++ b/src/vos/vos_dtx.c @@ -261,8 +261,10 @@ dtx_act_ent_free(struct btr_instance *tins, struct btr_record *rec, dae = umem_off2ptr(&tins->ti_umm, rec->rec_off); rec->rec_off = UMOFF_NULL; - if (dae != NULL) + if (dae != NULL) { + d_list_del_init(&dae->dae_order_link); d_list_del_init(&dae->dae_link); + } if (args != NULL) { /* Return the record addreass (offset in DRAM). @@ -995,6 +997,62 @@ vos_dtx_alloc(struct umem_instance *umm, struct dtx_handle *dth) cont = vos_hdl2cont(dth->dth_coh); D_ASSERT(cont != NULL); + /* Do not allow the modification with too old epoch. */ + if (dth->dth_epoch <= cont->vc_mod_epoch_bound) { + D_DEBUG(DB_TRACE, "Need to restart DTX (1) " DF_DTI " with epoch " + DF_X64 " vs bound " DF_X64 "\n", DP_DTI(&dth->dth_xid), + dth->dth_epoch, cont->vc_mod_epoch_bound); + return -DER_TX_RESTART; + } + + /* + * NOTE: For the purpose of efficient calculating container based local stable epoch, + * we will maintain some kind of sorted list for active DTX entries with epoch + * order. But consider related overhead, it is not easy to maintain a strictly + * sorted list for all active DTX entries. For the DTX which leader resides on + * current target, its epoch is already sorted when generate on current engine. + * So the main difficulty is for those DTX entries which leaders are on remote + * targets. + * + * On the other hand, the local stable epoch is mainly used to generate global + * stable epoch that is for incremental reintegration. In fact, we do not need + * a very accurate global stable epoch for incremental reintegration. 
It means + * that it is no matter (or non-fatal) if the calculated stable epoch is a bit + * smaller than the real case. For example, seconds error for the stable epoch + * almost can be ignored if we compare such overhead with rebuilding the whole + * target from scratch. So for the DTX entry which leader is on remote target, + * we will maintain it in the list with relative incremental trend based on the + * epoch instead of strict sorting the epoch. We introduce an O(1) algorithm to + * handle such unsorted DTX entries list. + * + * For distributed transaction, its epoch may be generated on non-leader. + */ + + if (!dth->dth_epoch_owner && !d_list_empty(&cont->vc_dtx_unsorted_list)) { + dae = d_list_entry(cont->vc_dtx_unsorted_list.prev, struct vos_dtx_act_ent, + dae_order_link); + if (dth->dth_epoch < DAE_EPOCH(dae) && + cont->vc_mod_epoch_bound < DAE_EPOCH(dae) - d_sec2hlc(vos_agg_gap)) { + /* + * It guarantees that even if there was some older DTX to be added, + * the epoch difference between it and all former added ones cannot + * exceed vos_agg_gap. So we can easily calculate the local stable + * epoch. Please reference vos_cont_get_local_stable_epoch(). 
+ */ + D_DEBUG(DB_TRACE, "Increase acceptable modification boundary from " + DF_X64 " to " DF_X64 " for container " DF_UUID "\n", + cont->vc_mod_epoch_bound, + DAE_EPOCH(dae) - d_sec2hlc(vos_agg_gap), DP_UUID(cont->vc_id)); + cont->vc_mod_epoch_bound = DAE_EPOCH(dae) - d_sec2hlc(vos_agg_gap); + if (dth->dth_epoch <= cont->vc_mod_epoch_bound) { + D_DEBUG(DB_TRACE, "Need to restart DTX (2) " DF_DTI " with epoch " + DF_X64 " vs bound " DF_X64 "\n", DP_DTI(&dth->dth_xid), + dth->dth_epoch, cont->vc_mod_epoch_bound); + return -DER_TX_RESTART; + } + } + } + rc = lrua_allocx(cont->vc_dtx_array, &idx, dth->dth_epoch, &dae, &dth->dth_local_stub); if (rc != 0) { /* The array is full, need to commit some transactions first */ @@ -1007,6 +1065,7 @@ vos_dtx_alloc(struct umem_instance *umm, struct dtx_handle *dth) } D_INIT_LIST_HEAD(&dae->dae_link); + D_INIT_LIST_HEAD(&dae->dae_order_link); DAE_LID(dae) = idx + DTX_LID_RESERVED; if (dth->dth_solo) DAE_LID(dae) |= DTX_LID_SOLO_FLAG; @@ -1015,6 +1074,8 @@ vos_dtx_alloc(struct umem_instance *umm, struct dtx_handle *dth) DAE_DKEY_HASH(dae) = dth->dth_dkey_hash; DAE_EPOCH(dae) = dth->dth_epoch; DAE_FLAGS(dae) = dth->dth_flags; + if (dth->dth_epoch_owner) + DAE_FLAGS(dae) |= DTE_EPOCH_SORTED; DAE_VER(dae) = dth->dth_ver; if (dth->dth_mbs != NULL) { @@ -1043,6 +1104,15 @@ vos_dtx_alloc(struct umem_instance *umm, struct dtx_handle *dth) if (rc == 0) { dae->dae_start_time = daos_gettime_coarse(); d_list_add_tail(&dae->dae_link, &cont->vc_dtx_act_list); + if (dth->dth_epoch_owner) + d_list_add_tail(&dae->dae_order_link, &cont->vc_dtx_sorted_list); + else + /* + * Add all the others, including non-leader(s), into unsorted list. + * Then even though the leader was evicted for some reason, related + * DTX still can be considered via the new leader on another target. 
+ */ + d_list_add_tail(&dae->dae_order_link, &cont->vc_dtx_unsorted_list); dth->dth_ent = dae; } else { dtx_evict_lid(cont, dae); } @@ -2938,6 +3008,13 @@ vos_dtx_act_reindex(struct vos_container *cont) umem_off_t dbd_off = cont_df->cd_dtx_active_head; d_iov_t kiov; d_iov_t riov; + struct vos_dtx_act_ent *prev = NULL; + /* The max epoch for all unsorted DTX entries to be re-indexed. */ + uint64_t max_eph = 0; + /* The min epoch which DTX entry is after the max_eph DTX. */ + uint64_t min_eph = 0; + /* The largest diff for above pairs 'max_eph - min_eph'. */ + uint64_t diff = 0; uint64_t start_time = daos_gettime_coarse(); int rc = 0; int i; @@ -3027,6 +3104,42 @@ vos_dtx_act_reindex(struct vos_container *cont) dae->dae_start_time = start_time; d_list_add_tail(&dae->dae_link, &cont->vc_dtx_act_list); + if (DAE_FLAGS(dae) & DTE_EPOCH_SORTED) { + d_list_add_tail(&dae->dae_order_link, &cont->vc_dtx_sorted_list); + } else { + /* + * The DTX entries in the active blob may be generated against + * different VOS AGG GAP configurations, or even upgraded from + * old system that did not support VOS AGG GAP logic yet. Link + * them into a reindex list. During the reindex scanning, we + * will find out the pairs with the largest epoch difference. + * Using such difference to estimate the local stable epoch. + * + * NOTE: The min_eph may be not the smallest one in all the DTX + * entries to be re-indexed, instead, it is after current + * known max_eph, and if max_eph is changed, min_eph will + * be reset. So there may be multiple max/min pairs. Each + * pair has its own epoch difference. We use the largest one. + * + * This is an O(N) algorithm. N is the count of DTX entries to be + * re-indexed. Please reference vos_cont_get_local_stable_epoch(). 
+ */ + if (prev == NULL || DAE_EPOCH(dae) > DAE_EPOCH(prev)) { + if (max_eph < DAE_EPOCH(dae)) { + max_eph = DAE_EPOCH(dae); + min_eph = 0; + } + } else { + if (min_eph == 0 || min_eph > DAE_EPOCH(dae)) { + min_eph = DAE_EPOCH(dae); + if (diff < max_eph - min_eph) + diff = max_eph - min_eph; + } + } + + d_list_add_tail(&dae->dae_order_link, &cont->vc_dtx_reindex_list); + } + prev = dae; dbd_count++; } @@ -3044,6 +3157,8 @@ vos_dtx_act_reindex(struct vos_container *cont) dbd_off = dbd->dbd_next; } + cont->vc_dtx_reindex_eph_diff = diff; + out: return rc > 0 ? 0 : rc; } diff --git a/src/vos/vos_internal.h b/src/vos/vos_internal.h index 30e92318299..c2252dc2c4f 100644 --- a/src/vos/vos_internal.h +++ b/src/vos/vos_internal.h @@ -140,6 +140,12 @@ enum { /* Throttle ENOSPACE error message */ #define VOS_NOSPC_ERROR_INTVL 60 /* seconds */ +extern uint32_t vos_agg_gap; + +#define VOS_AGG_GAP_MIN 20 /* seconds */ +#define VOS_AGG_GAP_DEF 60 +#define VOS_AGG_GAP_MAX 180 + extern unsigned int vos_agg_nvme_thresh; extern bool vos_dkey_punch_propagate; @@ -359,6 +365,18 @@ struct vos_container { struct btr_root vc_dtx_committed_btr; /* The list for active DTXs, roughly ordered in time. */ d_list_t vc_dtx_act_list; + /* The list for the active DTX entries with epoch sorted. */ + d_list_t vc_dtx_sorted_list; + /* The list for the active DTX entries (but not re-indexed) with epoch unsorted. */ + d_list_t vc_dtx_unsorted_list; + /* The list for the active DTX entries that are re-indexed when open the container. */ + d_list_t vc_dtx_reindex_list; + /* The largest epoch difference for re-indexed DTX entries max/min pairs. */ + uint64_t vc_dtx_reindex_eph_diff; + /* The latest calculated local stable epoch. */ + daos_epoch_t vc_local_stable_epoch; + /* The lowest epoch boundary for current acceptable modification. */ + daos_epoch_t vc_mod_epoch_bound; /* The count of committed DTXs. 
*/ uint32_t vc_dtx_committed_count; /** Index for timestamp lookup */ @@ -428,6 +446,8 @@ struct vos_dtx_act_ent { daos_unit_oid_t *dae_oids; /* The time (hlc) when the DTX entry is created. */ uint64_t dae_start_time; + /* Link into container::vc_dtx_{sorted,unsorted,reindex}_list. */ + d_list_t dae_order_link; /* Link into container::vc_dtx_act_list. */ d_list_t dae_link; /* Back pointer to the DTX handle. */ diff --git a/src/vos/vos_layout.h b/src/vos/vos_layout.h index 87d092bc882..40daa55da93 100644 --- a/src/vos/vos_layout.h +++ b/src/vos/vos_layout.h @@ -271,8 +271,13 @@ enum vos_io_stream { struct vos_cont_ext_df { /* GC bucket extension */ struct vos_gc_bkt_df ced_gc_bkt; + /* + * Any modification involved in current target (container shard) under the global + * stable epoch have already been persistently stored globally. + */ + uint64_t ced_global_stable_epoch; /* Reserved for potential new features */ - uint64_t ced_paddings[38]; + uint64_t ced_paddings[37]; /* Reserved for future extension */ uint64_t ced_reserve; };