diff --git a/src/common/btree.c b/src/common/btree.c
index 6bf1bdb2b15..0657bf384dc 100644
--- a/src/common/btree.c
+++ b/src/common/btree.c
@@ -945,8 +945,12 @@ btr_root_alloc(struct btr_context *tcx)
 	struct btr_instance	*tins = &tcx->tc_tins;
 	struct btr_root		*root;
 
-	tins->ti_root_off = umem_zalloc(btr_umm(tcx),
-					sizeof(struct btr_root));
+	if (btr_ops(tcx)->to_node_alloc != NULL)
+		tins->ti_root_off = btr_ops(tcx)->to_node_alloc(&tcx->tc_tins,
+								sizeof(struct btr_root));
+	else
+		tins->ti_root_off = umem_zalloc(btr_umm(tcx), sizeof(struct btr_root));
+
 	if (UMOFF_IS_NULL(tins->ti_root_off))
 		return btr_umm(tcx)->umm_nospc_rc;
 
@@ -3884,6 +3888,7 @@ btr_tree_destroy(struct btr_context *tcx, void *args, bool *destroyed)
 		tcx->tc_tins.ti_root_off, tcx->tc_order);
 
 	root = tcx->tc_tins.ti_root;
+	tcx->tc_tins.ti_destroy = 1;
 	if (root && !UMOFF_IS_NULL(root->tr_node)) {
 		/* destroy the root and all descendants */
 		rc = btr_node_destroy(tcx, root->tr_node, args, &empty);
diff --git a/src/include/daos/btree.h b/src/include/daos/btree.h
index 24c7b95cbe4..51f7999e47a 100644
--- a/src/include/daos/btree.h
+++ b/src/include/daos/btree.h
@@ -429,6 +429,8 @@ struct btr_instance {
 	struct btr_root			*ti_root;
 	/** Customized operations for the tree */
 	btr_ops_t			*ti_ops;
+	/** The context is used for tree destroy */
+	unsigned int			 ti_destroy : 1;
 };
 
 /**
diff --git a/src/include/daos/mem.h b/src/include/daos/mem.h
index 7a7ca3dcf7c..b94f75b22c4 100644
--- a/src/include/daos/mem.h
+++ b/src/include/daos/mem.h
@@ -451,8 +451,6 @@ typedef void umem_cache_wait_cb_t(void *arg, uint64_t chkpt_tx,
 				  uint64_t *committed_tx);
 
 /**
- * Write all dirty pages before @wal_tx to MD blob. (XXX: not yet implemented)
- *
  * This function can yield internally, it is called by checkpoint service of upper level stack.
  *
  * \param[in] store	The umem store
diff --git a/src/include/daos_srv/evtree.h b/src/include/daos_srv/evtree.h
index 63224259ccc..292c8848c87 100644
--- a/src/include/daos_srv/evtree.h
+++ b/src/include/daos_srv/evtree.h
@@ -1,5 +1,5 @@
 /**
- * (C) Copyright 2017-2023 Intel Corporation.
+ * (C) Copyright 2017-2024 Intel Corporation.
  *
  * SPDX-License-Identifier: BSD-2-Clause-Patent
  */
@@ -70,6 +70,10 @@ struct evt_desc_cbs {
 				 struct evt_desc *desc, daos_size_t nob,
 				 void *args);
 	void				*dc_bio_free_args;
+	/**
+	 * Argument for allocation.
+	 */
+	void				*dc_alloc_arg;
 	/**
 	 * Availability check, it is for data tracked by DTX undo log.
 	 * It is optional, EVTree always treats data extent is available if
diff --git a/src/tests/ftest/util/telemetry_utils.py b/src/tests/ftest/util/telemetry_utils.py
index b782173e3f4..d42b08470e6 100644
--- a/src/tests/ftest/util/telemetry_utils.py
+++ b/src/tests/ftest/util/telemetry_utils.py
@@ -424,7 +424,7 @@ class TelemetryUtils():
         ENGINE_NVME_INTEL_VENDOR_METRICS
     ENGINE_MEM_USAGE_METRICS = [
         "engine_mem_vos_dtx_cmt_ent_48",
-        "engine_mem_vos_vos_obj_360",
+        "engine_mem_vos_vos_obj_384",
         "engine_mem_vos_vos_lru_size",
         "engine_mem_dtx_dtx_leader_handle_360"]
     ENGINE_MEM_TOTAL_USAGE_METRICS = [
diff --git a/src/vos/evtree.c b/src/vos/evtree.c
index d635453f8b2..59f8855c3c1 100644
--- a/src/vos/evtree.c
+++ b/src/vos/evtree.c
@@ -1443,8 +1443,9 @@ evt_node_alloc(struct evt_context *tcx, unsigned int flags,
 	struct evt_node		*nd;
 	umem_off_t		 nd_off;
 	bool			 leaf = (flags & EVT_NODE_LEAF);
+	struct vos_object	*obj = tcx->tc_desc_cbs.dc_alloc_arg;
 
-	nd_off = umem_zalloc(evt_umm(tcx), evt_node_size(tcx, leaf));
+	nd_off = vos_obj_alloc(evt_umm(tcx), obj, evt_node_size(tcx, leaf), true);
 	if (UMOFF_IS_NULL(nd_off))
 		return -DER_NOSPACE;
 
@@ -3249,8 +3250,9 @@ evt_common_insert(struct evt_context *tcx, struct evt_node *nd,
 	}
 
 	if (leaf) {
-		umem_off_t desc_off;
-		uint32_t csum_buf_size = 0;
+		umem_off_t         desc_off;
+		uint32_t           csum_buf_size = 0;
+		struct vos_object *obj = tcx->tc_desc_cbs.dc_alloc_arg;
 
 		if (ci_is_valid(&ent->ei_csum))
 			csum_buf_size = ci_csums_len(ent->ei_csum);
@@ -3263,7 +3265,7 @@ evt_common_insert(struct evt_context *tcx, struct evt_node *nd,
 			D_DEBUG(DB_TRACE, "Allocating an extra %d bytes "
 				"for checksum", csum_buf_size);
 		}
-		desc_off = umem_zalloc(evt_umm(tcx), desc_size);
+		desc_off = vos_obj_alloc(evt_umm(tcx), obj, desc_size, true);
 		if (UMOFF_IS_NULL(desc_off))
 			return -DER_NOSPACE;
 
diff --git a/src/vos/tests/vts_io.c b/src/vos/tests/vts_io.c
index 9cb37e41700..08d097b35ed 100644
--- a/src/vos/tests/vts_io.c
+++ b/src/vos/tests/vts_io.c
@@ -898,7 +898,7 @@ io_update_and_fetch_dkey(struct io_test_args *arg, daos_epoch_t update_epoch,
 static inline int
 hold_obj(struct vos_container *cont, daos_unit_oid_t oid, daos_epoch_range_t *epr,
 	 daos_epoch_t bound, uint64_t flags, uint32_t intent, struct vos_object **obj_p,
-	 struct vos_ts_set *ts_set)
+	 struct vos_ts_set *ts_set, struct umem_instance *umm)
 {
 	int	rc;
 
@@ -908,7 +908,16 @@ hold_obj(struct vos_container *cont, daos_unit_oid_t oid, daos_epoch_range_t *ep
 
 	if (flags & VOS_OBJ_CREATE) {
 		assert_ptr_not_equal(*obj_p, NULL);
+
+		if (umm != NULL) {
+			rc = umem_tx_begin(umm, NULL);
+			assert_rc_equal(rc, 0);
+		}
+
 		rc = vos_obj_incarnate(*obj_p, epr, bound, flags, intent, ts_set);
+
+		if (umm != NULL)
+			rc = umem_tx_end(umm, rc);
 	}
 
 	return rc;
@@ -926,7 +935,8 @@ hold_objects(struct vos_object **objs, daos_handle_t *coh, daos_unit_oid_t *oid,
 		hold_flags |= VOS_OBJ_VISIBLE;
 	for (i = start; i < end; i++) {
 		rc = hold_obj(vos_hdl2cont(*coh), *oid, &epr, 0, hold_flags,
-			      no_create ? DAOS_INTENT_DEFAULT : DAOS_INTENT_UPDATE, &objs[i], 0);
+			      no_create ? DAOS_INTENT_DEFAULT : DAOS_INTENT_UPDATE,
+			      &objs[i], 0, NULL);
 		if (rc != exp_rc)
 			return 1;
 	}
@@ -1006,82 +1016,72 @@ io_obj_cache_test(void **state)
 	ummg = vos_cont2umm(vos_hdl2cont(ctx->tc_co_hdl));
 	umml = vos_cont2umm(vos_hdl2cont(l_coh));
 
-	rc = umem_tx_begin(ummg, NULL);
-	assert_rc_equal(rc, 0);
 	rc = hold_obj(vos_hdl2cont(ctx->tc_co_hdl), oids[0], &epr, 0,
-		      VOS_OBJ_CREATE | VOS_OBJ_VISIBLE, DAOS_INTENT_UPDATE, &objs[0], 0);
+		      VOS_OBJ_CREATE | VOS_OBJ_VISIBLE, DAOS_INTENT_UPDATE, &objs[0], 0, ummg);
 	assert_rc_equal(rc, 0);
 
 	/** Hold object for discard */
 	rc = hold_obj(vos_hdl2cont(ctx->tc_co_hdl), oids[0], &epr, 0, VOS_OBJ_DISCARD,
-		      DAOS_INTENT_DISCARD, &obj1, 0);
+		      DAOS_INTENT_DISCARD, &obj1, 0, ummg);
 	assert_rc_equal(rc, 0);
 
 	/** Second discard should fail */
 	rc = hold_obj(vos_hdl2cont(ctx->tc_co_hdl), oids[0], &epr, 0, VOS_OBJ_DISCARD,
-		      DAOS_INTENT_DISCARD, &obj2, 0);
+		      DAOS_INTENT_DISCARD, &obj2, 0, ummg);
 	assert_rc_equal(rc, -DER_BUSY);
 
 	/** Should prevent simultaneous aggregation */
 	rc = hold_obj(vos_hdl2cont(ctx->tc_co_hdl), oids[0], &epr, 0, VOS_OBJ_AGGREGATE,
-		      DAOS_INTENT_PURGE, &obj2, 0);
+		      DAOS_INTENT_PURGE, &obj2, 0, ummg);
 	assert_rc_equal(rc, -DER_BUSY);
 
 	/** Should prevent simultaneous hold for create as well */
 	rc = hold_obj(vos_hdl2cont(ctx->tc_co_hdl), oids[0], &epr, 0,
-		      VOS_OBJ_CREATE | VOS_OBJ_VISIBLE, DAOS_INTENT_UPDATE, &obj2, 0);
+		      VOS_OBJ_CREATE | VOS_OBJ_VISIBLE, DAOS_INTENT_UPDATE, &obj2,
+		      0, ummg);
 	assert_rc_equal(rc, -DER_UPDATE_AGAIN);
 
 	/** Need to be able to hold for read though or iteration won't work */
 	rc = hold_obj(vos_hdl2cont(ctx->tc_co_hdl), oids[0], &epr, 0, VOS_OBJ_VISIBLE,
-		      DAOS_INTENT_DEFAULT, &obj2, 0);
+		      DAOS_INTENT_DEFAULT, &obj2, 0, ummg);
 	vos_obj_release(obj2, 0, false);
 	vos_obj_release(obj1, VOS_OBJ_DISCARD, false);
 
 	/** Hold object for aggregation */
 	rc = hold_obj(vos_hdl2cont(ctx->tc_co_hdl), oids[0], &epr, 0, VOS_OBJ_AGGREGATE,
-		      DAOS_INTENT_PURGE, &obj1, 0);
+		      DAOS_INTENT_PURGE, &obj1, 0, ummg);
 	assert_rc_equal(rc, 0);
 
 	/** Discard should fail */
 	rc = hold_obj(vos_hdl2cont(ctx->tc_co_hdl), oids[0], &epr, 0, VOS_OBJ_DISCARD,
-		      DAOS_INTENT_DISCARD, &obj2, 0);
+		      DAOS_INTENT_DISCARD, &obj2, 0, ummg);
 	assert_rc_equal(rc, -DER_BUSY);
 
 	/** Second aggregation should fail */
 	rc = hold_obj(vos_hdl2cont(ctx->tc_co_hdl), oids[0], &epr, 0, VOS_OBJ_AGGREGATE,
-		      DAOS_INTENT_PURGE, &obj2, 0);
+		      DAOS_INTENT_PURGE, &obj2, 0, ummg);
 	assert_rc_equal(rc, -DER_BUSY);
 
 	/** Simultaneous create should work */
 	rc = hold_obj(vos_hdl2cont(ctx->tc_co_hdl), oids[0], &epr, 0,
-		      VOS_OBJ_CREATE | VOS_OBJ_VISIBLE, DAOS_INTENT_UPDATE, &obj2, 0);
+		      VOS_OBJ_CREATE | VOS_OBJ_VISIBLE, DAOS_INTENT_UPDATE, &obj2, 0, ummg);
 	assert_rc_equal(rc, 0);
 	vos_obj_release(obj2, 0, false);
 
 	/** Need to be able to hold for read though or iteration won't work */
 	rc = hold_obj(vos_hdl2cont(ctx->tc_co_hdl), oids[0], &epr, 0, VOS_OBJ_VISIBLE,
-		      DAOS_INTENT_DEFAULT, &obj2, 0);
+		      DAOS_INTENT_DEFAULT, &obj2, 0, ummg);
 	vos_obj_release(obj2, 0, false);
 	vos_obj_release(obj1, VOS_OBJ_AGGREGATE, false);
 
 	/** Now that other one is done, this should work */
 	rc = hold_obj(vos_hdl2cont(ctx->tc_co_hdl), oids[0], &epr, 0, VOS_OBJ_DISCARD,
-		      DAOS_INTENT_DISCARD, &obj2, 0);
+		      DAOS_INTENT_DISCARD, &obj2, 0, ummg);
 	assert_rc_equal(rc, 0);
 	vos_obj_release(obj2, VOS_OBJ_DISCARD, false);
-	rc = umem_tx_end(ummg, 0);
-	assert_rc_equal(rc, 0);
-
 	vos_obj_release(objs[0], 0, false);
 
-	rc = umem_tx_begin(umml, NULL);
-	assert_rc_equal(rc, 0);
-
 	rc = hold_obj(vos_hdl2cont(l_coh), oids[1], &epr, 0,
-		      VOS_OBJ_CREATE | VOS_OBJ_VISIBLE, DAOS_INTENT_UPDATE, &objs[0], 0);
+		      VOS_OBJ_CREATE | VOS_OBJ_VISIBLE, DAOS_INTENT_UPDATE, &objs[0], 0, umml);
 	assert_rc_equal(rc, 0);
 	vos_obj_release(objs[0], 0, false);
-	rc = umem_tx_end(umml, 0);
-	assert_rc_equal(rc, 0);
-
 	rc = hold_objects(objs, &ctx->tc_co_hdl, &oids[0], 0, 10, true, 0);
 	assert_int_equal(rc, 0);
 
@@ -1091,7 +1091,7 @@ io_obj_cache_test(void **state)
 	rc = hold_objects(objs, &l_coh, &oids[1], 10, 15, true, 0);
 	assert_int_equal(rc, 0);
 	rc = hold_obj(vos_hdl2cont(l_coh), oids[1], &epr, 0, VOS_OBJ_VISIBLE,
-		      DAOS_INTENT_DEFAULT, &objs[16], 0);
+		      DAOS_INTENT_DEFAULT, &objs[16], 0, NULL);
 	assert_rc_equal(rc, 0);
 	vos_obj_release(objs[16], 0, false);
 
diff --git a/src/vos/vos_aggregate.c b/src/vos/vos_aggregate.c
index 65d70dd7762..5064e74d730 100644
--- a/src/vos/vos_aggregate.c
+++ b/src/vos/vos_aggregate.c
@@ -984,7 +984,7 @@ reserve_segment(struct vos_object *obj, struct agg_io_context *io,
 	if (vos_io_scm(vos_obj2pool(obj), DAOS_IOD_ARRAY, size, VOS_IOS_AGGREGATION)) {
 		/** Store on SCM */
-		off = vos_reserve_scm(obj->obj_cont, io->ic_rsrvd_scm, size);
+		off = vos_reserve_scm(obj->obj_cont, io->ic_rsrvd_scm, size, obj);
 		if (UMOFF_IS_NULL(off)) {
 			now = daos_gettime_coarse();
 			if (now - obj->obj_cont->vc_agg_nospc_ts > VOS_NOSPC_ERROR_INTVL) {
diff --git a/src/vos/vos_gc.c b/src/vos/vos_gc.c
index 0937b883f33..b90cd5d1199 100644
--- a/src/vos/vos_gc.c
+++ b/src/vos/vos_gc.c
@@ -1,5 +1,5 @@
 /**
- * (C) Copyright 2019-2023 Intel Corporation.
+ * (C) Copyright 2019-2024 Intel Corporation.
  *
  * SPDX-License-Identifier: BSD-2-Clause-Patent
  */
@@ -115,7 +115,7 @@ gc_drain_evt(struct vos_gc *gc, struct vos_pool *pool, daos_handle_t coh,
 	daos_handle_t	 toh;
 	int		 rc;
 
-	vos_evt_desc_cbs_init(&cbs, pool, coh);
+	vos_evt_desc_cbs_init(&cbs, pool, coh, NULL);
 	rc = evt_open(root, &pool->vp_uma, &cbs, &toh);
 	if (rc == -DER_NONEXIST) {
 		*empty = true;
diff --git a/src/vos/vos_internal.h b/src/vos/vos_internal.h
index f6a74fce7e6..72275c1d41c 100644
--- a/src/vos/vos_internal.h
+++ b/src/vos/vos_internal.h
@@ -1258,7 +1258,7 @@ vos_bio_addr_free(struct vos_pool *pool, bio_addr_t *addr, daos_size_t nob);
 
 void
 vos_evt_desc_cbs_init(struct evt_desc_cbs *cbs, struct vos_pool *pool,
-		      daos_handle_t coh);
+		      daos_handle_t coh, struct vos_object *obj);
 
 int
 vos_tx_begin(struct dtx_handle *dth, struct umem_instance *umm, bool is_sysdb);
@@ -1312,7 +1312,7 @@ vos_dedup_invalidate(struct vos_pool *pool);
 
 umem_off_t
 vos_reserve_scm(struct vos_container *cont, struct umem_rsrvd_act *rsrvd_scm,
-		daos_size_t size);
+		daos_size_t size, struct vos_object *obj);
 int
 vos_publish_scm(struct umem_instance *umm, struct umem_rsrvd_act *rsrvd_scm, bool publish);
 int
@@ -1329,6 +1329,12 @@ vos_pool2umm(struct vos_pool *pool)
 	return &pool->vp_umm;
 }
 
+static inline struct umem_store *
+vos_pool2store(struct vos_pool *pool)
+{
+	return &pool->vp_umm.umm_pool->up_store;
+}
+
 static inline struct umem_instance *
 vos_cont2umm(struct vos_container *cont)
 {
@@ -1844,4 +1850,42 @@ vos_io_scm(struct vos_pool *pool, daos_iod_type_t type, daos_size_t size, enum v
 int
 vos_insert_oid(struct dtx_handle *dth, struct vos_container *cont, daos_unit_oid_t *oid);
 
+static inline bool
+vos_pool_is_p2(struct vos_pool *pool)
+{
+	struct umem_store *store = vos_pool2store(pool);
+
+	return store->store_type == DAOS_MD_BMEM_V2;
+}
+
+static inline umem_off_t
+vos_obj_alloc(struct umem_instance *umm, struct vos_object *obj, size_t size, bool zeroing)
+{
+
+	if (obj != NULL && vos_pool_is_p2(vos_obj2pool(obj))) {
+		D_ASSERT(obj->obj_bkt_allot == 1);
+		if (zeroing)
+			return umem_zalloc_from_bucket(umm, size, obj->obj_bkt_ids[0]);
+
+		return umem_alloc_from_bucket(umm, size, obj->obj_bkt_ids[0]);
+	}
+
+	if (zeroing)
+		return umem_zalloc(umm, size);
+
+	return umem_alloc(umm, size);
+}
+
+static inline umem_off_t
+vos_obj_reserve(struct umem_instance *umm, struct vos_object *obj,
+		struct umem_rsrvd_act *rsrvd_scm, daos_size_t size)
+{
+	if (obj != NULL && vos_pool_is_p2(vos_obj2pool(obj))) {
+		D_ASSERT(obj->obj_bkt_allot == 1);
+		return umem_reserve_from_bucket(umm, rsrvd_scm, size, obj->obj_bkt_ids[0]);
+	}
+
+	return umem_reserve(umm, rsrvd_scm, size);
+}
+
 #endif /* __VOS_INTERNAL_H__ */
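Note: vos_obj_alloc() and vos_obj_reserve() above fall back to plain umem_zalloc()/umem_alloc()/umem_reserve() when no object is supplied or the pool is not an md-on-ssd phase2 (DAOS_MD_BMEM_V2) pool. As a rough sketch of the intended call pattern — it mirrors the ktr_node_alloc()/svt_node_alloc() callbacks changed in vos_tree.c further down; the callback name below is illustrative only, not part of the patch:

	/* Illustrative sketch: a btree node-allocation callback that places
	 * tree metadata in the object's evict-able bucket on a phase2 pool,
	 * or on the default heap otherwise. */
	static umem_off_t
	example_node_alloc(struct btr_instance *tins, int size)
	{
		/* For object-level trees, tins->ti_priv now carries a
		 * struct vos_object * whose bucket ID has already been
		 * resolved (see vos_obj_pin() in vos_obj_cache.c below). */
		return vos_obj_alloc(&tins->ti_umm, tins->ti_priv, size, true);
	}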
" DF_RC "\n", DP_UOID(oid), DP_RC(rc)); diff --git a/src/vos/vos_layout.h b/src/vos/vos_layout.h index 72459544c27..0400a351175 100644 --- a/src/vos/vos_layout.h +++ b/src/vos/vos_layout.h @@ -377,4 +377,17 @@ struct vos_obj_df { struct btr_root vo_tree; }; +#define VOS_OBJ_BKTS_MAX 4 + +/* + * VOS object durable format for md-on-ssd phase2. The size is fit to the 128 bytes + * slab (see slab_map[] defined in mem.c). + */ +struct vos_obj_p2_df { + struct vos_obj_df p2_obj_df; + uint32_t p2_bkt_ids[VOS_OBJ_BKTS_MAX]; + uint64_t p2_reserved; +}; +D_CASSERT(sizeof(struct vos_obj_p2_df) == D_ALIGNUP(sizeof(struct vos_obj_df), 32)); + #endif diff --git a/src/vos/vos_obj.c b/src/vos/vos_obj.c index b4e772b1e29..4101c3fa261 100644 --- a/src/vos/vos_obj.c +++ b/src/vos/vos_obj.c @@ -1065,7 +1065,8 @@ key_iter_fetch_root(struct vos_obj_iter *oiter, vos_iter_type_t type, * subtree */ if (krec->kr_bmap & KREC_BF_EVT) { - vos_evt_desc_cbs_init(&cbs, vos_obj2pool(obj), vos_cont2hdl(obj->obj_cont)); + vos_evt_desc_cbs_init(&cbs, vos_obj2pool(obj), vos_cont2hdl(obj->obj_cont), + obj); rc = evt_open(&krec->kr_evt, info->ii_uma, &cbs, &info->ii_tree_hdl); if (rc) { D_DEBUG(DB_TRACE, @@ -1077,7 +1078,7 @@ key_iter_fetch_root(struct vos_obj_iter *oiter, vos_iter_type_t type, info->ii_fake_akey_flag = VOS_IT_DKEY_EV; } else { rc = dbtree_open_inplace_ex(&krec->kr_btr, info->ii_uma, - vos_cont2hdl(obj->obj_cont), vos_obj2pool(obj), + vos_cont2hdl(obj->obj_cont), obj, &info->ii_tree_hdl); if (rc) { D_DEBUG(DB_TRACE, @@ -2037,7 +2038,7 @@ vos_obj_akey_iter_nested_prep(vos_iter_type_t type, struct vos_iter_info *info, } rc = dbtree_open_inplace_ex(info->ii_btr, info->ii_uma, vos_cont2hdl(obj->obj_cont), - vos_obj2pool(obj), &toh); + obj, &toh); if (rc) { D_DEBUG(DB_TRACE, "Failed to open tree for iterator:" @@ -2094,7 +2095,7 @@ vos_obj_iter_sv_nested_prep(vos_iter_type_t type, struct vos_iter_info *info, } rc = dbtree_open_inplace_ex(info->ii_btr, info->ii_uma, vos_cont2hdl(obj->obj_cont), - vos_obj2pool(obj), &toh); + obj, &toh); if (rc) { D_DEBUG(DB_TRACE, "Failed to open tree for iterator:" @@ -2154,7 +2155,7 @@ vos_obj_ev_iter_nested_prep(vos_iter_type_t type, struct vos_iter_info *info, goto prepare; } - vos_evt_desc_cbs_init(&cbs, vos_obj2pool(obj), vos_cont2hdl(obj->obj_cont)); + vos_evt_desc_cbs_init(&cbs, vos_obj2pool(obj), vos_cont2hdl(obj->obj_cont), obj); rc = evt_open(info->ii_evt, info->ii_uma, &cbs, &toh); if (rc) { D_DEBUG(DB_TRACE, diff --git a/src/vos/vos_obj.h b/src/vos/vos_obj.h index 2ccc8d71988..7b254ad2d75 100644 --- a/src/vos/vos_obj.h +++ b/src/vos/vos_obj.h @@ -47,12 +47,18 @@ struct vos_object { struct vos_obj_df *obj_df; /** backref to container */ struct vos_container *obj_cont; + /* Handle for the pinned object */ + struct umem_pin_handle *obj_pin_hdl; + /** Bucket IDs for the object */ + uint32_t obj_bkt_ids[VOS_OBJ_BKTS_MAX]; /** nobody should access this object */ bool obj_zombie; /** Object is held for discard */ uint32_t obj_discard : 1, /** If non-zero, object is held for aggregation */ - obj_aggregate : 1; + obj_aggregate : 1, + /** Evict-able bucket is already allocated */ + obj_bkt_allot : 1; }; enum { diff --git a/src/vos/vos_obj_cache.c b/src/vos/vos_obj_cache.c index 8845eae0085..502aafa6306 100644 --- a/src/vos/vos_obj_cache.c +++ b/src/vos/vos_obj_cache.c @@ -245,12 +245,58 @@ obj_get(struct daos_lru_cache *occ, struct vos_container *cont, daos_unit_oid_t return rc; } +static inline void +vos_obj_unpin(struct vos_object *obj) +{ + struct vos_pool *pool = 
diff --git a/src/vos/vos_obj_cache.c b/src/vos/vos_obj_cache.c
index 8845eae0085..502aafa6306 100644
--- a/src/vos/vos_obj_cache.c
+++ b/src/vos/vos_obj_cache.c
@@ -245,12 +245,58 @@ obj_get(struct daos_lru_cache *occ, struct vos_container *cont, daos_unit_oid_t
 	return rc;
 }
 
+static inline void
+vos_obj_unpin(struct vos_object *obj)
+{
+	struct vos_pool		*pool = vos_obj2pool(obj);
+	struct umem_store	*store = vos_pool2store(pool);
+
+	if (obj->obj_pin_hdl != NULL) {
+		umem_cache_unpin(store, obj->obj_pin_hdl);
+		obj->obj_pin_hdl = NULL;
+	}
+}
+
+/* Support single evict-able bucket for this moment */
+static inline int
+vos_obj_pin(struct vos_object *obj)
+{
+	struct vos_pool		*pool = vos_obj2pool(obj);
+	struct umem_store	*store = vos_pool2store(pool);
+	struct umem_cache_range	 rg;
+
+	if (!vos_pool_is_p2(pool))
+		return 0;
+
+	if (!obj->obj_bkt_allot) {
+		if (!obj->obj_df) {
+			/* TODO: Revise all vos_obj_hold() callers to move it out of tx! */
+			/* obj->obj_bkt_ids[0] = umem_allot_mb_evictable(vos_pool2umm(pool), 0); */
+			obj->obj_bkt_ids[0] = UMEM_DEFAULT_MBKT_ID;
+		} else {
+			struct vos_obj_p2_df	*p2 = (struct vos_obj_p2_df *)obj->obj_df;
+
+			obj->obj_bkt_ids[0] = p2->p2_bkt_ids[0];
+		}
+		obj->obj_bkt_allot = 1;
+	}
+
+	D_ASSERT(obj->obj_pin_hdl == NULL);
+	if (obj->obj_bkt_ids[0] == UMEM_DEFAULT_MBKT_ID)
+		return 0;
+
+	rg.cr_off = umem_get_mb_base_offset(vos_pool2umm(pool), obj->obj_bkt_ids[0]);
+	rg.cr_size = store->cache->ca_page_sz;
+
+	return umem_cache_pin(store, &rg, 1, false, &obj->obj_pin_hdl);
+}
+
 static inline void
 obj_release(struct daos_lru_cache *occ, struct vos_object *obj, bool evict)
 {
 	D_ASSERT(obj != NULL);
 
-	/* TODO: Unpin the object in md-on-ssd phase II */
+	vos_obj_unpin(obj);
 
 	if (obj == &obj_local) {
 		clean_object(obj);
@@ -301,6 +347,8 @@ cache_object(struct daos_lru_cache *occ, struct vos_object **objp)
 	obj_new->obj_sync_epoch = obj_local.obj_sync_epoch;
 	obj_new->obj_df = obj_local.obj_df;
 	obj_new->obj_zombie = obj_local.obj_zombie;
+	obj_new->obj_bkt_allot = obj_local.obj_bkt_allot;
+	obj_new->obj_pin_hdl = obj_local.obj_pin_hdl;
 
 	obj_local.obj_toh = DAOS_HDL_INVAL;
 	obj_local.obj_ih = DAOS_HDL_INVAL;
@@ -363,7 +411,9 @@ vos_obj_check_discard(struct vos_container *cont, daos_unit_oid_t oid, uint64_t
 	if (rc)
 		return rc;
 
-	/* TODO: Pin object in memory */
+	rc = vos_obj_pin(obj);
+	if (rc)
+		return rc;
 
 	if (check_discard(obj, flags))
 		/* Update request will retry with this error */
@@ -453,6 +503,10 @@ vos_obj_hold(struct vos_container *cont, daos_unit_oid_t oid, daos_epoch_range_t
 	D_ASSERT(cont != NULL);
 	D_ASSERT(cont->vc_pool);
 	D_ASSERT(obj_p != NULL);
+
+	/* TODO: Revise all vos_obj_hold() callers to move it out of tx! */
+	/* D_ASSERT(!vos_pool_is_p2(cont->vc_pool) || umem_tx_none(vos_pool2umm(cont->vc_pool))); */
+
 	*obj_p = NULL;
 
 	occ = vos_obj_cache_get(cont->vc_pool->vp_sysdb);
@@ -507,8 +561,16 @@ vos_obj_hold(struct vos_container *cont, daos_unit_oid_t oid, daos_epoch_range_t
 		D_ASSERT(tmprc == 0); /* Non-zero only valid for akey */
 	}
 
-	/* TODO: Pin the object in memory in md-on-ssd phase II. Revise the 'obj_local' implementation
-	 * then, since this function could yield. */
+	/* For md-on-ssd phase2 pool, add object to cache before yield in vos_obj_pin() */
+	if (obj == &obj_local && vos_pool_is_p2(cont->vc_pool)) {
+		rc = cache_object(occ, &obj);
+		if (rc != 0)
+			goto failed;
+	}
+
+	rc = vos_obj_pin(obj);
+	if (rc)
+		goto failed;
 
 	/* It's done for DAOS_INTENT_UPDATE or DAOS_INTENT_PUNCH or DAOS_INTENT_KILL */
 	if (intent == DAOS_INTENT_UPDATE || intent == DAOS_INTENT_PUNCH ||
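Note: together with the vos_obj_hold() changes above, the intended lifecycle for a phase2 pool is hold -> resolve and pin the evict-able bucket -> use -> release/unpin; the commented-out assert suggests callers are eventually expected to hold outside of a umem transaction. A caller-side sketch under those assumptions (function and variable names are illustrative only, not part of the patch):

	/* Illustrative sketch: hold an object, use it while its evict-able
	 * bucket stays pinned in the umem cache, then release it, which
	 * unpins the bucket via vos_obj_unpin(). */
	static int
	example_hold_and_release(struct vos_container *cont, daos_unit_oid_t oid,
				 daos_epoch_range_t *epr)
	{
		struct vos_object	*obj;
		int			 rc;

		rc = vos_obj_hold(cont, oid, epr, 0 /* bound */, VOS_OBJ_VISIBLE,
				  DAOS_INTENT_DEFAULT, &obj, NULL /* ts_set */);
		if (rc != 0)
			return rc;

		/* ... access obj->obj_df / object trees while pinned ... */

		vos_obj_release(obj, 0, false);
		return 0;
	}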
diff --git a/src/vos/vos_obj_index.c b/src/vos/vos_obj_index.c
index 72870cfd76e..e6111cd0746 100644
--- a/src/vos/vos_obj_index.c
+++ b/src/vos/vos_obj_index.c
@@ -47,7 +47,8 @@ oi_hkey_size(void)
 static int
 oi_rec_msize(int alloc_overhead)
 {
-	return alloc_overhead + sizeof(struct vos_obj_df);
+	/* This function is only used for metadata overhead estimation. */
+	return alloc_overhead + D_ALIGNUP(sizeof(struct vos_obj_df), 32);
 }
 
 static void
@@ -67,6 +68,15 @@ oi_hkey_cmp(struct btr_instance *tins, struct btr_record *rec, void *hkey)
 	return dbtree_key_cmp_rc(memcmp(oid1, oid2, sizeof(*oid1)));
 }
 
+static inline unsigned int
+vos_obj_df_size(struct vos_pool *pool)
+{
+	if (vos_pool_is_p2(pool))
+		return sizeof(struct vos_obj_p2_df);
+
+	return sizeof(struct vos_obj_df);
+}
+
 static int
 oi_rec_alloc(struct btr_instance *tins, d_iov_t *key_iov,
 	     d_iov_t *val_iov, struct btr_record *rec, d_iov_t *val_out)
@@ -76,10 +86,11 @@ oi_rec_alloc(struct btr_instance *tins, d_iov_t *key_iov,
 	struct vos_obj_df	*obj;
 	daos_unit_oid_t		*key;
 	umem_off_t		 obj_off;
+	struct vos_pool		*pool = (struct vos_pool *)tins->ti_priv;
 	int			 rc;
 
 	/* Allocate a PMEM value of type vos_obj_df */
-	obj_off = umem_zalloc(&tins->ti_umm, sizeof(struct vos_obj_df));
+	obj_off = umem_zalloc(&tins->ti_umm, vos_obj_df_size(pool));
 	if (UMOFF_IS_NULL(obj_off))
 		return -DER_NOSPACE;
 
@@ -100,11 +111,11 @@ oi_rec_alloc(struct btr_instance *tins, d_iov_t *key_iov,
 	} else {
 		struct vos_obj_df *new_obj = val_out->iov_buf;
 
-		memcpy(obj, new_obj, sizeof(*obj));
+		memcpy(obj, new_obj, vos_obj_df_size(pool));
 		obj->vo_id = *key;
 	}
 
-	d_iov_set(val_iov, obj, sizeof(struct vos_obj_df));
+	d_iov_set(val_iov, obj, vos_obj_df_size(pool));
 	rec->rec_off = obj_off;
 
 	/* For new created object, commit it synchronously to reduce
@@ -176,7 +187,7 @@ oi_rec_fetch(struct btr_instance *tins, struct btr_record *rec,
 		DP_UOID(obj->vo_id), rec->rec_off);
 
 	D_ASSERT(val_iov != NULL);
-	d_iov_set(val_iov, obj, sizeof(struct vos_obj_df));
+	d_iov_set(val_iov, obj, vos_obj_df_size((struct vos_pool *)tins->ti_priv));
 
 	return 0;
 }
@@ -504,7 +515,7 @@ oi_iter_nested_tree_fetch(struct vos_iterator *iter, vos_iter_type_t type,
 		return rc;
 	}
 
-	D_ASSERT(rec_iov.iov_len == sizeof(struct vos_obj_df));
+	D_ASSERT(rec_iov.iov_len == vos_obj_df_size(oiter->oit_cont->vc_pool));
 	obj = (struct vos_obj_df *)rec_iov.iov_buf;
 
 	rc = oi_iter_ilog_check(obj, oiter, &info->ii_epr, false);
@@ -610,7 +621,7 @@ oi_iter_match_probe(struct vos_iterator *iter, daos_anchor_t *anchor, uint32_t f
 			goto failed;
 		}
 
-		D_ASSERT(iov.iov_len == sizeof(struct vos_obj_df));
+		D_ASSERT(iov.iov_len == vos_obj_df_size(oiter->oit_cont->vc_pool));
 		obj = (struct vos_obj_df *)iov.iov_buf;
 
 		if (iter->it_filter_cb != NULL && (flags & VOS_ITER_PROBE_AGAIN) == 0) {
@@ -767,7 +778,7 @@ oi_iter_fetch(struct vos_iterator *iter, vos_iter_entry_t *it_entry,
 		return rc;
 	}
 
-	D_ASSERT(rec_iov.iov_len == sizeof(struct vos_obj_df));
+	D_ASSERT(rec_iov.iov_len == vos_obj_df_size(oiter->oit_cont->vc_pool));
 
 	return oi_iter_fill(rec_iov.iov_buf, oiter, false, it_entry);
 }
@@ -818,7 +829,7 @@ oi_iter_check_punch(daos_handle_t ih)
 		  "Probe should be done before aggregation\n");
 	if (rc != 0)
 		return rc;
-	D_ASSERT(rec_iov.iov_len == sizeof(struct vos_obj_df));
+	D_ASSERT(rec_iov.iov_len == vos_obj_df_size(oiter->oit_cont->vc_pool));
 	obj = (struct vos_obj_df *)rec_iov.iov_buf;
 	oid = obj->vo_id;
 
@@ -873,7 +884,7 @@ oi_iter_aggregate(daos_handle_t ih, bool range_discard)
 		  "Probe should be done before aggregation\n");
 	if (rc != 0)
 		return rc;
-	D_ASSERT(rec_iov.iov_len == sizeof(struct vos_obj_df));
+	D_ASSERT(rec_iov.iov_len == vos_obj_df_size(oiter->oit_cont->vc_pool));
 	obj = (struct vos_obj_df *)rec_iov.iov_buf;
 	oid = obj->vo_id;
 
diff --git a/src/vos/vos_query.c b/src/vos/vos_query.c
index e924e4016b6..b4d414012e5 100644
--- a/src/vos/vos_query.c
+++ b/src/vos/vos_query.c
@@ -162,7 +162,7 @@ query_normal_recx(struct open_query *query, daos_recx_t *recx)
 	uint32_t		 inob;
 
-	vos_evt_desc_cbs_init(&cbs, query->qt_pool, query->qt_coh);
+	vos_evt_desc_cbs_init(&cbs, query->qt_pool, query->qt_coh, query->qt_obj);
 	rc = evt_open(query->qt_recx_root, &query->qt_pool->vp_uma, &cbs, &toh);
 	if (rc != 0)
 		return rc;
 
@@ -344,7 +344,7 @@ query_ec_recx(struct open_query *query, daos_recx_t *recx)
 	bool			 prefresh = true;
 
-	vos_evt_desc_cbs_init(&cbs, query->qt_pool, query->qt_coh);
+	vos_evt_desc_cbs_init(&cbs, query->qt_pool, query->qt_coh, query->qt_obj);
 	rc = evt_open(query->qt_recx_root, &query->qt_pool->vp_uma, &cbs, &toh);
 	if (rc != 0)
 		return rc;
 
@@ -517,7 +517,7 @@ open_and_query_key(struct open_query *query, daos_key_t *key,
 		return -DER_NONEXIST;
 
 	rc = dbtree_open_inplace_ex(to_open, &query->qt_pool->vp_uma,
-				    query->qt_coh, query->qt_pool, toh);
+				    query->qt_coh, query->qt_obj, toh);
 	if (rc != 0)
 		return rc;
 
diff --git a/src/vos/vos_tree.c b/src/vos/vos_tree.c
index c36fcaa88c5..b2245e626d9 100644
--- a/src/vos/vos_tree.c
+++ b/src/vos/vos_tree.c
@@ -154,8 +154,10 @@ ktr_hkey_gen(struct btr_instance *tins, d_iov_t *key_iov, void *hkey)
 {
 	struct ktr_hkey		*kkey = (struct ktr_hkey *)hkey;
 	struct umem_pool	*umm_pool = tins->ti_umm.umm_pool;
-	struct vos_pool		*pool = (struct vos_pool *)tins->ti_priv;
+	struct vos_pool		*pool;
 
+	D_ASSERT(tins->ti_destroy == 0);
+	pool = vos_obj2pool(tins->ti_priv);
 	D_ASSERT(key_iov->iov_len < pool->vp_pool_df->pd_scm_sz);
 
 	hkey_common_gen(key_iov, hkey);
@@ -255,7 +257,8 @@ ktr_rec_alloc(struct btr_instance *tins, d_iov_t *key_iov,
 
 	rbund = iov2rec_bundle(val_iov);
 
-	rec->rec_off = umem_zalloc(&tins->ti_umm, vos_krec_size(rbund));
+	D_ASSERT(tins->ti_destroy == 0);
+	rec->rec_off = vos_obj_alloc(&tins->ti_umm, tins->ti_priv, vos_krec_size(rbund), true);
 	if (UMOFF_IS_NULL(rec->rec_off))
 		return -DER_NOSPACE;
 
@@ -298,11 +301,15 @@ ktr_rec_free(struct btr_instance *tins, struct btr_record *rec, void *args)
 	if (rc != 0)
 		return rc;
 
-	pool = (struct vos_pool *)tins->ti_priv;
+	D_ASSERT(tins->ti_priv);
+	if (tins->ti_destroy)
+		pool = (struct vos_pool *)tins->ti_priv;
+	else
+		pool = vos_obj2pool(tins->ti_priv);
+
 	vos_ilog_ts_evict(&krec->kr_ilog, (krec->kr_bmap & KREC_BF_DKEY) ?
 			  VOS_TS_TYPE_DKEY : VOS_TS_TYPE_AKEY, pool->vp_sysdb);
 
-	D_ASSERT(tins->ti_priv);
 	gc = (krec->kr_bmap & KREC_BF_DKEY) ? GC_DKEY : GC_AKEY;
 	coh = vos_cont2hdl(args);
 	return gc_add_item(pool, coh, gc, rec->rec_off, 0);
@@ -351,7 +358,8 @@ ktr_rec_update(struct btr_instance *tins, struct btr_record *rec,
 static umem_off_t
 ktr_node_alloc(struct btr_instance *tins, int size)
 {
-	return umem_zalloc(&tins->ti_umm, size);
+	D_ASSERT(tins->ti_destroy == 0);
+	return vos_obj_alloc(&tins->ti_umm, tins->ti_priv, size, true);
 }
 
 static btr_ops_t key_btr_ops = {
@@ -620,9 +628,13 @@ svt_rec_free_internal(struct btr_instance *tins, struct btr_record *rec,
 	if (!overwrite) {
 		/* SCM value is stored together with vos_irec_df */
 		if (addr->ba_type == DAOS_MEDIA_NVME) {
-			struct vos_pool *pool = tins->ti_priv;
+			struct vos_pool *pool;
 
-			D_ASSERT(pool != NULL);
+			D_ASSERT(tins->ti_priv != NULL);
+			if (tins->ti_destroy)
+				pool = (struct vos_pool *)tins->ti_priv;
+			else
+				pool = vos_obj2pool(tins->ti_priv);
 			rc = vos_bio_addr_free(pool, addr, irec->ir_size);
 			if (rc)
 				return rc;
@@ -714,7 +726,7 @@ svt_check_availability(struct btr_instance *tins, struct btr_record *rec,
 static umem_off_t
 svt_node_alloc(struct btr_instance *tins, int size)
 {
-	return umem_zalloc(&tins->ti_umm, size);
+	return vos_obj_alloc(&tins->ti_umm, tins->ti_priv, size, true);
 }
 
 static btr_ops_t singv_btr_ops = {
@@ -802,12 +814,13 @@ evt_dop_log_del(struct umem_instance *umm, daos_epoch_t epoch,
 }
 
 void
-vos_evt_desc_cbs_init(struct evt_desc_cbs *cbs, struct vos_pool *pool,
-		      daos_handle_t coh)
+vos_evt_desc_cbs_init(struct evt_desc_cbs *cbs, struct vos_pool *pool, daos_handle_t coh,
+		      struct vos_object *obj)
 {
 	/* NB: coh is not required for destroy */
 	cbs->dc_bio_free_cb = evt_dop_bio_free;
 	cbs->dc_bio_free_args = (void *)pool;
+	cbs->dc_alloc_arg = (void *)obj;
 	cbs->dc_log_status_cb = evt_dop_log_status;
 	cbs->dc_log_status_args = (void *)(unsigned long)coh.cookie;
 	cbs->dc_log_add_cb = evt_dop_log_add;
@@ -829,7 +842,7 @@ tree_open_create(struct vos_object *obj, enum vos_tree_class tclass, int flags,
 	int			 unexpected_flag;
 	int			 rc = 0;
 
-	vos_evt_desc_cbs_init(&cbs, pool, coh);
+	vos_evt_desc_cbs_init(&cbs, pool, coh, obj);
 
 	if ((krec->kr_bmap & (KREC_BF_BTR | KREC_BF_EVT)) == 0)
 		goto create;
@@ -855,7 +868,7 @@ tree_open_create(struct vos_object *obj, enum vos_tree_class tclass, int flags,
 	if (expected_flag == KREC_BF_EVT) {
 		rc = evt_open(&krec->kr_evt, uma, &cbs, sub_toh);
 	} else {
-		rc = dbtree_open_inplace_ex(&krec->kr_btr, uma, coh, pool, sub_toh);
+		rc = dbtree_open_inplace_ex(&krec->kr_btr, uma, coh, obj, sub_toh);
 	}
 	if (rc != 0)
 		D_ERROR("Failed to open tree: " DF_RC "\n", DP_RC(rc));
@@ -924,7 +937,7 @@ tree_open_create(struct vos_object *obj, enum vos_tree_class tclass, int flags,
 		rc = dbtree_create_inplace_ex(ta->ta_class, tree_feats,
 					      ta->ta_order, uma, &krec->kr_btr,
-					      coh, pool, sub_toh);
+					      coh, obj, sub_toh);
 		if (rc != 0) {
 			D_ERROR("Failed to create btree: "DF_RC"\n", DP_RC(rc));
 			goto out;
 		}
@@ -1206,14 +1219,13 @@ obj_tree_init(struct vos_object *obj)
 					       ta->ta_order, vos_obj2uma(obj),
 					       &obj->obj_df->vo_tree,
 					       vos_cont2hdl(obj->obj_cont),
-					       vos_obj2pool(obj),
-					       &obj->obj_toh);
+					       obj, &obj->obj_toh);
 	} else {
 		D_DEBUG(DB_DF, "Open btree for object\n");
 		rc = dbtree_open_inplace_ex(&obj->obj_df->vo_tree,
 					    vos_obj2uma(obj),
 					    vos_cont2hdl(obj->obj_cont),
-					    vos_obj2pool(obj), &obj->obj_toh);
+					    obj, &obj->obj_toh);
 	}
 	if (rc)