Skip to content

Commit

Permalink
DAOS-14317 vos: initial changes for the phase2 object pre-load
Browse files Browse the repository at this point in the history
- Introduced a new durable format 'vos_obj_p2_df' for the md-on-ssd phase2
  object, in which at most 4 evict-able bucket IDs can be stored.

- Changed vos_obj_hold() & vos_obj_release() to pin and unpin the object,
  respectively.

- Changed the private data of VOS dkey/akey/value trees from 'vos_pool' to
  'vos_object'; the private data will be used for allocating/reserving from
  the evict-able bucket.

- Moved the vos_obj_hold() call from vos_update_end() to vos_update_begin()
  for the phase2 pool, and reserve the value from the object's evict-able bucket.

TODO:
- Reorg GC code to pre-load objects before starting transaction.
- Reorg DTX commit code to pre-load objects before starting transaction.
- Reorg multiple-objects tx code to pre-load objects.
- Improve engine scheduler to take both non-evict-able & evict-able zones
  space pressure into account.

Required-githooks: true

Signed-off-by: Niu Yawei <[email protected]>
  • Loading branch information
NiuYawei committed Aug 29, 2024
1 parent f6c4816 commit 7cf7bae
Show file tree
Hide file tree
Showing 18 changed files with 270 additions and 96 deletions.
9 changes: 7 additions & 2 deletions src/common/btree.c
Original file line number Diff line number Diff line change
Expand Up @@ -945,8 +945,12 @@ btr_root_alloc(struct btr_context *tcx)
struct btr_instance *tins = &tcx->tc_tins;
struct btr_root *root;

tins->ti_root_off = umem_zalloc(btr_umm(tcx),
sizeof(struct btr_root));
if (btr_ops(tcx)->to_node_alloc != NULL)
tins->ti_root_off = btr_ops(tcx)->to_node_alloc(&tcx->tc_tins,
sizeof(struct btr_root));
else
tins->ti_root_off = umem_zalloc(btr_umm(tcx), sizeof(struct btr_root));

if (UMOFF_IS_NULL(tins->ti_root_off))
return btr_umm(tcx)->umm_nospc_rc;

Expand Down Expand Up @@ -3884,6 +3888,7 @@ btr_tree_destroy(struct btr_context *tcx, void *args, bool *destroyed)
tcx->tc_tins.ti_root_off, tcx->tc_order);

root = tcx->tc_tins.ti_root;
tcx->tc_tins.ti_destroy = 1;
if (root && !UMOFF_IS_NULL(root->tr_node)) {
/* destroy the root and all descendants */
rc = btr_node_destroy(tcx, root->tr_node, args, &empty);
Expand Down
2 changes: 2 additions & 0 deletions src/include/daos/btree.h
Original file line number Diff line number Diff line change
Expand Up @@ -429,6 +429,8 @@ struct btr_instance {
struct btr_root *ti_root;
/** Customized operations for the tree */
btr_ops_t *ti_ops;
/** The context is used for tree destroy */
unsigned int ti_destroy : 1;
};

/**
Expand Down
2 changes: 0 additions & 2 deletions src/include/daos/mem.h
Original file line number Diff line number Diff line change
Expand Up @@ -451,8 +451,6 @@ typedef void
umem_cache_wait_cb_t(void *arg, uint64_t chkpt_tx, uint64_t *committed_tx);

/**
* Write all dirty pages before @wal_tx to MD blob. (XXX: not yet implemented)
*
* This function can yield internally, it is called by checkpoint service of upper level stack.
*
* \param[in] store The umem store
Expand Down
6 changes: 5 additions & 1 deletion src/include/daos_srv/evtree.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/**
* (C) Copyright 2017-2023 Intel Corporation.
* (C) Copyright 2017-2024 Intel Corporation.
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand Down Expand Up @@ -70,6 +70,10 @@ struct evt_desc_cbs {
struct evt_desc *desc,
daos_size_t nob, void *args);
void *dc_bio_free_args;
/**
* Argument for allocation.
*/
void *dc_alloc_arg;
/**
* Availability check, it is for data tracked by DTX undo log.
* It is optional, EVTree always treats data extent is available if
Expand Down
2 changes: 1 addition & 1 deletion src/tests/ftest/util/telemetry_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -424,7 +424,7 @@ class TelemetryUtils():
ENGINE_NVME_INTEL_VENDOR_METRICS
ENGINE_MEM_USAGE_METRICS = [
"engine_mem_vos_dtx_cmt_ent_48",
"engine_mem_vos_vos_obj_360",
"engine_mem_vos_vos_obj_384",
"engine_mem_vos_vos_lru_size",
"engine_mem_dtx_dtx_leader_handle_360"]
ENGINE_MEM_TOTAL_USAGE_METRICS = [
Expand Down
10 changes: 6 additions & 4 deletions src/vos/evtree.c
Original file line number Diff line number Diff line change
Expand Up @@ -1443,8 +1443,9 @@ evt_node_alloc(struct evt_context *tcx, unsigned int flags,
struct evt_node *nd;
umem_off_t nd_off;
bool leaf = (flags & EVT_NODE_LEAF);
struct vos_object *obj = tcx->tc_desc_cbs.dc_alloc_arg;

nd_off = umem_zalloc(evt_umm(tcx), evt_node_size(tcx, leaf));
nd_off = vos_obj_alloc(evt_umm(tcx), obj, evt_node_size(tcx, leaf), true);
if (UMOFF_IS_NULL(nd_off))
return -DER_NOSPACE;

Expand Down Expand Up @@ -3249,8 +3250,9 @@ evt_common_insert(struct evt_context *tcx, struct evt_node *nd,
}

if (leaf) {
umem_off_t desc_off;
uint32_t csum_buf_size = 0;
umem_off_t desc_off;
uint32_t csum_buf_size = 0;
struct vos_object *obj = tcx->tc_desc_cbs.dc_alloc_arg;

if (ci_is_valid(&ent->ei_csum))
csum_buf_size = ci_csums_len(ent->ei_csum);
Expand All @@ -3263,7 +3265,7 @@ evt_common_insert(struct evt_context *tcx, struct evt_node *nd,
D_DEBUG(DB_TRACE, "Allocating an extra %d bytes "
"for checksum", csum_buf_size);
}
desc_off = umem_zalloc(evt_umm(tcx), desc_size);
desc_off = vos_obj_alloc(evt_umm(tcx), obj, desc_size, true);
if (UMOFF_IS_NULL(desc_off))
return -DER_NOSPACE;

Expand Down
54 changes: 27 additions & 27 deletions src/vos/tests/vts_io.c
Original file line number Diff line number Diff line change
Expand Up @@ -898,7 +898,7 @@ io_update_and_fetch_dkey(struct io_test_args *arg, daos_epoch_t update_epoch,
static inline int
hold_obj(struct vos_container *cont, daos_unit_oid_t oid, daos_epoch_range_t *epr,
daos_epoch_t bound, uint64_t flags, uint32_t intent, struct vos_object **obj_p,
struct vos_ts_set *ts_set)
struct vos_ts_set *ts_set, struct umem_instance *umm)
{
int rc;

Expand All @@ -908,7 +908,16 @@ hold_obj(struct vos_container *cont, daos_unit_oid_t oid, daos_epoch_range_t *ep

if (flags & VOS_OBJ_CREATE) {
assert_ptr_not_equal(*obj_p, NULL);

if (umm != NULL) {
rc = umem_tx_begin(umm, NULL);
assert_rc_equal(rc, 0);
}

rc = vos_obj_incarnate(*obj_p, epr, bound, flags, intent, ts_set);

if (umm != NULL)
rc = umem_tx_end(umm, rc);
}

return rc;
Expand All @@ -926,7 +935,8 @@ hold_objects(struct vos_object **objs, daos_handle_t *coh, daos_unit_oid_t *oid,
hold_flags |= VOS_OBJ_VISIBLE;
for (i = start; i < end; i++) {
rc = hold_obj(vos_hdl2cont(*coh), *oid, &epr, 0, hold_flags,
no_create ? DAOS_INTENT_DEFAULT : DAOS_INTENT_UPDATE, &objs[i], 0);
no_create ? DAOS_INTENT_DEFAULT : DAOS_INTENT_UPDATE,
&objs[i], 0, NULL);
if (rc != exp_rc)
return 1;
}
Expand Down Expand Up @@ -1006,82 +1016,72 @@ io_obj_cache_test(void **state)

ummg = vos_cont2umm(vos_hdl2cont(ctx->tc_co_hdl));
umml = vos_cont2umm(vos_hdl2cont(l_coh));
rc = umem_tx_begin(ummg, NULL);
assert_rc_equal(rc, 0);

rc = hold_obj(vos_hdl2cont(ctx->tc_co_hdl), oids[0], &epr, 0,
VOS_OBJ_CREATE | VOS_OBJ_VISIBLE, DAOS_INTENT_UPDATE, &objs[0], 0);
VOS_OBJ_CREATE | VOS_OBJ_VISIBLE, DAOS_INTENT_UPDATE, &objs[0], 0, ummg);
assert_rc_equal(rc, 0);

/** Hold object for discard */
rc = hold_obj(vos_hdl2cont(ctx->tc_co_hdl), oids[0], &epr, 0, VOS_OBJ_DISCARD,
DAOS_INTENT_DISCARD, &obj1, 0);
DAOS_INTENT_DISCARD, &obj1, 0, ummg);
assert_rc_equal(rc, 0);
/** Second discard should fail */
rc = hold_obj(vos_hdl2cont(ctx->tc_co_hdl), oids[0], &epr, 0, VOS_OBJ_DISCARD,
DAOS_INTENT_DISCARD, &obj2, 0);
DAOS_INTENT_DISCARD, &obj2, 0, ummg);
assert_rc_equal(rc, -DER_BUSY);
/** Should prevent simultaneous aggregation */
rc = hold_obj(vos_hdl2cont(ctx->tc_co_hdl), oids[0], &epr, 0, VOS_OBJ_AGGREGATE,
DAOS_INTENT_PURGE, &obj2, 0);
DAOS_INTENT_PURGE, &obj2, 0, ummg);
assert_rc_equal(rc, -DER_BUSY);
/** Should prevent simultaneous hold for create as well */
rc = hold_obj(vos_hdl2cont(ctx->tc_co_hdl), oids[0], &epr, 0,
VOS_OBJ_CREATE | VOS_OBJ_VISIBLE, DAOS_INTENT_UPDATE, &obj2, 0);
VOS_OBJ_CREATE | VOS_OBJ_VISIBLE, DAOS_INTENT_UPDATE, &obj2,
0, ummg);
assert_rc_equal(rc, -DER_UPDATE_AGAIN);

/** Need to be able to hold for read though or iteration won't work */
rc = hold_obj(vos_hdl2cont(ctx->tc_co_hdl), oids[0], &epr, 0, VOS_OBJ_VISIBLE,
DAOS_INTENT_DEFAULT, &obj2, 0);
DAOS_INTENT_DEFAULT, &obj2, 0, ummg);
vos_obj_release(obj2, 0, false);
vos_obj_release(obj1, VOS_OBJ_DISCARD, false);

/** Hold object for aggregation */
rc = hold_obj(vos_hdl2cont(ctx->tc_co_hdl), oids[0], &epr, 0, VOS_OBJ_AGGREGATE,
DAOS_INTENT_PURGE, &obj1, 0);
DAOS_INTENT_PURGE, &obj1, 0, ummg);
assert_rc_equal(rc, 0);
/** Discard should fail */
rc = hold_obj(vos_hdl2cont(ctx->tc_co_hdl), oids[0], &epr, 0, VOS_OBJ_DISCARD,
DAOS_INTENT_DISCARD, &obj2, 0);
DAOS_INTENT_DISCARD, &obj2, 0, ummg);
assert_rc_equal(rc, -DER_BUSY);
/** Second aggregation should fail */
rc = hold_obj(vos_hdl2cont(ctx->tc_co_hdl), oids[0], &epr, 0, VOS_OBJ_AGGREGATE,
DAOS_INTENT_PURGE, &obj2, 0);
DAOS_INTENT_PURGE, &obj2, 0, ummg);
assert_rc_equal(rc, -DER_BUSY);
/** Simultaneous create should work */
rc = hold_obj(vos_hdl2cont(ctx->tc_co_hdl), oids[0], &epr, 0,
VOS_OBJ_CREATE | VOS_OBJ_VISIBLE, DAOS_INTENT_UPDATE, &obj2, 0);
VOS_OBJ_CREATE | VOS_OBJ_VISIBLE, DAOS_INTENT_UPDATE, &obj2, 0, ummg);
assert_rc_equal(rc, 0);
vos_obj_release(obj2, 0, false);

/** Need to be able to hold for read though or iteration won't work */
rc = hold_obj(vos_hdl2cont(ctx->tc_co_hdl), oids[0], &epr, 0, VOS_OBJ_VISIBLE,
DAOS_INTENT_DEFAULT, &obj2, 0);
DAOS_INTENT_DEFAULT, &obj2, 0, ummg);
vos_obj_release(obj2, 0, false);
vos_obj_release(obj1, VOS_OBJ_AGGREGATE, false);

/** Now that other one is done, this should work */
rc = hold_obj(vos_hdl2cont(ctx->tc_co_hdl), oids[0], &epr, 0, VOS_OBJ_DISCARD,
DAOS_INTENT_DISCARD, &obj2, 0);
DAOS_INTENT_DISCARD, &obj2, 0, ummg);
assert_rc_equal(rc, 0);
vos_obj_release(obj2, VOS_OBJ_DISCARD, false);

rc = umem_tx_end(ummg, 0);
assert_rc_equal(rc, 0);

vos_obj_release(objs[0], 0, false);

rc = umem_tx_begin(umml, NULL);
assert_rc_equal(rc, 0);

rc = hold_obj(vos_hdl2cont(l_coh), oids[1], &epr, 0,
VOS_OBJ_CREATE | VOS_OBJ_VISIBLE, DAOS_INTENT_UPDATE, &objs[0], 0);
VOS_OBJ_CREATE | VOS_OBJ_VISIBLE, DAOS_INTENT_UPDATE, &objs[0], 0, umml);
assert_rc_equal(rc, 0);
vos_obj_release(objs[0], 0, false);

rc = umem_tx_end(umml, 0);
assert_rc_equal(rc, 0);

rc = hold_objects(objs, &ctx->tc_co_hdl, &oids[0], 0, 10, true, 0);
assert_int_equal(rc, 0);

Expand All @@ -1091,7 +1091,7 @@ io_obj_cache_test(void **state)
rc = hold_objects(objs, &l_coh, &oids[1], 10, 15, true, 0);
assert_int_equal(rc, 0);
rc = hold_obj(vos_hdl2cont(l_coh), oids[1], &epr, 0, VOS_OBJ_VISIBLE,
DAOS_INTENT_DEFAULT, &objs[16], 0);
DAOS_INTENT_DEFAULT, &objs[16], 0, NULL);
assert_rc_equal(rc, 0);

vos_obj_release(objs[16], 0, false);
Expand Down
2 changes: 1 addition & 1 deletion src/vos/vos_aggregate.c
Original file line number Diff line number Diff line change
Expand Up @@ -984,7 +984,7 @@ reserve_segment(struct vos_object *obj, struct agg_io_context *io,

if (vos_io_scm(vos_obj2pool(obj), DAOS_IOD_ARRAY, size, VOS_IOS_AGGREGATION)) {
/** Store on SCM */
off = vos_reserve_scm(obj->obj_cont, io->ic_rsrvd_scm, size);
off = vos_reserve_scm(obj->obj_cont, io->ic_rsrvd_scm, size, obj);
if (UMOFF_IS_NULL(off)) {
now = daos_gettime_coarse();
if (now - obj->obj_cont->vc_agg_nospc_ts > VOS_NOSPC_ERROR_INTVL) {
Expand Down
4 changes: 2 additions & 2 deletions src/vos/vos_gc.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/**
* (C) Copyright 2019-2023 Intel Corporation.
* (C) Copyright 2019-2024 Intel Corporation.
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand Down Expand Up @@ -115,7 +115,7 @@ gc_drain_evt(struct vos_gc *gc, struct vos_pool *pool, daos_handle_t coh,
daos_handle_t toh;
int rc;

vos_evt_desc_cbs_init(&cbs, pool, coh);
vos_evt_desc_cbs_init(&cbs, pool, coh, NULL);
rc = evt_open(root, &pool->vp_uma, &cbs, &toh);
if (rc == -DER_NONEXIST) {
*empty = true;
Expand Down
48 changes: 46 additions & 2 deletions src/vos/vos_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -1258,7 +1258,7 @@ vos_bio_addr_free(struct vos_pool *pool, bio_addr_t *addr, daos_size_t nob);

void
vos_evt_desc_cbs_init(struct evt_desc_cbs *cbs, struct vos_pool *pool,
daos_handle_t coh);
daos_handle_t coh, struct vos_object *obj);

int
vos_tx_begin(struct dtx_handle *dth, struct umem_instance *umm, bool is_sysdb);
Expand Down Expand Up @@ -1312,7 +1312,7 @@ vos_dedup_invalidate(struct vos_pool *pool);

umem_off_t
vos_reserve_scm(struct vos_container *cont, struct umem_rsrvd_act *rsrvd_scm,
daos_size_t size);
daos_size_t size, struct vos_object *obj);
int
vos_publish_scm(struct umem_instance *umm, struct umem_rsrvd_act *rsrvd_scm, bool publish);
int
Expand All @@ -1329,6 +1329,12 @@ vos_pool2umm(struct vos_pool *pool)
return &pool->vp_umm;
}

static inline struct umem_store *
vos_pool2store(struct vos_pool *pool)
{
return &pool->vp_umm.umm_pool->up_store;
}

static inline struct umem_instance *
vos_cont2umm(struct vos_container *cont)
{
Expand Down Expand Up @@ -1844,4 +1850,42 @@ vos_io_scm(struct vos_pool *pool, daos_iod_type_t type, daos_size_t size, enum v
int
vos_insert_oid(struct dtx_handle *dth, struct vos_container *cont, daos_unit_oid_t *oid);

static inline bool
vos_pool_is_p2(struct vos_pool *pool)
{
struct umem_store *store = vos_pool2store(pool);

return store->store_type == DAOS_MD_BMEM_V2;
}

/**
 * Allocate \a size bytes through \a umm on behalf of \a obj.
 *
 * When \a obj is provided and its pool is a p2 pool, the allocation is taken
 * from the object's first bucket (obj_bkt_ids[0]); otherwise the generic umem
 * allocator is used.
 *
 * \param[in] umm	umem instance to allocate from.
 * \param[in] obj	Object the allocation belongs to, may be NULL.
 * \param[in] size	Number of bytes to allocate.
 * \param[in] zeroing	Use the zeroing allocator variant when true.
 *
 * \return		Offset returned by the underlying umem allocator.
 */
static inline umem_off_t
vos_obj_alloc(struct umem_instance *umm, struct vos_object *obj, size_t size, bool zeroing)
{
	/* No object, or not a p2 pool: fall back to the generic allocator */
	if (obj == NULL || !vos_pool_is_p2(vos_obj2pool(obj))) {
		if (!zeroing)
			return umem_alloc(umm, size);

		return umem_zalloc(umm, size);
	}

	/* The bucket path expects exactly one bucket allotted to the object */
	D_ASSERT(obj->obj_bkt_allot == 1);

	if (!zeroing)
		return umem_alloc_from_bucket(umm, size, obj->obj_bkt_ids[0]);

	return umem_zalloc_from_bucket(umm, size, obj->obj_bkt_ids[0]);
}

/**
 * Reserve \a size bytes through \a umm on behalf of \a obj.
 *
 * When \a obj is provided and its pool is a p2 pool, the reservation is made
 * from the object's first bucket (obj_bkt_ids[0]); otherwise the generic
 * umem_reserve() is used.
 *
 * \param[in] umm	umem instance to reserve from.
 * \param[in] obj	Object the reservation belongs to, may be NULL.
 * \param[in] rsrvd_scm	Reserved-action handle passed to the umem reserve call.
 * \param[in] size	Number of bytes to reserve.
 *
 * \return		Offset returned by the underlying umem reserve call.
 */
static inline umem_off_t
vos_obj_reserve(struct umem_instance *umm, struct vos_object *obj,
		struct umem_rsrvd_act *rsrvd_scm, daos_size_t size)
{
	/* No object, or not a p2 pool: plain reservation */
	if (obj == NULL || !vos_pool_is_p2(vos_obj2pool(obj)))
		return umem_reserve(umm, rsrvd_scm, size);

	/* The bucket path expects exactly one bucket allotted to the object */
	D_ASSERT(obj->obj_bkt_allot == 1);

	return umem_reserve_from_bucket(umm, rsrvd_scm, size, obj->obj_bkt_ids[0]);
}

#endif /* __VOS_INTERNAL_H__ */
Loading

0 comments on commit 7cf7bae

Please sign in to comment.