Skip to content

Commit

Permalink
DAOS-14317 vos: initial changes for the phase2 object pre-load
Browse files Browse the repository at this point in the history
- Introduced a new durable format 'vos_obj_p2_df' for the md-on-ssd phase2
  object; it can store at most 4 evict-able bucket IDs.

- Changed vos_obj_hold() & vos_obj_release() to pin and unpin the object,
  respectively.

- Changed the private data of VOS dkey/akey/value trees from 'vos_pool' to
  'vos_object'; the private data will be used for allocating/reserving from
  the evict-able bucket.

TODO:
- Move the vos_obj_hold() from vos_update_end() to vos_update_begin(), and
  change the VOS I/O code to reserve from the evict-able bucket.
- Reorg GC code to pre-load objects before starting transaction.
- Reorg DTX commit code to pre-load objects before starting transaction.
- Reorg multiple-objects tx code to pre-load objects.
- Improve the engine scheduler to take the space pressure of both
  non-evict-able & evict-able zones into account.

Required-githooks: true

Signed-off-by: Niu Yawei <[email protected]>
  • Loading branch information
NiuYawei committed Aug 27, 2024
1 parent f6c4816 commit 08b5847
Show file tree
Hide file tree
Showing 14 changed files with 203 additions and 49 deletions.
9 changes: 7 additions & 2 deletions src/common/btree.c
Original file line number Diff line number Diff line change
Expand Up @@ -945,8 +945,12 @@ btr_root_alloc(struct btr_context *tcx)
struct btr_instance *tins = &tcx->tc_tins;
struct btr_root *root;

tins->ti_root_off = umem_zalloc(btr_umm(tcx),
sizeof(struct btr_root));
if (btr_ops(tcx)->to_node_alloc != NULL)
tins->ti_root_off = btr_ops(tcx)->to_node_alloc(&tcx->tc_tins,
sizeof(struct btr_root));
else
tins->ti_root_off = umem_zalloc(btr_umm(tcx), sizeof(struct btr_root));

if (UMOFF_IS_NULL(tins->ti_root_off))
return btr_umm(tcx)->umm_nospc_rc;

Expand Down Expand Up @@ -3884,6 +3888,7 @@ btr_tree_destroy(struct btr_context *tcx, void *args, bool *destroyed)
tcx->tc_tins.ti_root_off, tcx->tc_order);

root = tcx->tc_tins.ti_root;
tcx->tc_tins.ti_destroy = 1;
if (root && !UMOFF_IS_NULL(root->tr_node)) {
/* destroy the root and all descendants */
rc = btr_node_destroy(tcx, root->tr_node, args, &empty);
Expand Down
2 changes: 2 additions & 0 deletions src/include/daos/btree.h
Original file line number Diff line number Diff line change
Expand Up @@ -429,6 +429,8 @@ struct btr_instance {
struct btr_root *ti_root;
/** Customized operations for the tree */
btr_ops_t *ti_ops;
/** Set when the tree instance is being used for tree destroy */
unsigned int ti_destroy : 1;
};

/**
Expand Down
2 changes: 0 additions & 2 deletions src/include/daos/mem.h
Original file line number Diff line number Diff line change
Expand Up @@ -451,8 +451,6 @@ typedef void
umem_cache_wait_cb_t(void *arg, uint64_t chkpt_tx, uint64_t *committed_tx);

/**
* Write all dirty pages before @wal_tx to MD blob. (XXX: not yet implemented)
*
* This function can yield internally, it is called by checkpoint service of upper level stack.
*
* \param[in] store The umem store
Expand Down
6 changes: 5 additions & 1 deletion src/include/daos_srv/evtree.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/**
* (C) Copyright 2017-2023 Intel Corporation.
* (C) Copyright 2017-2024 Intel Corporation.
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand Down Expand Up @@ -70,6 +70,10 @@ struct evt_desc_cbs {
struct evt_desc *desc,
daos_size_t nob, void *args);
void *dc_bio_free_args;
/**
* Argument for allocation.
*/
void *dc_alloc_arg;
/**
* Availability check, it is for data tracked by DTX undo log.
* It is optional, EVTree always treats data extent is available if
Expand Down
15 changes: 13 additions & 2 deletions src/vos/evtree.c
Original file line number Diff line number Diff line change
Expand Up @@ -1435,6 +1435,17 @@ evt_node_size(struct evt_context *tcx, bool leaf)
return evt_order2size(tcx->tc_order, leaf);
}

/**
 * Allocate \a size bytes for the evtree through the umem zalloc API.
 *
 * When the tree's descriptor callbacks carry an allocation argument
 * (dc_alloc_arg, a vos_object), the allocation is delegated to
 * vos_obj_zalloc() for that object; otherwise it falls back to a plain
 * umem_zalloc() on the tree's umem instance.
 *
 * \param[in] tcx	evtree context
 * \param[in] size	number of bytes to allocate
 *
 * \return		offset of the allocation as returned by the allocator
 */
static inline umem_off_t
evt_zalloc(struct evt_context *tcx, size_t size)
{
	struct vos_object *alloc_obj = tcx->tc_desc_cbs.dc_alloc_arg;

	/* No object attached to the tree: use the regular allocator */
	if (alloc_obj == NULL)
		return umem_zalloc(evt_umm(tcx), size);

	return vos_obj_zalloc(alloc_obj, size);
}

/** Allocate a evtree node */
static int
evt_node_alloc(struct evt_context *tcx, unsigned int flags,
Expand All @@ -1444,7 +1455,7 @@ evt_node_alloc(struct evt_context *tcx, unsigned int flags,
umem_off_t nd_off;
bool leaf = (flags & EVT_NODE_LEAF);

nd_off = umem_zalloc(evt_umm(tcx), evt_node_size(tcx, leaf));
nd_off = evt_zalloc(tcx, evt_node_size(tcx, leaf));
if (UMOFF_IS_NULL(nd_off))
return -DER_NOSPACE;

Expand Down Expand Up @@ -3263,7 +3274,7 @@ evt_common_insert(struct evt_context *tcx, struct evt_node *nd,
D_DEBUG(DB_TRACE, "Allocating an extra %d bytes "
"for checksum", csum_buf_size);
}
desc_off = umem_zalloc(evt_umm(tcx), desc_size);
desc_off = evt_zalloc(tcx, desc_size);
if (UMOFF_IS_NULL(desc_off))
return -DER_NOSPACE;

Expand Down
4 changes: 2 additions & 2 deletions src/vos/vos_gc.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/**
* (C) Copyright 2019-2023 Intel Corporation.
* (C) Copyright 2019-2024 Intel Corporation.
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand Down Expand Up @@ -115,7 +115,7 @@ gc_drain_evt(struct vos_gc *gc, struct vos_pool *pool, daos_handle_t coh,
daos_handle_t toh;
int rc;

vos_evt_desc_cbs_init(&cbs, pool, coh);
vos_evt_desc_cbs_init(&cbs, pool, coh, NULL);
rc = evt_open(root, &pool->vp_uma, &cbs, &toh);
if (rc == -DER_NONEXIST) {
*empty = true;
Expand Down
43 changes: 42 additions & 1 deletion src/vos/vos_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -1258,7 +1258,7 @@ vos_bio_addr_free(struct vos_pool *pool, bio_addr_t *addr, daos_size_t nob);

void
vos_evt_desc_cbs_init(struct evt_desc_cbs *cbs, struct vos_pool *pool,
daos_handle_t coh);
daos_handle_t coh, struct vos_object *obj);

int
vos_tx_begin(struct dtx_handle *dth, struct umem_instance *umm, bool is_sysdb);
Expand Down Expand Up @@ -1329,6 +1329,12 @@ vos_pool2umm(struct vos_pool *pool)
return &pool->vp_umm;
}

static inline struct umem_store *
vos_pool2store(struct vos_pool *pool)
{
return &pool->vp_umm.umm_pool->up_store;
}

static inline struct umem_instance *
vos_cont2umm(struct vos_container *cont)
{
Expand Down Expand Up @@ -1844,4 +1850,39 @@ vos_io_scm(struct vos_pool *pool, daos_iod_type_t type, daos_size_t size, enum v
int
vos_insert_oid(struct dtx_handle *dth, struct vos_container *cont, daos_unit_oid_t *oid);

static inline bool
vos_pool_is_p2(struct vos_pool *pool)
{
struct umem_store *store = vos_pool2store(pool);

return store->store_type == DAOS_MD_BMEM_V2;
}

/**
 * Allocate \a size bytes on behalf of \a obj through the umem zalloc API.
 *
 * For a phase2 pool the allocation is made from the object's first bucket
 * (obj_bkt_ids[0]), which must already have been assigned (obj_bkt_allot set).
 * For any other pool this is a plain umem_zalloc().
 *
 * \param[in] obj	object the allocation belongs to
 * \param[in] size	number of bytes to allocate
 *
 * \return		offset of the allocation as returned by the allocator
 */
static inline umem_off_t
vos_obj_zalloc(struct vos_object *obj, size_t size)
{
	struct vos_pool		*pool = vos_obj2pool(obj);
	struct umem_instance	*umm  = vos_pool2umm(pool);

	if (!vos_pool_is_p2(pool))
		return umem_zalloc(umm, size);

	/* The bucket must have been chosen before allocating from it */
	D_ASSERT(obj->obj_bkt_allot == 1);
	return umem_zalloc_from_bucket(umm, size, obj->obj_bkt_ids[0]);
}

/**
 * Reserve \a size bytes on behalf of \a obj, recording the reservation in
 * \a rsrvd_scm.
 *
 * For a phase2 pool the reservation is made from the object's first bucket
 * (obj_bkt_ids[0]), which must already have been assigned (obj_bkt_allot set).
 * For any other pool this is a plain umem_reserve().
 *
 * \param[in] obj	object the reservation belongs to
 * \param[in] rsrvd_scm	reserved-action handle passed through to umem
 * \param[in] size	number of bytes to reserve
 *
 * \return		offset of the reservation as returned by umem
 */
static inline umem_off_t
vos_obj_reserve(struct vos_object *obj, struct umem_rsrvd_act *rsrvd_scm, daos_size_t size)
{
	struct vos_pool		*pool = vos_obj2pool(obj);
	struct umem_instance	*umm  = vos_pool2umm(pool);

	if (!vos_pool_is_p2(pool))
		return umem_reserve(umm, rsrvd_scm, size);

	/* The bucket must have been chosen before reserving from it */
	D_ASSERT(obj->obj_bkt_allot == 1);
	return umem_reserve_from_bucket(umm, rsrvd_scm, size, obj->obj_bkt_ids[0]);
}

#endif /* __VOS_INTERNAL_H__ */
13 changes: 13 additions & 0 deletions src/vos/vos_layout.h
Original file line number Diff line number Diff line change
Expand Up @@ -377,4 +377,17 @@ struct vos_obj_df {
struct btr_root vo_tree;
};

/** Number of bucket ID slots kept per object (see vos_obj_p2_df::p2_bkt_ids) */
#define VOS_OBJ_BKTS_MAX 4

/*
 * VOS object durable format for md-on-ssd phase2: the regular vos_obj_df
 * followed by the bucket IDs of the object. The size is chosen to fit the
 * 128-byte slab (see slab_map[] defined in mem.c); the D_CASSERT below checks
 * that it equals sizeof(struct vos_obj_df) rounded up to a multiple of 32.
 */
struct vos_obj_p2_df {
/* Regular object durable format; placed first in the structure */
struct vos_obj_df p2_obj_df;
/* Bucket IDs of the object */
uint32_t p2_bkt_ids[VOS_OBJ_BKTS_MAX];
/* Reserved */
uint64_t p2_reserved;
};
D_CASSERT(sizeof(struct vos_obj_p2_df) == D_ALIGNUP(sizeof(struct vos_obj_df), 32));

#endif
11 changes: 6 additions & 5 deletions src/vos/vos_obj.c
Original file line number Diff line number Diff line change
Expand Up @@ -1065,7 +1065,8 @@ key_iter_fetch_root(struct vos_obj_iter *oiter, vos_iter_type_t type,
* subtree
*/
if (krec->kr_bmap & KREC_BF_EVT) {
vos_evt_desc_cbs_init(&cbs, vos_obj2pool(obj), vos_cont2hdl(obj->obj_cont));
vos_evt_desc_cbs_init(&cbs, vos_obj2pool(obj), vos_cont2hdl(obj->obj_cont),
obj);
rc = evt_open(&krec->kr_evt, info->ii_uma, &cbs, &info->ii_tree_hdl);
if (rc) {
D_DEBUG(DB_TRACE,
Expand All @@ -1077,7 +1078,7 @@ key_iter_fetch_root(struct vos_obj_iter *oiter, vos_iter_type_t type,
info->ii_fake_akey_flag = VOS_IT_DKEY_EV;
} else {
rc = dbtree_open_inplace_ex(&krec->kr_btr, info->ii_uma,
vos_cont2hdl(obj->obj_cont), vos_obj2pool(obj),
vos_cont2hdl(obj->obj_cont), obj,
&info->ii_tree_hdl);
if (rc) {
D_DEBUG(DB_TRACE,
Expand Down Expand Up @@ -2030,7 +2031,7 @@ vos_obj_akey_iter_nested_prep(vos_iter_type_t type, struct vos_iter_info *info,
}

rc = dbtree_open_inplace_ex(info->ii_btr, info->ii_uma, vos_cont2hdl(obj->obj_cont),
vos_obj2pool(obj), &toh);
obj, &toh);
if (rc) {
D_DEBUG(DB_TRACE,
"Failed to open tree for iterator:"
Expand Down Expand Up @@ -2087,7 +2088,7 @@ vos_obj_iter_sv_nested_prep(vos_iter_type_t type, struct vos_iter_info *info,
}

rc = dbtree_open_inplace_ex(info->ii_btr, info->ii_uma, vos_cont2hdl(obj->obj_cont),
vos_obj2pool(obj), &toh);
obj, &toh);
if (rc) {
D_DEBUG(DB_TRACE,
"Failed to open tree for iterator:"
Expand Down Expand Up @@ -2147,7 +2148,7 @@ vos_obj_ev_iter_nested_prep(vos_iter_type_t type, struct vos_iter_info *info,
goto prepare;
}

vos_evt_desc_cbs_init(&cbs, vos_obj2pool(obj), vos_cont2hdl(obj->obj_cont));
vos_evt_desc_cbs_init(&cbs, vos_obj2pool(obj), vos_cont2hdl(obj->obj_cont), obj);
rc = evt_open(info->ii_evt, info->ii_uma, &cbs, &toh);
if (rc) {
D_DEBUG(DB_TRACE,
Expand Down
8 changes: 7 additions & 1 deletion src/vos/vos_obj.h
Original file line number Diff line number Diff line change
Expand Up @@ -47,12 +47,18 @@ struct vos_object {
struct vos_obj_df *obj_df;
/** backref to container */
struct vos_container *obj_cont;
/* Handle for the pinned object */
struct umem_pin_handle *obj_pin_hdl;
/** Bucket IDs for the object */
uint32_t obj_bkt_ids[VOS_OBJ_BKTS_MAX];
/** nobody should access this object */
bool obj_zombie;
/** Object is held for discard */
uint32_t obj_discard : 1,
/** If non-zero, object is held for aggregation */
obj_aggregate : 1;
obj_aggregate : 1,
/** Evict-able bucket is already allocated */
obj_bkt_allot : 1;
};

enum {
Expand Down
58 changes: 54 additions & 4 deletions src/vos/vos_obj_cache.c
Original file line number Diff line number Diff line change
Expand Up @@ -245,12 +245,56 @@ obj_get(struct daos_lru_cache *occ, struct vos_container *cont, daos_unit_oid_t
return rc;
}

/**
 * Drop the pin held by \a obj, if any.
 *
 * When the object has a pin handle, it is passed to umem_cache_unpin() and
 * then cleared; an object without a pin handle is left untouched.
 */
static inline void
vos_obj_unpin(struct vos_object *obj)
{
	struct umem_store *store = vos_pool2store(vos_obj2pool(obj));

	/* Nothing pinned, nothing to do */
	if (obj->obj_pin_hdl == NULL)
		return;

	umem_cache_unpin(store, obj->obj_pin_hdl);
	obj->obj_pin_hdl = NULL;
}

/* Support single evict-able bucket for this moment */
static inline int
vos_obj_pin(struct vos_object *obj)
{
struct vos_pool *pool = vos_obj2pool(obj);
struct umem_store *store = vos_pool2store(pool);
struct umem_cache_range rg;

if (!vos_pool_is_p2(pool))
return 0;

if (!obj->obj_bkt_allot) {
if (!obj->obj_df) {
obj->obj_bkt_ids[0] = umem_allot_mb_evictable(vos_pool2umm(pool), 0);
} else {
struct vos_obj_p2_df *p2 = (struct vos_obj_p2_df *)obj->obj_df;

obj->obj_bkt_ids[0] = p2->p2_bkt_ids[0];
}
obj->obj_bkt_allot = 1;
}

D_ASSERT(obj->obj_pin_hdl == NULL);
if (obj->obj_bkt_ids[0] == UMEM_DEFAULT_MBKT_ID)
return 0;

rg.cr_off = umem_get_mb_base_offset(vos_pool2umm(pool), obj->obj_bkt_ids[0]);
rg.cr_size = store->cache->ca_page_sz;

return umem_cache_pin(store, &rg, 1, false, &obj->obj_pin_hdl);
}

static inline void
obj_release(struct daos_lru_cache *occ, struct vos_object *obj, bool evict)
{

D_ASSERT(obj != NULL);
/* TODO: Unpin the object in md-on-ssd phase II */
vos_obj_unpin(obj);

if (obj == &obj_local) {
clean_object(obj);
Expand Down Expand Up @@ -301,6 +345,8 @@ cache_object(struct daos_lru_cache *occ, struct vos_object **objp)
obj_new->obj_sync_epoch = obj_local.obj_sync_epoch;
obj_new->obj_df = obj_local.obj_df;
obj_new->obj_zombie = obj_local.obj_zombie;
obj_new->obj_bkt_allot = obj_local.obj_bkt_allot;
obj_new->obj_pin_hdl = obj_local.obj_pin_hdl;
obj_local.obj_toh = DAOS_HDL_INVAL;
obj_local.obj_ih = DAOS_HDL_INVAL;

Expand Down Expand Up @@ -363,7 +409,9 @@ vos_obj_check_discard(struct vos_container *cont, daos_unit_oid_t oid, uint64_t
if (rc)
return rc;

/* TODO: Pin object in memory */
rc = vos_obj_pin(obj);
if (rc)
return rc;

if (check_discard(obj, flags))
/* Update request will retry with this error */
Expand Down Expand Up @@ -507,8 +555,10 @@ vos_obj_hold(struct vos_container *cont, daos_unit_oid_t oid, daos_epoch_range_t
D_ASSERT(tmprc == 0); /* Non-zero only valid for akey */
}

/* TODO: Pin the object in memory in md-on-ssd phase II. Revise the 'obj_local' implementation
* then, since this function could yield. */
/* TODO: Revise the 'obj_local' implementation later, since this function could yield. */
rc = vos_obj_pin(obj);
if (rc)
goto failed;

/* It's done for DAOS_INTENT_UPDATE or DAOS_INTENT_PUNCH or DAOS_INTENT_KILL */
if (intent == DAOS_INTENT_UPDATE || intent == DAOS_INTENT_PUNCH ||
Expand Down
Loading

0 comments on commit 08b5847

Please sign in to comment.