diff --git a/src/common/btree.c b/src/common/btree.c index 0657bf384dc..579b921d768 100644 --- a/src/common/btree.c +++ b/src/common/btree.c @@ -3888,7 +3888,6 @@ btr_tree_destroy(struct btr_context *tcx, void *args, bool *destroyed) tcx->tc_tins.ti_root_off, tcx->tc_order); root = tcx->tc_tins.ti_root; - tcx->tc_tins.ti_destroy = 1; if (root && !UMOFF_IS_NULL(root->tr_node)) { /* destroy the root and all descendants */ rc = btr_node_destroy(tcx, root->tr_node, args, &empty); diff --git a/src/include/daos/btree.h b/src/include/daos/btree.h index 51f7999e47a..24c7b95cbe4 100644 --- a/src/include/daos/btree.h +++ b/src/include/daos/btree.h @@ -429,8 +429,6 @@ struct btr_instance { struct btr_root *ti_root; /** Customized operations for the tree */ btr_ops_t *ti_ops; - /** The context is used for tree destroy */ - unsigned int ti_destroy : 1; }; /** diff --git a/src/vos/vos_container.c b/src/vos/vos_container.c index 6e6cbeeeb2a..0972b6d7e1b 100644 --- a/src/vos/vos_container.c +++ b/src/vos/vos_container.c @@ -63,7 +63,7 @@ cont_df_rec_free(struct btr_instance *tins, struct btr_record *rec, void *args) cont_df = umem_off2ptr(&tins->ti_umm, rec->rec_off); vos_ts_evict(&cont_df->cd_ts_idx, VOS_TS_TYPE_CONT, vos_pool->vp_sysdb); - return gc_add_item(vos_pool, DAOS_HDL_INVAL, GC_CONT, rec->rec_off, 0); + return gc_add_item(vos_pool, DAOS_HDL_INVAL, GC_CONT, rec->rec_off, NULL); } static int diff --git a/src/vos/vos_gc.c b/src/vos/vos_gc.c index b90cd5d1199..1be9be47d83 100644 --- a/src/vos/vos_gc.c +++ b/src/vos/vos_gc.c @@ -74,12 +74,26 @@ struct vos_gc { */ static int gc_drain_btr(struct vos_gc *gc, struct vos_pool *pool, daos_handle_t coh, - struct btr_root *root, int *credits, bool *empty) + struct vos_gc_item *item, struct btr_root *root, int *credits, bool *empty) { - daos_handle_t toh; - int rc; + struct vos_object dummy_obj = { 0 }; + struct vos_container dummy_cont = { 0 }; + daos_handle_t toh; + void *priv; + int rc, i; + + if (gc->gc_type == GC_CONT) { + priv 
= pool; + } else { + dummy_cont.vc_pool = pool; + dummy_obj.obj_cont = &dummy_cont; + dummy_obj.obj_bkt_allot = 1; + for (i = 0; i < VOS_GC_BKTS_MAX; i++) + dummy_obj.obj_bkt_ids[i] = item->it_bkt_ids[i]; + priv = &dummy_obj; + } - rc = dbtree_open_inplace_ex(root, &pool->vp_uma, coh, pool, &toh); + rc = dbtree_open_inplace_ex(root, &pool->vp_uma, coh, priv, &toh); if (rc == -DER_NONEXIST) { /* empty tree */ *empty = true; return 0; @@ -126,7 +140,7 @@ gc_drain_evt(struct vos_gc *gc, struct vos_pool *pool, daos_handle_t coh, D_DEBUG(DB_TRACE, "drain %s evtree, creds=%d\n", gc->gc_name, *credits); rc = evt_drain(toh, credits, empty); - D_ASSERT(evt_close(toh) == 0); + evt_close(toh); if (rc) goto failed; @@ -160,7 +174,7 @@ gc_drain_key(struct vos_gc *gc, struct vos_pool *pool, daos_handle_t coh, } if (key->kr_bmap & KREC_BF_BTR) { - rc = gc_drain_btr(gc, pool, coh, &key->kr_btr, credits, empty); + rc = gc_drain_btr(gc, pool, coh, item, &key->kr_btr, credits, empty); } else if (key->kr_bmap & KREC_BF_EVT) { D_ASSERT(gc->gc_type == GC_AKEY); @@ -195,7 +209,7 @@ gc_free_dkey(struct vos_gc *gc, struct vos_pool *pool, daos_handle_t coh, struct D_ASSERT(krec->kr_bmap & KREC_BF_DKEY); if (krec->kr_bmap & KREC_BF_NO_AKEY) - gc_add_item(pool, coh, GC_AKEY, item->it_addr, item->it_args); + gc_add_item(pool, coh, GC_AKEY, item->it_addr, &item->it_bkt_ids[0]); else umem_free(&pool->vp_umm, item->it_addr); return 0; @@ -211,7 +225,7 @@ gc_drain_obj(struct vos_gc *gc, struct vos_pool *pool, daos_handle_t coh, { struct vos_obj_df *obj = umem_off2ptr(&pool->vp_umm, item->it_addr); - return gc_drain_btr(gc, pool, coh, &obj->vo_tree, credits, empty); + return gc_drain_btr(gc, pool, coh, item, &obj->vo_tree, credits, empty); } static int @@ -298,8 +312,7 @@ gc_drain_cont(struct vos_gc *gc, struct vos_pool *pool, daos_handle_t coh, } D_ASSERT(daos_handle_is_inval(coh)); - return gc_drain_btr(gc, pool, coh, &cont->cd_obj_root, - credits, empty); + return gc_drain_btr(gc, pool, coh, 
item, &cont->cd_obj_root, credits, empty); } static int @@ -627,12 +640,12 @@ gc_free_item(struct vos_gc *gc, struct vos_pool *pool, */ int gc_add_item(struct vos_pool *pool, daos_handle_t coh, - enum vos_gc_type type, umem_off_t item_off, uint64_t args) + enum vos_gc_type type, umem_off_t item_off, uint32_t *bkt_ids) { struct vos_container *cont = vos_hdl2cont(coh); struct vos_gc_bin_df *bin = gc_type2bin(pool, cont, type); struct vos_gc_item item; - int rc; + int rc, i; D_DEBUG(DB_TRACE, "Add %s addr="DF_X64"\n", gc_type2name(type), item_off); @@ -641,7 +654,9 @@ gc_add_item(struct vos_pool *pool, daos_handle_t coh, return 0; /* OK to ignore because the pool is being deleted */ item.it_addr = item_off; - item.it_args = args; + for (i = 0; i < VOS_GC_BKTS_MAX; i++) + item.it_bkt_ids[i] = bkt_ids ? bkt_ids[i] : UMEM_DEFAULT_MBKT_ID; + rc = gc_bin_add_item(&pool->vp_umm, bin, &item); if (rc) { D_ERROR("Failed to add item, pool=" DF_UUID ", rc=" DF_RC "\n", @@ -712,28 +727,41 @@ gc_reclaim_pool(struct vos_pool *pool, int *credits, bool *empty_ret) struct vos_container *cont = gc_get_container(pool); struct vos_gc *gc = &gc_table[0]; /* start from akey */ int creds = *credits; + uint32_t bkt = UMEM_DEFAULT_MBKT_ID, pinned_bkt = UMEM_DEFAULT_MBKT_ID; + struct umem_pin_handle *pin_hdl = NULL; + struct umem_cache_range rg; int rc; if (pool->vp_dying) { *empty_ret = true; D_GOTO(done, rc = 0); } + *empty_ret = false; /* take an extra ref to avoid concurrent container destroy/free */ if (cont != NULL) vos_cont_addref(cont); +pin_obj: + if (bkt != UMEM_DEFAULT_MBKT_ID) { + rg.cr_off = umem_get_mb_base_offset(vos_pool2umm(pool), bkt); + rg.cr_size = vos_pool2store(pool)->cache->ca_page_sz; + + rc = umem_cache_pin(vos_pool2store(pool), &rg, 1, false, &pin_hdl); + if (rc) { + DL_ERROR(rc, "Failed to pin bucket %u.", bkt); + goto tx_error; + } + pinned_bkt = bkt; + } + rc = umem_tx_begin(&pool->vp_umm, NULL); if (rc) { D_ERROR("Failed to start transacton for " DF_UUID ": " 
DF_RC "\n", DP_UUID(pool->vp_id), DP_RC(rc)); - if (cont != NULL) - vos_cont_decref(cont); - *empty_ret = false; - goto done; + goto tx_error; } - *empty_ret = false; while (creds > 0) { struct vos_gc_item *item; bool empty = false; @@ -769,6 +797,25 @@ gc_reclaim_pool(struct vos_pool *pool, int *credits, bool *empty_ret) if (DAOS_FAIL_CHECK(DAOS_VOS_GC_CONT)) D_ASSERT(cont != NULL); + bkt = item->it_bkt_ids[0]; + if (bkt != UMEM_DEFAULT_MBKT_ID && bkt != pinned_bkt) { + D_ASSERT(gc->gc_type != GC_CONT); + D_ASSERT(vos_pool_is_p2(pool)); + + rc = umem_tx_end(&pool->vp_umm, rc); + if (rc != 0) { + DL_ERROR(rc, "Transaction commit failed."); + goto tx_error; + } + + if (pin_hdl != NULL) { + umem_cache_unpin(vos_pool2store(pool), pin_hdl); + pin_hdl = NULL; + } + + goto pin_obj; + } + rc = gc_drain_item(gc, pool, vos_cont2hdl(cont), item, &creds, &empty); if (rc < 0) { @@ -815,6 +862,11 @@ gc_reclaim_pool(struct vos_pool *pool, int *credits, bool *empty_ret) rc = umem_tx_end(&pool->vp_umm, rc); if (rc == 0) *credits = creds; +tx_error: + if (pin_hdl != NULL) { + umem_cache_unpin(vos_pool2store(pool), pin_hdl); + pin_hdl = NULL; + } if (cont != NULL && d_list_empty(&cont->vc_gc_link)) { /** The container may not be empty so add it back to end of diff --git a/src/vos/vos_internal.h b/src/vos/vos_internal.h index 72275c1d41c..e382e0d2f0a 100644 --- a/src/vos/vos_internal.h +++ b/src/vos/vos_internal.h @@ -1371,7 +1371,7 @@ void gc_check_cont(struct vos_container *cont); int gc_add_item(struct vos_pool *pool, daos_handle_t coh, - enum vos_gc_type type, umem_off_t item_off, uint64_t args); + enum vos_gc_type type, umem_off_t item_off, uint32_t *bkt_ids); int vos_gc_pool_tight(daos_handle_t poh, int *credits); void diff --git a/src/vos/vos_layout.h b/src/vos/vos_layout.h index 0400a351175..1bfc84b69c9 100644 --- a/src/vos/vos_layout.h +++ b/src/vos/vos_layout.h @@ -43,6 +43,16 @@ struct vos_gc_bin_df { uint16_t bin_pad16; }; +/* + * This is smaller than the VOS_OBJ_BKTS_MAX 
for object durable format, because + * I don't want to increase each GC item size (the number of GC items is massive) + * for an imagined requirement. + * + * If we really need to support more than 2 evict-able buckets per object in the + * future, we can enlarge the GC item then. + */ +#define VOS_GC_BKTS_MAX 2 + struct vos_gc_bag_df { /** index of the first item in FIFO */ uint16_t bag_item_first; @@ -57,19 +67,12 @@ struct vos_gc_bag_df { struct vos_gc_item { /* address of the item to be freed */ umem_off_t it_addr; - /** Reserved, argument for GC_VEA/BIO (e.g. size of extent) */ - uint64_t it_args; + /* object buckets for GC_AKEY/DKEY/OBJ of the md-on-ssd p2 pool */ + uint32_t it_bkt_ids[VOS_GC_BKTS_MAX]; } bag_items[0]; }; enum vos_gc_type { - /* XXX: we could define GC_VEA, which can free NVMe/SCM space. - * So svt_rec_free() and evt_desc_bio_free() only need to call - * gc_add_item() to register BIO address for GC. - * - * However, GC_VEA could have extra overhead of reassigning SCM - * pointers, but it also has low latency for undo changes. - */ GC_AKEY, GC_DKEY, GC_OBJ, @@ -378,6 +381,7 @@ struct vos_obj_df { }; #define VOS_OBJ_BKTS_MAX 4 +D_CASSERT(VOS_GC_BKTS_MAX <= VOS_OBJ_BKTS_MAX); /* * VOS object durable format for md-on-ssd phase2. 
The size is fit to the 128 bytes diff --git a/src/vos/vos_obj_cache.c b/src/vos/vos_obj_cache.c index 502aafa6306..71717884300 100644 --- a/src/vos/vos_obj_cache.c +++ b/src/vos/vos_obj_cache.c @@ -470,6 +470,25 @@ vos_obj_incarnate(struct vos_object *obj, daos_epoch_range_t *epr, daos_epoch_t return -DER_TX_RESTART; } + if (obj->obj_bkt_ids[0] != UMEM_DEFAULT_MBKT_ID) { + struct vos_obj_p2_df *p2 = (struct vos_obj_p2_df *)obj->obj_df; + + D_ASSERT(vos_pool_is_p2(vos_obj2pool(obj))); + D_ASSERT(obj->obj_bkt_allot); + + if (p2->p2_bkt_ids[0] == UMEM_DEFAULT_MBKT_ID) { + rc = umem_tx_add_ptr(vos_cont2umm(cont), &p2->p2_bkt_ids[0], + sizeof(p2->p2_bkt_ids[0])); + if (rc) { + DL_ERROR(rc, "Add bucket ID failed."); + return rc; + } + p2->p2_bkt_ids[0] = obj->obj_bkt_ids[0]; /* set after undo snapshot */ + } else { + D_ASSERT(p2->p2_bkt_ids[0] == obj->obj_bkt_ids[0]); + } + } + /* It's done for DAOS_INTENT_PUNCH case */ if (intent == DAOS_INTENT_PUNCH) return 0; diff --git a/src/vos/vos_obj_index.c b/src/vos/vos_obj_index.c index e6111cd0746..e385d9d8460 100644 --- a/src/vos/vos_obj_index.c +++ b/src/vos/vos_obj_index.c @@ -145,6 +145,7 @@ oi_rec_free(struct btr_instance *tins, struct btr_record *rec, void *args) daos_handle_t coh = { 0 }; int rc; struct vos_pool *pool; + uint32_t *bkt_ids = NULL; obj = umem_off2ptr(umm, rec->rec_off); @@ -173,7 +174,14 @@ oi_rec_free(struct btr_instance *tins, struct btr_record *rec, void *args) if (del_arg != NULL) coh = vos_cont2hdl((struct vos_container *)del_arg->cont); - return gc_add_item(tins->ti_priv, coh, GC_OBJ, rec->rec_off, 0); + + if (vos_pool_is_p2(pool)) { + struct vos_obj_p2_df *p2 = (struct vos_obj_p2_df *)obj; + + bkt_ids = &p2->p2_bkt_ids[0]; + } + + return gc_add_item(tins->ti_priv, coh, GC_OBJ, rec->rec_off, bkt_ids); } static int diff --git a/src/vos/vos_tree.c b/src/vos/vos_tree.c index b2245e626d9..3659cd026d7 100644 --- a/src/vos/vos_tree.c +++ b/src/vos/vos_tree.c @@ -156,7 +156,6 @@ ktr_hkey_gen(struct btr_instance *tins, d_iov_t *key_iov, 
void *hkey) struct umem_pool *umm_pool = tins->ti_umm.umm_pool; struct vos_pool *pool; - D_ASSERT(tins->ti_destroy == 0); pool = vos_obj2pool(tins->ti_priv); D_ASSERT(key_iov->iov_len < pool->vp_pool_df->pd_scm_sz); hkey_common_gen(key_iov, hkey); @@ -257,7 +256,6 @@ ktr_rec_alloc(struct btr_instance *tins, d_iov_t *key_iov, rbund = iov2rec_bundle(val_iov); - D_ASSERT(tins->ti_destroy == 0); rec->rec_off = vos_obj_alloc(&tins->ti_umm, tins->ti_priv, vos_krec_size(rbund), true); if (UMOFF_IS_NULL(rec->rec_off)) return -DER_NOSPACE; @@ -289,6 +287,8 @@ ktr_rec_free(struct btr_instance *tins, struct btr_record *rec, void *args) int gc; int rc; struct vos_pool *pool; + struct vos_object *obj; + uint32_t *bkt_ids = NULL; if (UMOFF_IS_NULL(rec->rec_off)) return 0; @@ -302,17 +302,21 @@ ktr_rec_free(struct btr_instance *tins, struct btr_record *rec, void *args) return rc; D_ASSERT(tins->ti_priv); - if (tins->ti_destroy) - pool = (struct vos_pool *)tins->ti_priv; - else - pool = vos_obj2pool(tins->ti_priv); + obj = tins->ti_priv; + pool = vos_obj2pool(obj); vos_ilog_ts_evict(&krec->kr_ilog, (krec->kr_bmap & KREC_BF_DKEY) ? VOS_TS_TYPE_DKEY : VOS_TS_TYPE_AKEY, pool->vp_sysdb); gc = (krec->kr_bmap & KREC_BF_DKEY) ? 
GC_DKEY : GC_AKEY; coh = vos_cont2hdl(args); - return gc_add_item(pool, coh, gc, rec->rec_off, 0); + + if (vos_pool_is_p2(pool)) { + D_ASSERT(obj->obj_bkt_allot == 1); + bkt_ids = &obj->obj_bkt_ids[0]; + } + + return gc_add_item(pool, coh, gc, rec->rec_off, bkt_ids); } static int @@ -358,7 +362,6 @@ ktr_rec_update(struct btr_instance *tins, struct btr_record *rec, static umem_off_t ktr_node_alloc(struct btr_instance *tins, int size) { - D_ASSERT(tins->ti_destroy == 0); return vos_obj_alloc(&tins->ti_umm, tins->ti_priv, size, true); } @@ -631,10 +634,7 @@ svt_rec_free_internal(struct btr_instance *tins, struct btr_record *rec, struct vos_pool *pool; D_ASSERT(tins->ti_priv != NULL); - if (tins->ti_destroy) - pool = (struct vos_pool *)tins->ti_priv; - else - pool = vos_obj2pool(tins->ti_priv); + pool = vos_obj2pool(tins->ti_priv); rc = vos_bio_addr_free(pool, addr, irec->ir_size); if (rc) return rc;