From cb10687ba48facbfe1b6b52896892ac3576f9ea3 Mon Sep 17 00:00:00 2001 From: Sherin T George Date: Thu, 19 Dec 2024 17:28:39 +0530 Subject: [PATCH] DAOS-16896 common: Spill Over Evictable Buckets Implementation The DAV_v2 allocator now includes support for Spill Over Evictable Buckets (SOEMB). All global allocations will continue to utilize the standard non-evictable memory buckets, while spillover allocations from evictable memory buckets will be directed to SOEMB. In the current implementation, SOEMB remains locked in the memory cache, similar to the behavior of non-evictable memory buckets. Signed-off-by: Sherin T George --- src/common/dav_v2/heap.c | 718 +++++++++++++++++++++++------- src/common/dav_v2/heap.h | 16 +- src/common/dav_v2/heap_layout.h | 3 +- src/common/dav_v2/palloc.c | 4 +- src/common/dav_v2/tx.c | 1 + src/common/tests/umem_test_bmem.c | 93 ++-- src/vos/tests/vts_wal.c | 354 ++++++++++++++- 7 files changed, 965 insertions(+), 224 deletions(-) diff --git a/src/common/dav_v2/heap.c b/src/common/dav_v2/heap.c index 61b7ed2e2b1..5ca8abe625d 100644 --- a/src/common/dav_v2/heap.c +++ b/src/common/dav_v2/heap.c @@ -63,6 +63,8 @@ struct mbrt { struct recycler *recyclers[MAX_ALLOCATION_CLASSES]; bool laf[MAX_ALLOCATION_CLASSES]; /* last allocation failed? */ bool laf_updated; + bool is_global_mbrt; + bool is_evictable; }; enum mb_usage_hint { @@ -78,7 +80,24 @@ enum mb_usage_hint { #define MB_U30 (ZONE_MAX_SIZE * 3 / 10) #define MB_USAGE_DELTA (ZONE_MAX_SIZE / 20) -size_t mb_usage_byhint[MB_UMAX_HINT] = {0, MB_U30 + 1, MB_U75 + 1, MB_U90 + 1}; +size_t mb_usage_byhint[MB_UMAX_HINT] = {1, MB_U30 + 1, MB_U75 + 1, MB_U90 + 1}; + +struct mbrt_qbs { + struct mbrt_q mb_u90; + struct mbrt_q mb_u75; + struct mbrt_q mb_u30; + struct mbrt_q mb_u0; + struct mbrt_q mb_ue; +}; + +#define SOEMB_ACTIVE_CNT 3 + +struct soemb_rt { + struct mbrt *svec[SOEMB_ACTIVE_CNT]; + int cur_idx; + int fur_idx; + struct mbrt_qbs qbs; +}; struct heap_rt { struct alloc_class_collection *alloc_classes; @@ -101,16 +120,13 @@ struct heap_rt { void *mb_create_wq; struct zinfo_vec *zinfo_vec; struct mbrt *default_mb; - struct mbrt **evictable_mbs; + struct mbrt **mbs; struct mbrt *active_evictable_mb; - struct mbrt_q mb_u90; - struct mbrt_q mb_u75; - struct mbrt_q mb_u30; - struct mbrt_q mb_u0; + struct mbrt_qbs emb_qbs; + struct soemb_rt smbrt; + unsigned int soemb_cnt; }; -#define MBRT_NON_EVICTABLE ((struct mbrt *)(-1UL)) - static void heap_reclaim_zone_garbage(struct palloc_heap *heap, struct bucket *bucket, uint32_t zone_id); @@ -213,39 +229,230 @@ mbrt_is_laf(struct mbrt *mb, int c_id) return mb->laf[c_id]; } -void -heap_mbrt_setmb_nonevictable(struct palloc_heap *heap, uint32_t zid) +static void +mbrt_qbs_init(struct mbrt_qbs *qb) +{ + TAILQ_INIT(&qb->mb_u90); + TAILQ_INIT(&qb->mb_u75); + TAILQ_INIT(&qb->mb_u30); + TAILQ_INIT(&qb->mb_u0); + TAILQ_INIT(&qb->mb_ue); +} + +static void +mbrt_qbs_fini(struct mbrt_qbs *qb) +{ + /* No op */ +} + +static void +mbrt_qbs_insertmb(struct mbrt_qbs *qb, struct mbrt *mb) +{ + D_ASSERT(mb->qptr == NULL); + + if (mb->space_usage > MB_U90) { + TAILQ_INSERT_TAIL(&qb->mb_u90, mb, mb_link); + mb->qptr = &qb->mb_u90; + } else if (mb->space_usage > MB_U75) { + TAILQ_INSERT_TAIL(&qb->mb_u75, mb, mb_link); + mb->qptr = &qb->mb_u75; + } else if (mb->space_usage > MB_U30) { + TAILQ_INSERT_TAIL(&qb->mb_u30, mb, mb_link); + mb->qptr = &qb->mb_u30; + } else if (mb->space_usage) { + TAILQ_INSERT_TAIL(&qb->mb_u0, mb, mb_link); + mb->qptr = &qb->mb_u0; + } else { + 
TAILQ_INSERT_TAIL(&qb->mb_ue, mb, mb_link); + mb->qptr = &qb->mb_ue; + } + + mb->prev_usage = mb->space_usage; +} + +static void +mbrt_qbs_insertmb_force(struct mbrt_qbs *qb, struct mbrt *mb, int hint) +{ + D_ASSERT(mb->qptr == NULL); + + switch (hint) { + case MB_U90_HINT: + TAILQ_INSERT_TAIL(&qb->mb_u90, mb, mb_link); + mb->qptr = &qb->mb_u90; + break; + case MB_U75_HINT: + TAILQ_INSERT_TAIL(&qb->mb_u75, mb, mb_link); + mb->qptr = &qb->mb_u75; + break; + case MB_U30_HINT: + TAILQ_INSERT_TAIL(&qb->mb_u30, mb, mb_link); + mb->qptr = &qb->mb_u30; + break; + case MB_U0_HINT: + TAILQ_INSERT_TAIL(&qb->mb_u0, mb, mb_link); + mb->qptr = &qb->mb_u0; + break; + default: + D_ASSERTF(0, "invalid usage hint %d", hint); + break; + } +} + +static int +mbrt_qbs_update_mb(struct mbrt_qbs *qb, struct mbrt *mb) +{ + int hint = MB_UMAX_HINT; + + if (mb->qptr == NULL) + return MB_UMAX_HINT; + + if (mb->space_usage == 0) { + TAILQ_REMOVE(mb->qptr, mb, mb_link); + TAILQ_INSERT_TAIL(&qb->mb_ue, mb, mb_link); + mb->qptr = &qb->mb_ue; + mb->prev_usage = mb->space_usage; + return MB_U0_HINT; + } else if (mb->qptr == &qb->mb_ue) { + TAILQ_REMOVE(mb->qptr, mb, mb_link); + TAILQ_INSERT_TAIL(&qb->mb_u0, mb, mb_link); + mb->qptr = &qb->mb_u0; + } + + if (labs((int64_t)(mb->space_usage - mb->prev_usage)) < MB_USAGE_DELTA) + return MB_UMAX_HINT; + + if (mb->space_usage > MB_U90) { + if (mb->qptr != &qb->mb_u90) { + TAILQ_REMOVE(mb->qptr, mb, mb_link); + TAILQ_INSERT_TAIL(&qb->mb_u90, mb, mb_link); + mb->qptr = &qb->mb_u90; + hint = MB_U90_HINT; + } + } else if (mb->space_usage > MB_U75) { + if (mb->qptr != &qb->mb_u75) { + TAILQ_REMOVE(mb->qptr, mb, mb_link); + TAILQ_INSERT_TAIL(&qb->mb_u75, mb, mb_link); + mb->qptr = &qb->mb_u75; + hint = MB_U75_HINT; + } + } else if (mb->space_usage > MB_U30) { + if (mb->qptr != &qb->mb_u30) { + TAILQ_REMOVE(mb->qptr, mb, mb_link); + TAILQ_INSERT_TAIL(&qb->mb_u30, mb, mb_link); + mb->qptr = &qb->mb_u30; + hint = MB_U30_HINT; + } + } else if (mb->qptr != &qb->mb_u0) { + TAILQ_REMOVE(mb->qptr, mb, mb_link); + TAILQ_INSERT_TAIL(&qb->mb_u0, mb, mb_link); + mb->qptr = &qb->mb_u0; + hint = MB_U0_HINT; + } + mb->prev_usage = mb->space_usage; + return hint; +} + +static struct mbrt * +mbrt_qbs_getmb(struct mbrt_qbs *qb, int force) +{ + struct mbrt *mb = NULL; + + if ((mb = TAILQ_FIRST(&qb->mb_u30)) != NULL) + TAILQ_REMOVE(&qb->mb_u30, mb, mb_link); + else if ((mb = TAILQ_FIRST(&qb->mb_u0)) != NULL) + TAILQ_REMOVE(&qb->mb_u0, mb, mb_link); + else if ((mb = TAILQ_FIRST(&qb->mb_ue)) != NULL) + TAILQ_REMOVE(&qb->mb_ue, mb, mb_link); + + if (mb) { + mb->qptr = NULL; + return mb; + } + + if (!force) + return NULL; + + if ((mb = TAILQ_FIRST(&qb->mb_u75)) != NULL) + TAILQ_REMOVE(&qb->mb_u75, mb, mb_link); + else if ((mb = TAILQ_FIRST(&qb->mb_u90)) != NULL) + TAILQ_REMOVE(&qb->mb_u90, mb, mb_link); + + if (mb) + mb->qptr = NULL; + return mb; +} + +static struct mbrt * +mbrt_qbs_getmb_ue(struct mbrt_qbs *qb) +{ + struct mbrt *mb = NULL; + if ((mb = TAILQ_FIRST(&qb->mb_ue)) != NULL) { + TAILQ_REMOVE(&qb->mb_ue, mb, mb_link); + mb->qptr = NULL; + } + return mb; +} + +static void +soemb_init(struct soemb_rt *smbrt) +{ + memset(smbrt->svec, 0, sizeof(struct mbrt *) * SOEMB_ACTIVE_CNT); + mbrt_qbs_init(&smbrt->qbs); + smbrt->cur_idx = 0; + smbrt->fur_idx = 0; +} + +static void +soemb_fini(struct soemb_rt *smbrt) +{ + mbrt_qbs_fini(&smbrt->qbs); +} + +static void +heap_mbrt_setmb_nonevictable(struct palloc_heap *heap, struct mbrt *mb, uint32_t zid) { D_ASSERT(zid < heap->rt->nzones); - 
heap->rt->evictable_mbs[zid] = MBRT_NON_EVICTABLE; + D_ASSERT(heap->rt->default_mb != NULL); + + heap->rt->mbs[zid] = mb ? mb : heap->rt->default_mb; + if (mb) + mb->is_evictable = false; } -void +static void heap_mbrt_setmb_evictable(struct palloc_heap *heap, struct mbrt *mb) { D_ASSERT((mb->mb_id != 0) && (mb->mb_id < heap->rt->nzones)); - heap->rt->evictable_mbs[mb->mb_id] = mb; + heap->rt->mbs[mb->mb_id] = mb; + mb->is_evictable = true; } -void +static void heap_mbrt_setmb_unused(struct palloc_heap *heap, uint32_t zid) { - D_ASSERT((zid < heap->rt->nzones) && (heap->rt->evictable_mbs[zid] == MBRT_NON_EVICTABLE)); - heap->rt->evictable_mbs[zid] = NULL; + D_ASSERT((zid < heap->rt->nzones) && (heap->rt->mbs[zid]->is_evictable == false)); + heap->rt->mbs[zid] = NULL; } bool heap_mbrt_ismb_evictable(struct palloc_heap *heap, uint32_t zid) { D_ASSERT(zid < heap->rt->nzones); - return (heap->rt->evictable_mbs[zid] != MBRT_NON_EVICTABLE); + return (!heap->rt->mbs[zid] || heap->rt->mbs[zid]->is_evictable); } bool heap_mbrt_ismb_initialized(struct palloc_heap *heap, uint32_t zid) { D_ASSERT(zid < heap->rt->nzones); - return (heap->rt->evictable_mbs[zid] != 0); + return (heap->rt->mbs[zid] != 0); +} + +bool +heap_mbrt_ismb_localrt(struct palloc_heap *heap, uint32_t zid) +{ + D_ASSERT(zid < heap->rt->nzones); + return (heap->rt->mbs[zid] != heap->rt->default_mb); } /* @@ -279,7 +486,7 @@ mbrt_bucket_release(struct bucket *b) /* * heap_mbrt_setup_mb -- (internal) create and initializes a Memory Bucket runtime. */ -struct mbrt * +static struct mbrt * heap_mbrt_setup_mb(struct palloc_heap *heap, uint32_t zid) { struct heap_rt *rt = heap->rt; @@ -383,6 +590,7 @@ heap_mbrt_init(struct palloc_heap *heap) rt->mb_create_wq = NULL; rt->mb_pressure = 0; rt->empty_nemb_cnt = 0; + rt->soemb_cnt = 0; rt->empty_nemb_gcth = HEAP_NEMB_EMPTY_THRESHOLD; d_getenv_uint("DAOS_NEMB_EMPTY_RECYCLE_THRESHOLD", &rt->empty_nemb_gcth); @@ -395,27 +603,24 @@ heap_mbrt_init(struct palloc_heap *heap) goto error; } - D_ALLOC_ARRAY(rt->evictable_mbs, rt->nzones); - if (rt->evictable_mbs == NULL) { + D_ALLOC_ARRAY(rt->mbs, rt->nzones); + if (rt->mbs == NULL) { ret = ENOMEM; goto error; } - TAILQ_INIT(&rt->mb_u90); - TAILQ_INIT(&rt->mb_u75); - TAILQ_INIT(&rt->mb_u30); - TAILQ_INIT(&rt->mb_u0); + mbrt_qbs_init(&rt->emb_qbs); rt->default_mb = heap_mbrt_setup_mb(heap, 0); if (rt->default_mb == NULL) { ret = ENOMEM; goto error_default_mb_setup; } - heap_mbrt_setmb_nonevictable(heap, 0); + heap_mbrt_setmb_nonevictable(heap, NULL, 0); return 0; error_default_mb_setup: - D_FREE(rt->evictable_mbs); + D_FREE(rt->mbs); error: return ret; } @@ -428,15 +633,16 @@ heap_mbrt_fini(struct palloc_heap *heap) struct umem_store *store = heap->layout_info.store; for (i = 0; i < rt->zones_exhausted; i++) { - if (heap_mbrt_ismb_evictable(heap, i)) - heap_mbrt_cleanup_mb(rt->evictable_mbs[i]); + if (heap_mbrt_ismb_localrt(heap, i)) + heap_mbrt_cleanup_mb(rt->mbs[i]); } heap_mbrt_cleanup_mb(rt->default_mb); - D_FREE(rt->evictable_mbs); + mbrt_qbs_fini(&rt->emb_qbs); + D_FREE(rt->mbs); rt->default_mb = NULL; rt->active_evictable_mb = NULL; - rt->evictable_mbs = NULL; + rt->mbs = NULL; D_ASSERT(rt->mb_create_waiters == 0); if (rt->mb_create_wq != NULL) store->stor_ops->so_waitqueue_destroy(rt->mb_create_wq); @@ -450,11 +656,8 @@ heap_mbrt_fini(struct palloc_heap *heap) struct mbrt * heap_mbrt_get_mb(struct palloc_heap *heap, uint32_t zone_id) { - if (!heap_mbrt_ismb_evictable(heap, zone_id)) - return heap->rt->default_mb; - - 
D_ASSERTF(heap->rt->evictable_mbs[zone_id] != NULL, "zone_id %d is marked unused", zone_id); - return heap->rt->evictable_mbs[zone_id]; + D_ASSERTF(heap->rt->mbs[zone_id] != NULL, "zone_id %d is marked unused", zone_id); + return heap->rt->mbs[zone_id]; } void @@ -463,10 +666,8 @@ heap_mbrt_log_alloc_failure(struct palloc_heap *heap, uint32_t zone_id) struct mbrt *mb = heap->rt->active_evictable_mb; if (mb && (mb->mb_id == zone_id)) { - TAILQ_INSERT_TAIL(&heap->rt->mb_u90, mb, mb_link); - mb->qptr = &heap->rt->mb_u90; - mb->prev_usage = mb->space_usage; heap->rt->active_evictable_mb = NULL; + mbrt_qbs_insertmb_force(&heap->rt->emb_qbs, mb, MB_U90_HINT); heap_zinfo_set_usage(heap, zone_id, MB_U90_HINT); } } @@ -474,37 +675,28 @@ heap_mbrt_log_alloc_failure(struct palloc_heap *heap, uint32_t zone_id) void heap_mbrt_setmb_usage(struct palloc_heap *heap, uint32_t zone_id, uint64_t usage) { - struct mbrt *mb = heap->rt->evictable_mbs[zone_id]; + struct mbrt *mb = heap->rt->mbs[zone_id]; D_ASSERT(zone_id < heap->rt->nzones); if (zone_id == 0) { heap->rt->default_mb->space_usage = usage; return; } - if (mb == (struct mbrt *)(-1UL)) + + if (!heap_mbrt_ismb_evictable(heap, zone_id)) { + mbrt_qbs_insertmb(&heap->rt->smbrt.qbs, mb); return; + } mb->space_usage = usage; - if ((heap->rt->active_evictable_mb == mb) || (mb->qptr)) + if (heap->rt->active_evictable_mb == mb) return; - if (mb->space_usage > MB_U90) { - TAILQ_INSERT_TAIL(&heap->rt->mb_u90, mb, mb_link); - mb->qptr = &heap->rt->mb_u90; - } else if (mb->space_usage > MB_U75) { - TAILQ_INSERT_TAIL(&heap->rt->mb_u75, mb, mb_link); - mb->qptr = &heap->rt->mb_u75; - } else if (mb->space_usage > MB_U30) { - TAILQ_INSERT_TAIL(&heap->rt->mb_u30, mb, mb_link); - mb->qptr = &heap->rt->mb_u30; - heap->rt->mb_pressure = 0; - } else { - TAILQ_INSERT_TAIL(&heap->rt->mb_u0, mb, mb_link); - mb->qptr = &heap->rt->mb_u0; - heap->rt->mb_pressure = 0; - } - mb->prev_usage = mb->space_usage; + if (mb->qptr) + mbrt_qbs_update_mb(&heap->rt->emb_qbs, mb); + else + mbrt_qbs_insertmb(&heap->rt->emb_qbs, mb); } int @@ -521,8 +713,8 @@ heap_mbrt_getmb_usage(struct palloc_heap *heap, uint32_t zone_id, uint64_t *allo errno = EINVAL; return -1; } - mb = heap->rt->evictable_mbs[zone_id]; - if (!mb || (mb == (struct mbrt *)(-1UL))) { + mb = heap->rt->mbs[zone_id]; + if (!mb || !heap_mbrt_ismb_evictable(heap, zone_id)) { errno = EINVAL; return -1; } @@ -535,51 +727,31 @@ heap_mbrt_getmb_usage(struct palloc_heap *heap, uint32_t zone_id, uint64_t *allo void heap_mbrt_incrmb_usage(struct palloc_heap *heap, uint32_t zone_id, int size) { - struct mbrt *mb = heap->rt->evictable_mbs[zone_id]; + struct mbrt *mb = heap->rt->mbs[zone_id]; + int hint; - if (mb == (struct mbrt *)(-1UL)) { + if (!heap_mbrt_ismb_evictable(heap, zone_id)) heap->rt->default_mb->space_usage += size; + + if (!heap_mbrt_ismb_localrt(heap, zone_id)) return; - } mb->space_usage += size; - if ((heap->rt->active_evictable_mb == mb) || - (labs((int64_t)(mb->space_usage - mb->prev_usage)) < MB_USAGE_DELTA)) + + if (heap->rt->active_evictable_mb == mb) return; - if (mb->space_usage > MB_U90) { - if (mb->qptr != &heap->rt->mb_u90) { - TAILQ_REMOVE(mb->qptr, mb, mb_link); - TAILQ_INSERT_TAIL(&heap->rt->mb_u90, mb, mb_link); - mb->qptr = &heap->rt->mb_u90; - heap_zinfo_set_usage(heap, zone_id, MB_U90_HINT); - } - } else if (mb->space_usage > MB_U75) { - if (mb->qptr != &heap->rt->mb_u75) { - TAILQ_REMOVE(mb->qptr, mb, mb_link); - TAILQ_INSERT_TAIL(&heap->rt->mb_u75, mb, mb_link); - mb->qptr = &heap->rt->mb_u75; - 
heap_zinfo_set_usage(heap, zone_id, MB_U75_HINT); - } - } else if (mb->space_usage > MB_U30) { - if (mb->qptr != &heap->rt->mb_u30) { - TAILQ_REMOVE(mb->qptr, mb, mb_link); - TAILQ_INSERT_TAIL(&heap->rt->mb_u30, mb, mb_link); - mb->qptr = &heap->rt->mb_u30; - heap_zinfo_set_usage(heap, zone_id, MB_U30_HINT); + if (heap_mbrt_ismb_evictable(heap, zone_id)) { + hint = mbrt_qbs_update_mb(&heap->rt->emb_qbs, mb); + if (hint != MB_UMAX_HINT) + heap_zinfo_set_usage(heap, zone_id, hint); + if (hint <= MB_U30_HINT) heap->rt->mb_pressure = 0; - } - } else if (mb->qptr != &heap->rt->mb_u0) { - TAILQ_REMOVE(mb->qptr, mb, mb_link); - TAILQ_INSERT_TAIL(&heap->rt->mb_u0, mb, mb_link); - mb->qptr = &heap->rt->mb_u0; - heap_zinfo_set_usage(heap, zone_id, MB_U0_HINT); - heap->rt->mb_pressure = 0; - } - mb->prev_usage = mb->space_usage; + } else + hint = mbrt_qbs_update_mb(&heap->rt->smbrt.qbs, mb); } -int +static int heap_mbrt_mb_reclaim_garbage(struct palloc_heap *heap, uint32_t zid) { struct mbrt *mb; @@ -600,6 +772,85 @@ heap_mbrt_mb_reclaim_garbage(struct palloc_heap *heap, uint32_t zid) return 0; } +void +heap_soemb_active_iter_init(struct palloc_heap *heap) +{ + heap->rt->smbrt.cur_idx = 0; +} + +uint32_t +heap_soemb_active_get(struct palloc_heap *heap) +{ + struct soemb_rt *smbrt = &heap->rt->smbrt; + struct mbrt *mb = NULL; + + if (heap->rt->nzones_e == 0) + return 0; + + if (smbrt->cur_idx > smbrt->fur_idx) + smbrt->fur_idx = smbrt->cur_idx; + + if (smbrt->cur_idx < SOEMB_ACTIVE_CNT) { + mb = smbrt->svec[smbrt->cur_idx]; + smbrt->cur_idx++; + } + + if (mb) + return mb->mb_id; + + return 0; +} + +static int +heap_create_soe_mb(struct palloc_heap *heap, uint32_t *mb_id); + +void +heap_soemb_reserve(struct palloc_heap *heap) +{ + int i, ret; + uint32_t mb_id; + struct mbrt *mb; + struct soemb_rt *smbrt = &heap->rt->smbrt; + + if (heap->rt->nzones_e == 0) + return; + + if (smbrt->fur_idx > 1) { + mb = smbrt->svec[0]; + if (mb) + mbrt_qbs_insertmb(&smbrt->qbs, mb); + + for (i = 1; i < SOEMB_ACTIVE_CNT; i++) { + smbrt->svec[i - 1] = smbrt->svec[i]; + } + + smbrt->svec[SOEMB_ACTIVE_CNT - 1] = NULL; + smbrt->fur_idx = 0; + } + + for (i = 0; i < SOEMB_ACTIVE_CNT; i++) { + if (smbrt->svec[i] != NULL) + continue; + mb = mbrt_qbs_getmb(&smbrt->qbs, 0); + if (mb) { + smbrt->svec[i] = mb; + break; + } + ret = heap_create_soe_mb(heap, &mb_id); + if (ret == 0) { + smbrt->svec[i] = heap_mbrt_get_mb(heap, mb_id); + break; + } + mb = mbrt_qbs_getmb(&smbrt->qbs, 1); + if (mb) { + smbrt->svec[i] = mb; + break; + } + break; + } + smbrt->cur_idx = 0; +} + void heap_set_root_ptrs(struct palloc_heap *heap, uint64_t **offp, uint64_t **sizep) { @@ -720,8 +971,7 @@ zone_calc_size_idx(uint32_t zone_id, unsigned max_zone, size_t heap_size) * heap_zone_init -- (internal) writes zone's first chunk and header */ static void -heap_zone_init(struct palloc_heap *heap, uint32_t zone_id, uint32_t first_chunk_id, - bool is_evictable) +heap_zone_init(struct palloc_heap *heap, uint32_t zone_id, uint32_t first_chunk_id, int flags) { struct zone *z = ZID_TO_ZONE(&heap->layout_info, zone_id); uint32_t size_idx = zone_calc_size_idx(zone_id, heap->rt->nzones, heap->size); @@ -734,8 +984,11 @@ heap_zone_init(struct palloc_heap *heap, uint32_t zone_id, uint32_t first_chunk_ }; z->header = nhdr; /* write the entire header at once */ - if (is_evictable) - z->header.flags |= ZONE_EVICTABLE_MB; + + if (flags) { + D_ASSERT((flags == ZONE_EVICTABLE_MB) || (flags == ZONE_SOE_MB)); + z->header.flags = flags; + } mo_wal_persist(&heap->p_ops, &z->header, 
sizeof(z->header)); memblock_huge_init(heap, first_chunk_id, zone_id, size_idx - first_chunk_id); @@ -996,7 +1249,7 @@ heap_reclaim_next_ne(struct palloc_heap *heap, uint32_t *zone_id) heap_zinfo_get(heap, i, &allotted, &evictable); if (!allotted) continue; - if (!evictable) { + if (!evictable && !heap_mbrt_ismb_localrt(heap, i)) { h->zones_nextne_gc = i + 1; *zone_id = i; return 0; @@ -1032,29 +1285,22 @@ heap_get_next_unused_zone(struct palloc_heap *heap, uint32_t *zone_id) return 0; } -static int -heap_mark_zone_used_transient(struct palloc_heap *heap, uint32_t zone_id, bool is_evictable) +static void +heap_mark_zone_used_transient(struct palloc_heap *heap, struct mbrt *mb, uint32_t zone_id, + bool is_evictable) { - struct mbrt *mb; - if (is_evictable) { - mb = heap_mbrt_setup_mb(heap, zone_id); - if (mb == NULL) { - ERR("Failed to setup mbrt for zone %u\n", zone_id); - return -1; - } + D_ASSERT(mb != NULL); heap_mbrt_setmb_evictable(heap, mb); - } else - heap_mbrt_setmb_nonevictable(heap, zone_id); + heap->rt->zones_exhausted_e++; + } else { + heap_mbrt_setmb_nonevictable(heap, mb, zone_id); + heap->rt->zones_exhausted_ne++; + } heap->rt->zones_unused_first = zone_id + 1; if (heap->rt->zones_exhausted < heap->rt->zones_unused_first) heap->rt->zones_exhausted = heap->rt->zones_unused_first; - if (is_evictable) - heap->rt->zones_exhausted_e++; - else - heap->rt->zones_exhausted_ne++; - return 0; } static void @@ -1069,13 +1315,9 @@ heap_mark_zone_used_persist(struct palloc_heap *heap, uint32_t zone_id) static void heap_mark_zone_unused_transient(struct palloc_heap *heap, uint32_t zone_id) { - struct mbrt *mb = heap_mbrt_get_mb(heap, zone_id); - - if (heap_mbrt_ismb_evictable(heap, zone_id)) { - D_ASSERT(mb != NULL); - heap_mbrt_cleanup_mb(mb); + if (heap_mbrt_ismb_evictable(heap, zone_id)) heap->rt->zones_exhausted_e--; - } else + else heap->rt->zones_exhausted_ne--; heap_mbrt_setmb_unused(heap, zone_id); @@ -1086,15 +1328,20 @@ heap_mark_zone_unused_transient(struct palloc_heap *heap, uint32_t zone_id) heap->rt->zones_exhausted = zone_id; } -static void +static int heap_mark_zone_unused(struct palloc_heap *heap, uint32_t zone_id) { struct umem_cache_range rg = {0}; bool is_evictable = heap_mbrt_ismb_evictable(heap, zone_id); int rc; + struct mbrt *mb = heap_mbrt_get_mb(heap, zone_id); D_ASSERT(is_evictable == false); + if (heap_mbrt_ismb_localrt(heap, zone_id)) { + heap->rt->soemb_cnt--; + VALGRIND_DO_DESTROY_MEMPOOL_COND(ZID_TO_ZONE(&heap->layout_info, zone_id)); + } heap_mark_zone_unused_transient(heap, zone_id); rg.cr_off = GET_ZONE_OFFSET(zone_id); rg.cr_size = @@ -1103,10 +1350,13 @@ heap_mark_zone_unused(struct palloc_heap *heap, uint32_t zone_id) if (rc != 0) { rc = daos_der2errno(rc); ERR("Failed to remap zone %d in umem cache as unused rc=%d\n", zone_id, rc); - heap_mark_zone_used_transient(heap, zone_id, is_evictable); + heap_mark_zone_used_transient(heap, mb, zone_id, is_evictable); + VALGRIND_DO_CREATE_MEMPOOL(ZID_TO_ZONE(&heap->layout_info, zone_id), 0, 0); + return -1; } heap_zinfo_set_usage(heap, zone_id, MB_U0_HINT); heap_zinfo_set(heap, zone_id, false, false); + return 0; } int @@ -1115,14 +1365,16 @@ heap_populate_nemb_unused(struct palloc_heap *heap) struct bucket *defb; struct memory_block m = MEMORY_BLOCK_NONE; struct mbrt *mb; + int rc; m.size_idx = MAX_CHUNK; mb = heap_mbrt_get_mb(heap, 0); defb = mbrt_bucket_acquire(mb, DEFAULT_ALLOC_CLASS_ID); while (bucket_alloc_block(defb, &m) == 0) { - heap->rt->empty_nemb_cnt--; - heap_mark_zone_unused(heap, m.zone_id); + rc 
= heap_mark_zone_unused(heap, m.zone_id); + if (!rc) + heap->rt->empty_nemb_cnt--; m = MEMORY_BLOCK_NONE; m.size_idx = MAX_CHUNK; @@ -1163,9 +1415,7 @@ heap_populate_bucket(struct palloc_heap *heap, struct bucket *bucket) if (rc) return ENOMEM; - rc = heap_mark_zone_used_transient(heap, zone_id, false); - if (rc) - return ENOMEM; + heap_mark_zone_used_transient(heap, NULL, zone_id, false); /* Create a umem cache map for the new zone */ rg.cr_off = GET_ZONE_OFFSET(zone_id); @@ -1196,13 +1446,12 @@ heap_populate_bucket(struct palloc_heap *heap, struct bucket *bucket) VALGRIND_ADD_TO_GLOBAL_TX_IGNORE(z, sizeof(z->header) + sizeof(z->chunk_headers)); - heap_zone_init(heap, zone_id, 0, false); + heap_zone_init(heap, zone_id, 0, 0); heap_mark_zone_used_persist(heap, zone_id); reclaim_garbage: heap_reclaim_zone_garbage(heap, bucket, zone_id); heap_reclaim_setlast_ne(heap, zone_id); - /* * It doesn't matter that this function might not have found any * free blocks because there is still potential that subsequent calls @@ -1602,7 +1851,7 @@ heap_cleanup(struct palloc_heap *heap) if (On_memcheck) { for (i = 0; i < heap->rt->zones_exhausted; i++) { if (!heap_mbrt_ismb_initialized(heap, i) || - !heap_mbrt_ismb_evictable(heap, i)) + !heap_mbrt_ismb_localrt(heap, i)) continue; if (umem_cache_offisloaded(heap->layout_info.store, GET_ZONE_OFFSET(i))) VALGRIND_DO_DESTROY_MEMPOOL(ZID_TO_ZONE(&heap->layout_info, i)); @@ -1610,6 +1859,7 @@ heap_cleanup(struct palloc_heap *heap) } #endif heap_mbrt_fini(heap); + soemb_fini(&heap->rt->smbrt); D_FREE(rt); heap->rt = NULL; @@ -1686,7 +1936,7 @@ heap_ensure_zone0_initialized(struct palloc_heap *heap) struct bucket *b; int rc = 0; - heap_mbrt_setmb_nonevictable(heap, 0); + heap_mbrt_setmb_nonevictable(heap, NULL, 0); if (heap->layout_info.zone0->header.magic != ZONE_HEADER_MAGIC) { /* If not magic the content should be zero, indicating new file */ D_ASSERT(heap->layout_info.zone0->header.magic == 0); @@ -1772,6 +2022,9 @@ heap_boot(struct palloc_heap *heap, void *mmap_base, uint64_t heap_size, uint64_ h->nlocks = On_valgrind ? 
MAX_RUN_LOCKS_VG : MAX_RUN_LOCKS; for (unsigned i = 0; i < h->nlocks; ++i) util_mutex_init(&h->run_locks[i]); + + soemb_init(&h->smbrt); + heap->rt = h; heap->p_ops = *p_ops; @@ -1855,6 +2108,7 @@ heap_create_evictable_mb(struct palloc_heap *heap, uint32_t *mb_id) struct zone *z; struct umem_pin_handle *pin_handle = NULL; struct umem_store *store = heap->layout_info.store; + struct mbrt *mb; D_ASSERT(heap->rt->active_evictable_mb == NULL); @@ -1879,13 +2133,16 @@ heap_create_evictable_mb(struct palloc_heap *heap, uint32_t *mb_id) goto out; } - rc = heap_mark_zone_used_transient(heap, zone_id, true); - if (rc) { + mb = heap_mbrt_setup_mb(heap, zone_id); + if (mb == NULL) { + ERR("Failed to setup mbrt for zone %u\n", zone_id); rc = 1; errno = ENOMEM; goto out; } + heap_mark_zone_used_transient(heap, mb, zone_id, true); + /* Create a umem cache map for the new zone */ rg.cr_off = GET_ZONE_OFFSET(zone_id); rg.cr_size = @@ -1921,7 +2178,7 @@ heap_create_evictable_mb(struct palloc_heap *heap, uint32_t *mb_id) if (rc) goto error; - heap_zone_init(heap, zone_id, 0, true); + heap_zone_init(heap, zone_id, 0, ZONE_EVICTABLE_MB); rc = heap_mbrt_mb_reclaim_garbage(heap, zone_id); if (rc) { ERR("Failed to initialize evictable zone %u", zone_id); @@ -1940,6 +2197,7 @@ heap_create_evictable_mb(struct palloc_heap *heap, uint32_t *mb_id) if (pin_handle) umem_cache_unpin(heap->layout_info.store, pin_handle); heap_mark_zone_unused_transient(heap, zone_id); + heap_mbrt_cleanup_mb(mb); rc = -1; out: @@ -1952,6 +2210,84 @@ heap_create_evictable_mb(struct palloc_heap *heap, uint32_t *mb_id) return rc; } +static int +heap_create_soe_mb(struct palloc_heap *heap, uint32_t *mb_id) +{ + uint32_t zone_id; + struct umem_cache_range rg = {0}; + int rc; + struct zone *z; + struct mbrt *mb; + + if (heap->rt->zones_exhausted_ne >= heap->rt->nzones_ne) + return -1; + + rc = heap_get_next_unused_zone(heap, &zone_id); + if (rc) { + D_ERROR("Failed to obtain free zone for evictable mb"); + rc = 1; + errno = ENOMEM; + goto out; + } + + mb = heap_mbrt_setup_mb(heap, zone_id); + if (mb == NULL) { + ERR("Failed to setup mbrt for zone %u\n", zone_id); + rc = 1; + errno = ENOMEM; + goto out; + } + + heap_mark_zone_used_transient(heap, mb, zone_id, false); + + /* Create a umem cache map for the new zone */ + rg.cr_off = GET_ZONE_OFFSET(zone_id); + rg.cr_size = + ((heap->size - rg.cr_off) > ZONE_MAX_SIZE) ? 
ZONE_MAX_SIZE : heap->size - rg.cr_off; + + rc = umem_cache_map(heap->layout_info.store, &rg, 1); + if (rc != 0) { + ERR("Failed to map zone %u to umem cache\n", zone_id); + errno = daos_der2errno(rc); + goto error; + } + + D_DEBUG(DB_TRACE, "Creating evictable zone %d\n", zone_id); + + z = ZID_TO_ZONE(&heap->layout_info, zone_id); + VALGRIND_DO_CREATE_MEMPOOL(z, 0, 0); + VALGRIND_DO_MAKE_MEM_UNDEFINED(z, rg.cr_size); + if (rg.cr_size != ZONE_MAX_SIZE) + VALGRIND_DO_MAKE_MEM_NOACCESS(z + rg.cr_size, (ZONE_MAX_SIZE - rg.cr_size)); + + memset(z, 0, rg.cr_size); + + /* ignore zone and chunk headers */ + VALGRIND_ADD_TO_GLOBAL_TX_IGNORE(z, sizeof(z->header) + sizeof(z->chunk_headers)); + + heap_zone_init(heap, zone_id, 0, ZONE_SOE_MB); + rc = heap_mbrt_mb_reclaim_garbage(heap, zone_id); + if (rc) { + ERR("Failed to initialize evictable zone %u", zone_id); + goto error; + } + heap_mark_zone_used_persist(heap, zone_id); + + *mb_id = zone_id; + rc = 0; + heap_incr_empty_nemb_cnt(heap); + heap->rt->soemb_cnt++; + goto out; + +error: + heap_mark_zone_unused_transient(heap, zone_id); + heap_mbrt_cleanup_mb(mb); + rc = -1; + +out: + return rc; +} + int heap_get_evictable_mb(struct palloc_heap *heap, uint32_t *mb_id) { @@ -1971,34 +2307,33 @@ heap_get_evictable_mb(struct palloc_heap *heap, uint32_t *mb_id) } heap->rt->mb_pressure = 0; - if ((mb = TAILQ_FIRST(&heap->rt->mb_u30)) != NULL) - TAILQ_REMOVE(&heap->rt->mb_u30, mb, mb_link); - else if ((mb = TAILQ_FIRST(&heap->rt->mb_u0)) != NULL) - TAILQ_REMOVE(&heap->rt->mb_u0, mb, mb_link); - else if ((ret = heap_create_evictable_mb(heap, mb_id)) >= 0) { + mb = mbrt_qbs_getmb(&heap->rt->emb_qbs, 0); + if (mb) + goto out; + + if ((ret = heap_create_evictable_mb(heap, mb_id)) >= 0) { if (ret) goto retry; mb = heap_mbrt_get_mb(heap, *mb_id); D_ASSERT(mb != NULL); if (heap->rt->active_evictable_mb) { - TAILQ_INSERT_HEAD(&heap->rt->mb_u0, mb, mb_link); - mb->qptr = &heap->rt->mb_u0; + mbrt_qbs_insertmb(&heap->rt->emb_qbs, mb); *mb_id = heap->rt->active_evictable_mb->mb_id; return 0; } - } else if ((mb = TAILQ_FIRST(&heap->rt->mb_u75)) != NULL) { - TAILQ_REMOVE(&heap->rt->mb_u75, mb, mb_link); - heap->rt->mb_pressure = 1; - } else if ((mb = TAILQ_FIRST(&heap->rt->mb_u90)) != NULL) { - TAILQ_REMOVE(&heap->rt->mb_u90, mb, mb_link); - heap->rt->mb_pressure = 1; - } else { + goto out; + } + mb = mbrt_qbs_getmb(&heap->rt->emb_qbs, 1); + + heap->rt->mb_pressure = 1; + + if (mb == NULL) { D_ERROR("Failed to get an evictable MB"); *mb_id = 0; return 0; } +out: heap->rt->active_evictable_mb = mb; - mb->qptr = NULL; *mb_id = mb->mb_id; return 0; } @@ -2008,7 +2343,7 @@ heap_off2mbid(struct palloc_heap *heap, uint64_t offset) { struct memory_block m = memblock_from_offset_opt(heap, offset, 0); - if (heap_mbrt_ismb_evictable(heap, m.zone_id)) + if (heap_mbrt_ismb_localrt(heap, m.zone_id)) return m.zone_id; else return 0; @@ -2044,7 +2379,7 @@ heap_update_mbrt_zinfo(struct palloc_heap *heap, bool init) continue; } if (!evictable) { - heap_mbrt_setmb_nonevictable(heap, i); + heap_mbrt_setmb_nonevictable(heap, NULL, i); nemb_cnt++; } else { mb = heap_mbrt_setup_mb(heap, i); @@ -2078,9 +2413,10 @@ heap_update_mbrt_zinfo(struct palloc_heap *heap, bool init) int heap_load_nonevictable_zones(struct palloc_heap *heap) { - int i, rc; - bool allotted, evictable; + int i, rc; + bool allotted, evictable; struct zone *zone; + struct mbrt *mb; for (i = 1; i < heap->rt->zones_exhausted; i++) { heap_zinfo_get(heap, i, &allotted, &evictable); @@ -2091,10 +2427,20 @@ 
heap_load_nonevictable_zones(struct palloc_heap *heap) if (rc) return rc; zone = ZID_TO_ZONE(&heap->layout_info, i); + D_ASSERT((zone->header.flags & ZONE_EVICTABLE_MB) == 0); + if (zone->header.flags & ZONE_SOE_MB) { + mb = heap_mbrt_setup_mb(heap, i); + if (mb == NULL) { + D_ERROR("failed to load soe mb"); + return ENOMEM; + } + heap_mbrt_setmb_nonevictable(heap, mb, i); + mbrt_qbs_insertmb(&heap->rt->smbrt.qbs, mb); + heap->rt->soemb_cnt++; + } if (!zone->header.sp_usage) heap_incr_empty_nemb_cnt(heap); - else - heap_mbrt_incrmb_usage(heap, 0, zone->header.sp_usage); + heap_mbrt_incrmb_usage(heap, i, zone->header.sp_usage); } } return 0; @@ -2291,6 +2637,57 @@ heap_decr_empty_nemb_cnt(struct palloc_heap *heap) return heap->rt->empty_nemb_cnt ? --heap->rt->empty_nemb_cnt : 0; } +static void +heap_recycle_soembs(struct palloc_heap *heap) +{ + struct mbrt *mb; + struct bucket *defb, *b; + struct memory_block m = MEMORY_BLOCK_NONE; + int i, rc; + + for (i = 0; i < SOEMB_ACTIVE_CNT; i++) { + mb = heap->rt->smbrt.svec[i]; + if (mb && (mb->space_usage == 0)) { + mbrt_qbs_insertmb(&heap->rt->smbrt.qbs, mb); + heap->rt->smbrt.svec[i] = NULL; + } + } + + while ((mb = mbrt_qbs_getmb_ue(&heap->rt->smbrt.qbs)) != NULL) { + defb = mbrt_bucket_acquire(mb, DEFAULT_ALLOC_CLASS_ID); + if (!mb->garbage_reclaimed) { + heap_reclaim_zone_garbage(heap, defb, mb->mb_id); + mb->garbage_reclaimed = 1; + } + mbrt_bucket_release(defb); + for (i = 0; i < MAX_ALLOCATION_CLASSES; i++) { + if (mb->buckets[i] == NULL) + continue; + b = bucket_acquire(mb->buckets[i]); + heap_detach_and_try_discard_run(heap, b); + mbrt_bucket_release(b); + } + defb = mbrt_bucket_acquire(mb, DEFAULT_ALLOC_CLASS_ID); + heap_reclaim_garbage(heap, defb); + m = MEMORY_BLOCK_NONE; + m.size_idx = MAX_CHUNK; + if (bucket_alloc_block(defb, &m) == 0) { + rc = heap_mark_zone_unused(heap, m.zone_id); + if (rc) + mbrt_qbs_insertmb_force(&heap->rt->smbrt.qbs, mb, MB_U0_HINT); + else + heap->rt->empty_nemb_cnt--; + mbrt_bucket_release(defb); + heap_mbrt_cleanup_mb(mb); + } else { + mbrt_bucket_release(defb); + mbrt_qbs_insertmb_force(&heap->rt->smbrt.qbs, mb, MB_U0_HINT); + } + } + + return; +} + int heap_force_recycle(struct palloc_heap *heap) { @@ -2311,6 +2708,7 @@ heap_force_recycle(struct palloc_heap *heap) } } + heap_recycle_soembs(heap); defb = mbrt_bucket_acquire(mb, DEFAULT_ALLOC_CLASS_ID); while (heap_reclaim_next_ne(heap, &zone_id) == 0) { @@ -2325,10 +2723,6 @@ heap_force_recycle(struct palloc_heap *heap) heap_populate_nemb_unused(heap); mb->prev_usage = mb->space_usage; - if (max_reclaim && (heap->rt->empty_nemb_cnt >= heap->rt->empty_nemb_gcth)) - D_WARN("Force GC failed to free up enough nembs, cnt = %d", - heap->rt->empty_nemb_cnt); - return 0; } @@ -2395,7 +2789,7 @@ heap_vg_open(struct palloc_heap *heap, object_callback cb, void *arg, int object if (!heap_mbrt_ismb_initialized(heap, i)) continue; - if (heap_mbrt_ismb_evictable(heap, i)) + if (heap_mbrt_ismb_localrt(heap, i)) VALGRIND_DO_CREATE_MEMPOOL(ZID_TO_ZONE(&heap->layout_info, i), 0, 0); heap_vg_zone_open(heap, i, cb, arg, objects); diff --git a/src/common/dav_v2/heap.h b/src/common/dav_v2/heap.h index f2e710b4ce9..a5cc76d6ba6 100644 --- a/src/common/dav_v2/heap.h +++ b/src/common/dav_v2/heap.h @@ -44,25 +44,21 @@ heap_mbrt_update_alloc_class_buckets(struct palloc_heap *heap, struct mbrt *mb, struct alloc_class *c); int heap_extend(struct palloc_heap *heap, struct bucket *defb, size_t size); -void -heap_mbrt_setmb_evictable(struct palloc_heap *heap, struct mbrt *mb); -bool 
-heap_mbrt_ismb_initialized(struct palloc_heap *heap, uint32_t zone_id); bool heap_mbrt_ismb_evictable(struct palloc_heap *heap, uint32_t zone_id); void -heap_mbrt_setmb_nonevictable(struct palloc_heap *heap, uint32_t zone_id); -void heap_mbrt_setmb_usage(struct palloc_heap *heap, uint32_t zone_id, uint64_t usage); int heap_mbrt_getmb_usage(struct palloc_heap *heap, uint32_t zone_id, uint64_t *allotted, uint64_t *maxsz); void heap_mbrt_incrmb_usage(struct palloc_heap *heap, uint32_t zone_id, int size); -struct mbrt * -heap_mbrt_setup_mb(struct palloc_heap *heap, uint32_t zone_id); -int -heap_mbrt_mb_reclaim_garbage(struct palloc_heap *heap, uint32_t zid); +void +heap_soemb_active_iter_init(struct palloc_heap *heap); +uint32_t +heap_soemb_active_get(struct palloc_heap *heap); +void +heap_soemb_reserve(struct palloc_heap *heap); int heap_ensure_zone0_initialized(struct palloc_heap *heap); int diff --git a/src/common/dav_v2/heap_layout.h b/src/common/dav_v2/heap_layout.h index fa65846921e..098cb752de8 100644 --- a/src/common/dav_v2/heap_layout.h +++ b/src/common/dav_v2/heap_layout.h @@ -82,8 +82,9 @@ enum chunk_type { MAX_CHUNK_TYPE }; -/* zone header flags */ +/* zone header bit flags */ #define ZONE_EVICTABLE_MB 0x0001 +#define ZONE_SOE_MB 0x0002 struct chunk { uint8_t data[CHUNKSIZE]; diff --git a/src/common/dav_v2/palloc.c b/src/common/dav_v2/palloc.c index a82c887f5b1..80ed26fad98 100644 --- a/src/common/dav_v2/palloc.c +++ b/src/common/dav_v2/palloc.c @@ -212,6 +212,8 @@ palloc_reservation_create(struct palloc_heap *heap, size_t size, palloc_constr c return -1; } + heap_soemb_active_iter_init(heap); + retry: mb = heap_mbrt_get_mb(heap, mb_id); if (mb == NULL) { @@ -288,7 +290,7 @@ palloc_reservation_create(struct palloc_heap *heap, size_t size, palloc_constr c */ if ((mb_id != 0) && (err == ENOMEM)) { heap_mbrt_log_alloc_failure(heap, mb_id); - mb_id = 0; + mb_id = heap_soemb_active_get(heap); goto retry; } diff --git a/src/common/dav_v2/tx.c b/src/common/dav_v2/tx.c index 98e6d6d314e..85bde8c16dc 100644 --- a/src/common/dav_v2/tx.c +++ b/src/common/dav_v2/tx.c @@ -559,6 +559,7 @@ dav_tx_begin_v2(dav_obj_t *pop, jmp_buf env, ...) 
sizeof(struct tx_range_def)); tx->first_snapshot = 1; tx->pop = pop; + heap_soemb_reserve(pop->do_heap); } else { FATAL("Invalid stage %d to begin new transaction", tx->stage); } diff --git a/src/common/tests/umem_test_bmem.c b/src/common/tests/umem_test_bmem.c index cd745c48dc8..a3c54d11e37 100644 --- a/src/common/tests/umem_test_bmem.c +++ b/src/common/tests/umem_test_bmem.c @@ -31,6 +31,7 @@ #define POOL_SIZE ((256 * 1024 * 1024ULL)) #define NEMB_RATIO (0.8) #define MB_SIZE (16 * 1024 * 1024) +#define MIN_SOEMB_CNT 3 struct test_arg { struct utest_context *ta_utx; @@ -183,7 +184,7 @@ struct umem_store_ops _store_ops_v2 = { .so_wal_id_cmp = wal_id_cmp, }; -struct umem_store ustore_v2 = {.stor_size = POOL_SIZE * 2, +struct umem_store ustore_v2 = {.stor_size = POOL_SIZE * 3, .stor_ops = &_store_ops_v2, .store_type = DAOS_MD_BMEM_V2, .stor_priv = (void *)(UINT64_MAX)}; @@ -219,7 +220,7 @@ setup_pmem_internal(void **state, struct umem_store *store) return 1; } - rc = utest_pmem_create(arg->ta_pool_name, POOL_SIZE, sizeof(*arg->ta_root), store, + rc = utest_pmem_create(arg->ta_pool_name, POOL_SIZE * 2, sizeof(*arg->ta_root), store, &arg->ta_utx); if (rc != 0) { perror("Could not create pmem context"); @@ -244,7 +245,24 @@ setup_pmem(void **state) static int setup_pmem_v2(void **state) { - return setup_pmem_internal(state, &ustore_v2); + struct test_arg *arg; + struct umem_instance *umm; + int rc, i; + + rc = setup_pmem_internal(state, &ustore_v2); + + arg = *state; + umm = utest_utx2umm(arg->ta_utx); + /* + * Do soemb reservations before the test begins. + */ + if (!rc) { + for (i = 0; i < MIN_SOEMB_CNT; i++) { + umem_tx_begin(umm, NULL); + umem_tx_commit(umm); + } + } + return rc; } static int @@ -2239,7 +2257,7 @@ test_tx_alloc_from_multimb(void **state) uint32_t id; int i; - for (i = 0; i < 10; i++) { + for (i = 0; i < 8; i++) { /* Create an MB and fill it with allocs */ ainfo[i].mb_id = umem_allot_mb_evictable(umm, 0); ainfo[i].num_allocs = 0; @@ -2257,11 +2275,9 @@ test_tx_alloc_from_multimb(void **state) /* Free 15% of space for MB 5 */ free_bucket_by_pct(umm, &ainfo[5], 20); /* 75-90 */ /* Free 10% of space for MB 6 */ - free_bucket_by_pct(umm, &ainfo[6], 18); /* 75-90 */ - /* Free 50% of space for MB 7 */ - free_bucket_by_pct(umm, &ainfo[7], 50); /* 30-75 */ + free_bucket_by_pct(umm, &ainfo[6], 50); /* 30-75 */ /* Free 90% of space for MB 8 */ - free_bucket_by_pct(umm, &ainfo[8], 90); /* 0-30 */ + free_bucket_by_pct(umm, &ainfo[7], 90); /* 0-30 */ /* Allocator should return mb with utilization 30%-75% */ id = umem_allot_mb_evictable(umm, 0); @@ -2269,9 +2285,9 @@ test_tx_alloc_from_multimb(void **state) assert_true(id == ainfo[3].mb_id); alloc_bucket_to_full(umm, &ainfo[3]); id = umem_allot_mb_evictable(umm, 0); - print_message("obtained id %d, expected is %d\n", id, ainfo[7].mb_id); - assert_true(id == ainfo[7].mb_id); - alloc_bucket_to_full(umm, &ainfo[7]); + print_message("obtained id %d, expected is %d\n", id, ainfo[6].mb_id); + assert_true(id == ainfo[6].mb_id); + alloc_bucket_to_full(umm, &ainfo[6]); /* Next preference should be 0%-30% */ id = umem_allot_mb_evictable(umm, 0); @@ -2279,13 +2295,13 @@ test_tx_alloc_from_multimb(void **state) assert_true(id == ainfo[4].mb_id); alloc_bucket_to_full(umm, &ainfo[4]); id = umem_allot_mb_evictable(umm, 0); - print_message("obtained id %d, expected is %d\n", id, ainfo[8].mb_id); - assert_true(id == ainfo[8].mb_id); - alloc_bucket_to_full(umm, &ainfo[8]); + print_message("obtained id %d, expected is %d\n", id, ainfo[7].mb_id); + 
assert_true(id == ainfo[7].mb_id); + alloc_bucket_to_full(umm, &ainfo[7]); /* Next is to create a new memory bucket. */ id = umem_allot_mb_evictable(umm, 0); - for (i = 0; i < 10; i++) + for (i = 0; i < 8; i++) assert_true(id != ainfo[i].mb_id); print_message("obtained id %d\n", id); @@ -2384,7 +2400,7 @@ test_umempobj_create_smallsize(void **state) static void test_umempobj_nemb_usage(void **state) { - int num = 0; + int num = 0, i; char *name; struct umem_store ustore_tmp = {.stor_size = 256 * 1024 * 1024, .stor_ops = &_store_ops_v2, @@ -2399,13 +2415,21 @@ test_umempobj_nemb_usage(void **state) /* Create a heap and cache of size 256MB and 249MB (16 & 15 zones) respectively */ D_ASPRINTF(name, "/mnt/daos/umem-test-tmp-%d", 0); assert_true(name != NULL); - uma.uma_pool = umempobj_create(name, "invalid_pool", UMEMPOBJ_ENABLE_STATS, - 240 * 1024 * 1024, 0666, &ustore_tmp); + uma.uma_pool = umempobj_create(name, "valid_pool", UMEMPOBJ_ENABLE_STATS, 240 * 1024 * 1024, + 0666, &ustore_tmp); assert_ptr_not_equal(uma.uma_pool, NULL); umem_class_init(&uma, &umm); - /* Do allocation and verify that only 13 zones allotted to non evictable MBs */ + /* Do the SOEMB reservation before the actual test. */ + for (i = 0; i < MIN_SOEMB_CNT; i++) { + umem_tx_begin(&umm, NULL); + umem_tx_commit(&umm); + } + + /* Do allocation and verify that only 10 zones allotted to non evictable MBs + * 3 zones are reserved for soemb + */ for (num = 0;; num++) { /* do an allocation that takes more than half the zone size */ umoff = umem_atomic_alloc(&umm, alloc_size, UMEM_TYPE_ANY); @@ -2416,7 +2440,7 @@ test_umempobj_nemb_usage(void **state) prev_umoff = umoff; } /* 80% nemb when heap size greater than cache size */ - assert_int_equal(num, 13); + assert_int_equal(num, 13 - MIN_SOEMB_CNT); print_message("Number of allocations is %d\n", num); for (--num;; num--) { @@ -2436,12 +2460,18 @@ test_umempobj_nemb_usage(void **state) /* Create a heap and cache of size 256MB (16 zones) each */ D_ASPRINTF(name, "/mnt/daos/umem-test-tmp-%d", 1); assert_true(name != NULL); - uma.uma_pool = umempobj_create(name, "invalid_pool", UMEMPOBJ_ENABLE_STATS, - 256 * 1024 * 1024, 0666, &ustore_tmp); + uma.uma_pool = umempobj_create(name, "valid_pool", UMEMPOBJ_ENABLE_STATS, 256 * 1024 * 1024, + 0666, &ustore_tmp); assert_ptr_not_equal(uma.uma_pool, NULL); umem_class_init(&uma, &umm); + /* Do the SOEMB reservation before the actual test. 
*/ + for (i = 0; i < MIN_SOEMB_CNT; i++) { + umem_tx_begin(&umm, NULL); + umem_tx_commit(&umm); + } + /* Do allocation and verify that all 16 zones are allotted to non evictable MBs */ for (num = 0;; num++) { /* do an allocation that takes more than half the zone size */ @@ -2472,7 +2502,7 @@ test_umempobj_nemb_usage(void **state) static void test_umempobj_heap_mb_stats(void **state) { - int num = 0, count, rc; + int num = 0, count, rc, i; char *name; uint64_t scm_size = 128 * 1024 * 1024; uint64_t meta_size = 256 * 1024 * 1024; @@ -2484,7 +2514,8 @@ test_umempobj_heap_mb_stats(void **state) struct umem_instance umm; umem_off_t umoff, *ptr = NULL, prev_umoff = UMOFF_NULL; size_t alloc_size = 128; - uint64_t allocated, allocated0, allocated1, maxsz, maxsz_exp; + uint64_t allocated, allocated0, allocated1; + uint64_t maxsz, maxsz_exp, maxsz_alloc; uint32_t mb_id; uma.uma_id = umempobj_backend_type2class_id(ustore_tmp.store_type); @@ -2494,7 +2525,8 @@ test_umempobj_heap_mb_stats(void **state) uma.uma_pool = umempobj_create(name, "invalid_pool", UMEMPOBJ_ENABLE_STATS, scm_size, 0666, &ustore_tmp); assert_ptr_not_equal(uma.uma_pool, NULL); - maxsz_exp = (uint64_t)(scm_size / MB_SIZE * NEMB_RATIO) * MB_SIZE; + maxsz_exp = (uint64_t)(scm_size / MB_SIZE * NEMB_RATIO) * MB_SIZE; + maxsz_alloc = ((uint64_t)(((scm_size / MB_SIZE) * NEMB_RATIO)) - MIN_SOEMB_CNT) * MB_SIZE; umem_class_init(&uma, &umm); @@ -2504,6 +2536,12 @@ test_umempobj_heap_mb_stats(void **state) assert_int_equal(rc, 0); assert_int_equal(maxsz, maxsz_exp); + /* Do the SOEMB reservation before the actual test. */ + for (i = 0; i < MIN_SOEMB_CNT; i++) { + umem_tx_begin(&umm, NULL); + umem_tx_commit(&umm); + } + /* allocate and consume all of the space */ for (num = 0;; num++) { umoff = umem_atomic_alloc(&umm, alloc_size, UMEM_TYPE_ANY); @@ -2516,7 +2554,7 @@ test_umempobj_heap_mb_stats(void **state) rc = umempobj_get_mbusage(umm.umm_pool, 0, &allocated1, &maxsz); print_message("NE usage max_size = %lu allocated = %lu\n", maxsz, allocated1); assert_int_equal(rc, 0); - assert_true(allocated1 * 100 / maxsz >= 99); + assert_true(allocated1 * 100 / maxsz_alloc >= 99); assert_int_equal(maxsz, maxsz_exp); for (count = num; count > num / 2; count--) { @@ -2530,7 +2568,8 @@ test_umempobj_heap_mb_stats(void **state) rc = umempobj_get_mbusage(umm.umm_pool, 0, &allocated, &maxsz); print_message("NE usage max_size = %lu allocated = %lu\n", maxsz, allocated); assert_int_equal(rc, 0); - assert_true(allocated < allocated1 / 2); + assert_true(allocated < ((allocated1 / 2) + alloc_size)); + assert_true((allocated + alloc_size) > (allocated1 / 2)); assert_int_equal(maxsz, maxsz_exp); for (;;) { umoff = *ptr; diff --git a/src/vos/tests/vts_wal.c b/src/vos/tests/vts_wal.c index 83303ad213d..ad79f0c4819 100644 --- a/src/vos/tests/vts_wal.c +++ b/src/vos/tests/vts_wal.c @@ -629,20 +629,22 @@ setup_wal_io(void **state) static struct io_test_args test_args; -#define MDTEST_META_BLOB_SIZE (256 * 1024 * 1024UL) -#define MDTEST_VOS_SIZE (160 * 1024 * 1024UL) +#define MDTEST_MIN_SOEMB_CNT 3 +#define MDTEST_MAX_NEMB_CNT 9 +#define MDTEST_MAX_EMB_CNT 8 #define MDTEST_MB_SIZE (16 * 1024 * 1024UL) -#define MDTEST_MB_CNT (MDTEST_META_BLOB_SIZE / MDTEST_MB_SIZE) -#define MDTEST_MB_VOS_CNT (MDTEST_VOS_SIZE / MDTEST_MB_SIZE) -#define MDTEST_MAX_NEMB_CNT (MDTEST_MB_VOS_CNT * 8 / 10) -#define MDTEST_MAX_EMB_CNT (MDTEST_MB_CNT - MDTEST_MAX_NEMB_CNT) +#define MDTEST_META_BLOB_SIZE \ + ((MDTEST_MIN_SOEMB_CNT + MDTEST_MAX_NEMB_CNT + MDTEST_MAX_EMB_CNT) * MDTEST_MB_SIZE) 
+#define MDTEST_VOS_SIZE ((MDTEST_MIN_SOEMB_CNT + MDTEST_MAX_NEMB_CNT) * 10 / 8 * MDTEST_MB_SIZE) +#define MDTEST_MB_VOS_CNT ((int)(MDTEST_VOS_SIZE / MDTEST_MB_SIZE)) +#define MDTEST_MB_CNT ((int)(MDTEST_META_BLOB_SIZE / MDTEST_MB_SIZE)) static int setup_mb_io(void **state) { int rc; - d_setenv("DAOS_NEMB_EMPTY_RECYCLE_THRESHOLD", "4", true); + d_setenv("DAOS_NEMB_EMPTY_RECYCLE_THRESHOLD", "2", true); memset(&test_args, 0, sizeof(test_args)); rc = vts_ctx_init_ex(&test_args.ctx, MDTEST_VOS_SIZE, MDTEST_META_BLOB_SIZE); *state = (void *)&test_args; @@ -1345,6 +1347,7 @@ struct bucket_alloc_info { uint32_t num_allocs; uint32_t mb_id; uint32_t alloc_size; + bool allow_spill; }; #define CHECKPOINT_FREQ 10000 @@ -1387,7 +1390,8 @@ alloc_bucket_to_full(struct umem_instance *umm, struct bucket_alloc_info *ainfo, ainfo->start_umoff = umem_alloc_from_bucket(umm, alloc_size, id); assert_false(UMOFF_IS_NULL(ainfo->start_umoff)); ainfo->num_allocs++; - assert_true(umem_get_mb_from_offset(umm, ainfo->start_umoff) == id); + if (!ainfo->allow_spill) + assert_true(umem_get_mb_from_offset(umm, ainfo->start_umoff) == id); prev_umoff = ainfo->start_umoff; ptr = (umem_off_t *)umem_off2ptr(umm, prev_umoff); *ptr = UMOFF_NULL; @@ -1407,7 +1411,8 @@ alloc_bucket_to_full(struct umem_instance *umm, struct bucket_alloc_info *ainfo, umem_tx_begin(umm, NULL); umoff = umem_alloc_from_bucket(umm, alloc_size, id); - if (UMOFF_IS_NULL(umoff) || (umem_get_mb_from_offset(umm, umoff) != id)) { + if (UMOFF_IS_NULL(umoff) || + (!ainfo->allow_spill && (umem_get_mb_from_offset(umm, umoff) != id))) { umem_tx_abort(umm, 1); break; } @@ -1452,7 +1457,8 @@ free_bucket_by_pct(struct umem_instance *umm, struct bucket_alloc_info *ainfo, i umoff = ainfo->start_umoff; for (i = 0; i < num_free; i++) { - assert_true(umem_get_mb_from_offset(umm, umoff) == ainfo->mb_id); + if (!ainfo->allow_spill) + assert_true(umem_get_mb_from_offset(umm, umoff) == ainfo->mb_id); ptr = (umem_off_t *)umem_off2ptr(umm, umoff); next_umoff = *ptr; umem_atomic_free(umm, umoff); @@ -1470,6 +1476,35 @@ free_bucket_by_pct(struct umem_instance *umm, struct bucket_alloc_info *ainfo, i ainfo->start_umoff, ainfo->num_allocs); } +static void +traverse_bucket(struct umem_instance *umm, struct bucket_alloc_info *ainfo) +{ + int num_elems = ainfo->num_allocs; + umem_off_t umoff, *ptr; + struct umem_pin_handle *p_hdl; + struct umem_cache_range rg = {0}; + int i, rc; + + if (UMOFF_IS_NULL(ainfo->start_umoff)) + return; + rg.cr_off = umem_get_mb_base_offset(umm, ainfo->mb_id); + rg.cr_size = 1; + rc = umem_cache_pin(&umm->umm_pool->up_store, &rg, 1, 0, &p_hdl); + assert_true(rc == 0); + + umoff = ainfo->start_umoff; + for (i = 1; i < num_elems * 2; i++) { + ptr = (umem_off_t *)umem_off2ptr(umm, umoff); + umoff = *ptr; + if (UMOFF_IS_NULL(umoff)) + break; + } + umem_cache_unpin(&umm->umm_pool->up_store, p_hdl); + print_message("Traverse: Bucket %d, start off %lu num_allocation %d actual_found %d\n", + ainfo->mb_id, ainfo->start_umoff, ainfo->num_allocs, i); + assert_true(i == num_elems); +} + static void wal_mb_utilization_tests(void **state) { @@ -1491,6 +1526,7 @@ wal_mb_utilization_tests(void **state) ainfo[i].num_allocs = 0; ainfo[i].start_umoff = UMOFF_NULL; ainfo[i].alloc_size = 0; + ainfo[i].allow_spill = 0; assert_true(ainfo[i].mb_id != 0); alloc_bucket_to_full(umm, &ainfo[i], checkpoint_fn, &arg->ctx.tc_po_hdl); } @@ -1545,6 +1581,7 @@ wal_mb_utilization_tests(void **state) ainfo[i].num_allocs = 0; ainfo[i].start_umoff = UMOFF_NULL; ainfo[i].alloc_size = 0; + 
ainfo[i].allow_spill = 0; assert_true(ainfo[i].mb_id != 0); alloc_bucket_to_full(umm, &ainfo[i], checkpoint_fn, &arg->ctx.tc_po_hdl); @@ -1592,6 +1629,7 @@ wal_mb_emb_evicts_emb(void **state) ainfo[0].num_allocs = 0; ainfo[0].start_umoff = UMOFF_NULL; ainfo[0].alloc_size = 0; + ainfo[0].allow_spill = 0; alloc_bucket_to_full(umm, &ainfo[0], checkpoint_fn, &arg->ctx.tc_po_hdl); /* @@ -1620,6 +1658,7 @@ wal_mb_emb_evicts_emb(void **state) ainfo[i].num_allocs = 0; ainfo[i].start_umoff = UMOFF_NULL; ainfo[i].alloc_size = 0; + ainfo[i].allow_spill = 0; assert_true(ainfo[i].mb_id != 0); alloc_bucket_to_full(umm, &ainfo[i], checkpoint_fn, &arg->ctx.tc_po_hdl); } @@ -1658,6 +1697,7 @@ wal_mb_nemb_evicts_emb(void **state) ainfo[i].num_allocs = 0; ainfo[i].start_umoff = UMOFF_NULL; ainfo[i].alloc_size = 0; + ainfo[i].allow_spill = 0; assert_true(ainfo[i].mb_id != 0); alloc_bucket_to_full(umm, &ainfo[i], checkpoint_fn, &arg->ctx.tc_po_hdl); } @@ -1669,6 +1709,7 @@ wal_mb_nemb_evicts_emb(void **state) ainfo[0].num_allocs = 0; ainfo[0].start_umoff = UMOFF_NULL; ainfo[0].alloc_size = 0; + ainfo[0].allow_spill = 0; alloc_bucket_to_full(umm, &ainfo[0], checkpoint_fn, &arg->ctx.tc_po_hdl); /* @@ -1706,6 +1747,7 @@ wal_mb_nemb_pct(void **state) ainfo[0].num_allocs = 0; ainfo[0].start_umoff = UMOFF_NULL; ainfo[0].alloc_size = 2048; + ainfo[0].allow_spill = 0; alloc_bucket_to_full(umm, &ainfo[0], checkpoint_fn, &arg->ctx.tc_po_hdl); rc = umempobj_get_mbusage(umm->umm_pool, 0, &cur_allocated, &maxsz); assert_true(rc == 0); @@ -1741,6 +1783,7 @@ wal_mb_nemb_pct(void **state) ainfo[i].num_allocs = 0; ainfo[i].start_umoff = UMOFF_NULL; ainfo[i].alloc_size = 2048; + ainfo[i].allow_spill = 0; assert_true(ainfo[i].mb_id != 0); alloc_bucket_to_full(umm, &ainfo[i], checkpoint_fn, &arg->ctx.tc_po_hdl); } @@ -1772,6 +1815,7 @@ nemb_unused(void **state) ainfo[0].num_allocs = 0; ainfo[0].start_umoff = UMOFF_NULL; ainfo[0].alloc_size = 512 * 1024; + ainfo[0].allow_spill = 0; alloc_bucket_to_full(umm, &ainfo[0], checkpoint_fn, &arg->ctx.tc_po_hdl); rc = umempobj_get_mbusage(umm->umm_pool, 0, &nemb_full_size, &maxsz); assert_true(rc == 0); @@ -1795,11 +1839,13 @@ nemb_unused(void **state) ainfo[i].num_allocs = 0; ainfo[i].start_umoff = UMOFF_NULL; ainfo[i].alloc_size = 512 * 1024; + ainfo[i].allow_spill = 0; assert_true(ainfo[i].mb_id != 0); alloc_bucket_to_full(umm, &ainfo[i], checkpoint_fn, &arg->ctx.tc_po_hdl); } /* Make sure that we can only create MDTEST_MAX_EMB_CNT evictable MBs */ id = umem_allot_mb_evictable(umm, 0); + print_message("Got id %d\n", id); for (j = 1; j <= MDTEST_MAX_EMB_CNT; j++) { if (id == ainfo[j].mb_id) break; @@ -1810,7 +1856,7 @@ nemb_unused(void **state) if (umem_cache_offisloaded(&umm->umm_pool->up_store, ainfo[j].start_umoff)) found++; print_message("phase3: Found %d evictable MBs loaded\n", found); - D_ASSERT(found > (MDTEST_MB_VOS_CNT - MDTEST_MAX_NEMB_CNT)); + D_ASSERT(found > (MDTEST_MB_VOS_CNT - MDTEST_MAX_NEMB_CNT - MDTEST_MIN_SOEMB_CNT)); for (i = 1; i <= MDTEST_MAX_EMB_CNT; i++) free_bucket_by_pct(umm, &ainfo[i], 100, checkpoint_fn, &arg->ctx.tc_po_hdl); @@ -1833,7 +1879,7 @@ nemb_unused(void **state) cont = vos_hdl2cont(arg->ctx.tc_co_hdl); umm = vos_cont2umm(cont); - /* Force GC */ + /* After a restart gc may not recalim all of the free MBs. 
Hence rerun it multiple times */ umem_heap_gc(umm); umem_heap_gc(umm); @@ -1859,7 +1905,7 @@ nemb_unused(void **state) found++; print_message("phase7: Found %d evictable MBs loaded\n", found); - D_ASSERT(found > (MDTEST_MB_VOS_CNT - MDTEST_MAX_NEMB_CNT)); + D_ASSERT(found > (MDTEST_MB_VOS_CNT - MDTEST_MAX_NEMB_CNT - MDTEST_MIN_SOEMB_CNT)); alloc_bucket_to_full(umm, &ainfo[0], checkpoint_fn, &arg->ctx.tc_po_hdl); rc = umempobj_get_mbusage(umm->umm_pool, 0, &cur_allocated, &maxsz); @@ -1868,6 +1914,264 @@ nemb_unused(void **state) assert_true(nemb_full_size == cur_allocated); } +static void +free_bucket_soembtest(struct umem_instance *umm, struct bucket_alloc_info *ainfo, int hint, + void (*chkpt_fn)(void *arg), void *arg) +{ + int num_free; + umem_off_t umoff, *ptr, next_umoff, *prev_ptr, baseoffset; + struct umem_pin_handle *p_hdl; + struct umem_cache_range rg = {0}; + int rc, pg_id, npg_id; + int free_incr = hint / 10 + 1; + int tfree = 0; + + if (UMOFF_IS_NULL(ainfo->start_umoff)) + return; + print_message("SOEMB Free BEFORE: Bucket %d, start off %lu num_allocation %d\n", + ainfo->mb_id, ainfo->start_umoff, ainfo->num_allocs); + + rg.cr_off = umem_get_mb_base_offset(umm, ainfo->mb_id); + rg.cr_size = 1; + rc = umem_cache_pin(&umm->umm_pool->up_store, &rg, 1, 0, &p_hdl); + assert_true(rc == 0); + + baseoffset = umem_get_mb_base_offset(umm, 0); + umoff = ainfo->start_umoff; + num_free = tfree; + pg_id = (umoff - baseoffset) / MDTEST_MB_SIZE; + prev_ptr = &ainfo->start_umoff; + while (ainfo->num_allocs) { + ptr = (umem_off_t *)umem_off2ptr(umm, umoff); + next_umoff = *ptr; + if (num_free && pg_id) { + umem_tx_begin(umm, NULL); + umem_free(umm, umoff); + if (prev_ptr != &ainfo->start_umoff) + umem_tx_add_ptr(umm, prev_ptr, sizeof(umoff)); + *prev_ptr = next_umoff; + umem_tx_commit(umm); + num_free--; + if (((ainfo->num_allocs-- % CHECKPOINT_FREQ) == 0) && (chkpt_fn != NULL)) + chkpt_fn(arg); + } else + prev_ptr = ptr; + umoff = next_umoff; + if (UMOFF_IS_NULL(umoff)) + break; + npg_id = (umoff - baseoffset) / MDTEST_MB_SIZE; + if (npg_id != pg_id) { + print_message("Freed %d blocks from page %d\n", tfree - num_free, pg_id); + pg_id = npg_id; + if (pg_id) + tfree += free_incr; + num_free = tfree; + } + } + if (chkpt_fn != NULL) + chkpt_fn(arg); + umem_cache_unpin(&umm->umm_pool->up_store, p_hdl); + print_message("SOEMB Free AFTER: Bucket %d, start off %lu num_allocation %d\n", + ainfo->mb_id, ainfo->start_umoff, ainfo->num_allocs); +} + +static void +soemb_test(void **state) +{ + struct io_test_args *arg = *state; + struct vos_container *cont; + struct umem_instance *umm; + int rc, soemb_num_allocs; + struct bucket_alloc_info ainfo[MDTEST_MB_CNT + 1]; + daos_size_t maxsz, cur_allocated, cur_allocated1, pg_alloc_sz; + daos_size_t cur_allocated2; + + cont = vos_hdl2cont(arg->ctx.tc_co_hdl); + umm = vos_cont2umm(cont); + + /* + * Obtain number of allocation possible per bucket. + */ + ainfo[2].mb_id = umem_allot_mb_evictable(umm, 0); + ainfo[2].num_allocs = 0; + ainfo[2].start_umoff = UMOFF_NULL; + ainfo[2].alloc_size = 512 * 1024; + ainfo[2].allow_spill = 0; + assert_true(ainfo[2].mb_id != 0); + alloc_bucket_to_full(umm, &ainfo[2], checkpoint_fn, &arg->ctx.tc_po_hdl); + rc = umempobj_get_mbusage(umm->umm_pool, ainfo[2].mb_id, &pg_alloc_sz, &maxsz); + assert_true(rc == 0); + print_message("non-evictable MBs max_size = %lu pg_alloc_sz = %lu\n", maxsz, pg_alloc_sz); + + /* + * Validate that the allocation to default bucket does not spill over to soe buckets. 
+ */ + print_message("Stage 1\n"); + ainfo[0].mb_id = 0; + ainfo[0].num_allocs = 0; + ainfo[0].start_umoff = UMOFF_NULL; + ainfo[0].alloc_size = 512 * 1024; + ainfo[0].allow_spill = 0; + alloc_bucket_to_full(umm, &ainfo[0], checkpoint_fn, &arg->ctx.tc_po_hdl); + rc = umempobj_get_mbusage(umm->umm_pool, 0, &cur_allocated, &maxsz); + print_message("non-evictable MBs max_size = %lu current utilization = %lu\n", maxsz, + cur_allocated); + assert_true(rc == 0); + assert_true(cur_allocated > (pg_alloc_sz * MDTEST_MAX_NEMB_CNT)); + + /* + * After a spill to soe buckets, the total utilization should be near full. + */ + print_message("Stage 2\n"); + ainfo[1].mb_id = umem_allot_mb_evictable(umm, 0); + ainfo[1].num_allocs = 0; + ainfo[1].start_umoff = UMOFF_NULL; + ainfo[1].alloc_size = 512 * 1024; + ainfo[1].allow_spill = 1; + assert_true(ainfo[1].mb_id != 0); + alloc_bucket_to_full(umm, &ainfo[1], checkpoint_fn, &arg->ctx.tc_po_hdl); + rc = umempobj_get_mbusage(umm->umm_pool, 0, &cur_allocated, &maxsz); + assert_true(rc == 0); + print_message("non-evictable MBs max_size = %lu current utilization = %lu\n", maxsz, + cur_allocated); + assert_true(cur_allocated > (pg_alloc_sz * (MDTEST_MAX_NEMB_CNT + MDTEST_MIN_SOEMB_CNT))); + + /* + * Free the allocations in default bucket. Now further allocation in evictable bucket + * should cause soe buckets to extend stealing the freed non-evictable buckets. + */ + print_message("Stage 3\n"); + free_bucket_by_pct(umm, &ainfo[0], 100, checkpoint_fn, &arg->ctx.tc_po_hdl); + print_message("Triggering gc\n"); + umem_heap_gc(umm); + umem_heap_gc(umm); + umem_heap_gc(umm); + soemb_num_allocs = ainfo[1].num_allocs; + alloc_bucket_to_full(umm, &ainfo[1], checkpoint_fn, &arg->ctx.tc_po_hdl); + rc = umempobj_get_mbusage(umm->umm_pool, 0, &cur_allocated1, &maxsz); + assert_true(rc == 0); + print_message("non-evictable MBs max_size = %lu current utilization = %lu\n", maxsz, + cur_allocated1); + assert_true(cur_allocated == cur_allocated1); + assert_true(soemb_num_allocs < ainfo[1].num_allocs); + + /* + * Do a 50% free from the evictable + soe buckets, trigger gc and do allocation in + * default bucket. The non-evictable buckets should extend using the freed soe buckets. 
+ */ + print_message("Stage 4\n"); + free_bucket_by_pct(umm, &ainfo[1], 50, checkpoint_fn, &arg->ctx.tc_po_hdl); + rc = umempobj_get_mbusage(umm->umm_pool, 0, &cur_allocated1, &maxsz); + assert_true(rc == 0); + print_message("non-evictable MBs max_size = %lu current utilization = %lu\n", maxsz, + cur_allocated1); + assert_true(maxsz == MDTEST_VOS_SIZE * 80 / 100); + /* 50% includes the evictable MB, hence cur_allocated1 is not exactly cur_allocated/2 */ + assert_true(cur_allocated1 < (cur_allocated / 2 + MDTEST_MB_SIZE)); + + print_message("Triggering gc\n"); + umem_heap_gc(umm); + umem_heap_gc(umm); + umem_heap_gc(umm); + + /* Now the MBs marked unused should be available for NEMB allocation */ + ainfo[0].mb_id = 0; + ainfo[0].num_allocs = 0; + ainfo[0].start_umoff = UMOFF_NULL; + ainfo[0].alloc_size = 512 * 1024; + ainfo[0].allow_spill = 0; + alloc_bucket_to_full(umm, &ainfo[0], checkpoint_fn, &arg->ctx.tc_po_hdl); + + rc = umempobj_get_mbusage(umm->umm_pool, 0, &cur_allocated2, &maxsz); + assert_true(rc == 0); + print_message("non-evictable MBs max_size = %lu current utilization = %lu\n", maxsz, + cur_allocated2); + assert_true(maxsz == MDTEST_VOS_SIZE * 80 / 100); + assert_true(cur_allocated2 > (cur_allocated1 + 2 * pg_alloc_sz)); + + /* + * Restart the pool and check whether all of the soe buckets are accessible + * without explicit pin. + */ + print_message("Stage 5\n"); + wal_pool_refill(arg); + cont = vos_hdl2cont(arg->ctx.tc_co_hdl); + umm = vos_cont2umm(cont); + traverse_bucket(umm, &ainfo[1]); + + /* + * Check whether gc of soe buckets works post restart. + */ + print_message("Stage 6\n"); + free_bucket_by_pct(umm, &ainfo[1], 100, checkpoint_fn, &arg->ctx.tc_po_hdl); + rc = umempobj_get_mbusage(umm->umm_pool, 0, &cur_allocated1, &maxsz); + assert_true(rc == 0); + print_message("non-evictable MBs max_size = %lu current utilization = %lu\n", maxsz, + cur_allocated1); + assert_true(maxsz == MDTEST_VOS_SIZE * 80 / 100); + + print_message("Triggering gc\n"); + umem_heap_gc(umm); + umem_heap_gc(umm); + umem_heap_gc(umm); + + alloc_bucket_to_full(umm, &ainfo[0], checkpoint_fn, &arg->ctx.tc_po_hdl); + + rc = umempobj_get_mbusage(umm->umm_pool, 0, &cur_allocated2, &maxsz); + assert_true(rc == 0); + print_message("non-evictable MBs max_size = %lu current utilization = %lu\n", maxsz, + cur_allocated2); + assert_true(maxsz == MDTEST_VOS_SIZE * 80 / 100); + assert_true(cur_allocated2 > (pg_alloc_sz * MDTEST_MAX_NEMB_CNT)); + + /* + * Similarly check whether freed non-evictable buckets will be + * reused as soemb post gc. 
+ */ + print_message("Stage 7\n"); + free_bucket_by_pct(umm, &ainfo[0], 100, checkpoint_fn, &arg->ctx.tc_po_hdl); + rc = umempobj_get_mbusage(umm->umm_pool, 0, &cur_allocated1, &maxsz); + assert_true(rc == 0); + print_message("non-evictable MBs max_size = %lu current utilization = %lu\n", maxsz, + cur_allocated1); + assert_true(maxsz == MDTEST_VOS_SIZE * 80 / 100); + assert_true(cur_allocated1 < MDTEST_MB_SIZE); + + print_message("Triggering gc\n"); + umem_heap_gc(umm); + umem_heap_gc(umm); + umem_heap_gc(umm); + + alloc_bucket_to_full(umm, &ainfo[1], checkpoint_fn, &arg->ctx.tc_po_hdl); + rc = umempobj_get_mbusage(umm->umm_pool, 0, &cur_allocated2, &maxsz); + assert_true(rc == 0); + print_message("non-evictable MBs max_size = %lu current utilization = %lu\n", maxsz, + cur_allocated2); + assert_true(maxsz == MDTEST_VOS_SIZE * 80 / 100); + assert_true(cur_allocated2 > (pg_alloc_sz * MDTEST_MAX_NEMB_CNT)); + + /* + * Selectively varying free blocks from different soe buckets and do + * reallocation post gc. Validate that all of the soe buckets are reused. + */ + print_message("Stage 8\n"); + free_bucket_soembtest(umm, &ainfo[1], ainfo[2].num_allocs, checkpoint_fn, + &arg->ctx.tc_po_hdl); + + print_message("Triggering gc\n"); + umem_heap_gc(umm); + umem_heap_gc(umm); + umem_heap_gc(umm); + + alloc_bucket_to_full(umm, &ainfo[1], checkpoint_fn, &arg->ctx.tc_po_hdl); + rc = umempobj_get_mbusage(umm->umm_pool, 0, &cur_allocated2, &maxsz); + assert_true(rc == 0); + print_message("non-evictable MBs max_size = %lu current utilization = %lu\n", maxsz, + cur_allocated2); + assert_true(maxsz == MDTEST_VOS_SIZE * 80 / 100); + assert_true(cur_allocated2 > (pg_alloc_sz * MDTEST_MAX_NEMB_CNT)); +} + static int umoff_in_freelist(umem_off_t *free_list, int cnt, umem_off_t umoff, bool clear) { @@ -1910,6 +2214,7 @@ wal_umempobj_block_reuse_internal(void **state, int restart) ainfo[0].num_allocs = 0; ainfo[0].start_umoff = UMOFF_NULL; ainfo[0].alloc_size = 512; + ainfo[0].allow_spill = 0; alloc_bucket_to_full(umm, &ainfo[0], checkpoint_fn, &arg->ctx.tc_po_hdl); /* Allocate from Evictable Buckets. */ @@ -1919,6 +2224,7 @@ wal_umempobj_block_reuse_internal(void **state, int restart) ainfo[i].num_allocs = 0; ainfo[i].start_umoff = UMOFF_NULL; ainfo[i].alloc_size = 512; + ainfo[i].allow_spill = 0; assert_true(ainfo[i].mb_id != 0); alloc_bucket_to_full(umm, &ainfo[i], checkpoint_fn, &arg->ctx.tc_po_hdl); } @@ -2022,6 +2328,7 @@ wal_umempobj_block_reuse_internal(void **state, int restart) /* Allocate from E Buckets and it should reuse the previous freed blocks */ for (i = 1; i <= MDTEST_MAX_EMB_CNT; i++) { + print_message("Allocating from bucket %d\n", ainfo[i].mb_id); rg.cr_off = umem_get_mb_base_offset(umm, ainfo[i].mb_id); rg.cr_size = 1; rc = umem_cache_pin(&umm->umm_pool->up_store, &rg, 1, 0, &p_hdl); @@ -2121,13 +2428,14 @@ wal_umempobj_mbusage_test(void **state) cont = vos_hdl2cont(arg->ctx.tc_co_hdl); umm = vos_cont2umm(cont); - maxsz_exp = MDTEST_MAX_NEMB_CNT * MDTEST_MB_SIZE; + maxsz_exp = (MDTEST_MAX_NEMB_CNT + MDTEST_MIN_SOEMB_CNT) * MDTEST_MB_SIZE; /* Allocate from NE Buckets. 
It should use 80% 360M i.e, 16 buckets */ ainfo[0].mb_id = 0; ainfo[0].num_allocs = 0; ainfo[0].start_umoff = UMOFF_NULL; ainfo[0].alloc_size = 512; + ainfo[0].allow_spill = 0; alloc_bucket_to_full(umm, &ainfo[0], checkpoint_fn, &arg->ctx.tc_po_hdl); /* Create an MB and fill it with allocs */ @@ -2135,6 +2443,7 @@ wal_umempobj_mbusage_test(void **state) ainfo[1].num_allocs = 0; ainfo[1].start_umoff = UMOFF_NULL; ainfo[1].alloc_size = 512; + ainfo[1].allow_spill = 0; assert_true(ainfo[1].mb_id != 0); alloc_bucket_to_full(umm, &ainfo[1], checkpoint_fn, &arg->ctx.tc_po_hdl); free_bucket_by_pct(umm, &ainfo[1], 50, checkpoint_fn, &arg->ctx.tc_po_hdl); @@ -2314,7 +2623,7 @@ p2_basic_test(void **state) daos_epoch_t epoch = 1; daos_size_t io_size = 512; struct vos_object *obj; - uint32_t bkt_id = 1, missed, loaded; + uint32_t bkt_id = 1 + MDTEST_MIN_SOEMB_CNT, missed, loaded; uint64_t used[2], ne_init; int rc; @@ -2635,6 +2944,7 @@ static const struct CMUnitTest wal_MB_tests[] = { {"WAL39: P2 fill evictable buckets", p2_fill_test, setup_mb_io, teardown_mb_io}, {"WAL40: nemb pct test", wal_mb_nemb_pct, setup_mb_io_nembpct, teardown_mb_io_nembpct}, {"WAL41: nemb unused test", nemb_unused, setup_mb_io, teardown_mb_io}, + {"WAL42: soemb test", soemb_test, setup_mb_io, teardown_mb_io}, }; int @@ -2652,14 +2962,12 @@ run_wal_tests(const char *cfg) dts_create_config(test_name, "WAL Pool and container tests %s", cfg); D_PRINT("Running %s\n", test_name); - rc = cmocka_run_group_tests_name(test_name, wal_tests, setup_wal_test, - teardown_wal_test); + rc = cmocka_run_group_tests_name(test_name, wal_tests, setup_wal_test, teardown_wal_test); dts_create_config(test_name, "WAL Basic SV and EV IO tests %s", cfg); D_PRINT("Running %s\n", test_name); otype = 0; - rc += cmocka_run_group_tests_name(test_name, wal_kv_basic_tests, - setup_wal_io, teardown_io); + rc += cmocka_run_group_tests_name(test_name, wal_kv_basic_tests, setup_wal_io, teardown_io); for (i = 0; i < (sizeof(type_list) / sizeof(int)); i++) { otype = type_list[i]; @@ -2675,13 +2983,13 @@ run_wal_tests(const char *cfg) cfg); test_name[3] = '1'; D_PRINT("Running %s\n", test_name); - rc += cmocka_run_group_tests_name(test_name, wal_io_tests, - setup_wal_io, teardown_io); + rc += + cmocka_run_group_tests_name(test_name, wal_io_tests, setup_wal_io, teardown_io); if (otype == DAOS_OT_MULTI_UINT64) { test_name[3] = '2'; D_PRINT("Running %s\n", test_name); - rc += cmocka_run_group_tests_name(test_name, wal_io_int_tests, - setup_wal_io, teardown_io); + rc += cmocka_run_group_tests_name(test_name, wal_io_int_tests, setup_wal_io, + teardown_io); } }
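
The free_bucket_soembtest() helper above walks the per-bucket allocation chain: each test allocation stores the offset of the next allocation in its first word, rooted at ainfo->start_umoff. A minimal sketch of the unlink-and-free step, assuming prev_ptr points at a word inside the pool; the real helper deliberately skips umem_tx_add_ptr() when prev_ptr is &ainfo->start_umoff, since that root lives in a DRAM structure and must not be registered with the transaction. unlink_and_free() is a hypothetical name used only here.

        static void
        unlink_and_free(struct umem_instance *umm, umem_off_t *prev_ptr, umem_off_t umoff)
        {
                /* The next-pointer is stored in the first word of the allocation. */
                umem_off_t next = *(umem_off_t *)umem_off2ptr(umm, umoff);

                umem_tx_begin(umm, NULL);
                umem_free(umm, umoff);
                /* prev_ptr is assumed to live inside the pool; see the note above. */
                umem_tx_add_ptr(umm, prev_ptr, sizeof(*prev_ptr));
                *prev_ptr = next;
                umem_tx_commit(umm);
        }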
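
free_bucket_soembtest() varies how many blocks it frees per spill-over bucket by mapping each offset back to the bucket it falls in. A minimal sketch of that mapping, assuming buckets are laid out contiguously at MDTEST_MB_SIZE intervals starting from umem_get_mb_base_offset(umm, 0), exactly as the pg_id/npg_id computation in the helper; mb_index_of() is a hypothetical name used only for illustration.

        static inline int
        mb_index_of(struct umem_instance *umm, umem_off_t umoff)
        {
                umem_off_t base = umem_get_mb_base_offset(umm, 0);

                /* Same computation as pg_id/npg_id in free_bucket_soembtest(). */
                return (int)((umoff - base) / MDTEST_MB_SIZE);
        }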
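
Both free_bucket_soembtest() and the block-reuse path pin an evictable bucket into the memory cache before dereferencing offsets inside it (Stage 5 verifies that the spill-over buckets remain accessible without an explicit pin after a restart). A minimal sketch of that pin/unpin pattern, using the same umem_cache_pin()/umem_cache_unpin() arguments as the test; with_bucket_pinned() is a hypothetical wrapper.

        static int
        with_bucket_pinned(struct umem_instance *umm, uint32_t mb_id,
                           void (*fn)(struct umem_instance *, void *), void *arg)
        {
                struct umem_cache_range  rg = {0};
                struct umem_pin_handle  *p_hdl;
                int                      rc;

                /* Pin one bucket worth of pages starting at the bucket base. */
                rg.cr_off  = umem_get_mb_base_offset(umm, mb_id);
                rg.cr_size = 1;
                rc = umem_cache_pin(&umm->umm_pool->up_store, &rg, 1, 0, &p_hdl);
                if (rc != 0)
                        return rc;

                fn(umm, arg);

                umem_cache_unpin(&umm->umm_pool->up_store, p_hdl);
                return 0;
        }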
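
Several stages of soemb_test() assert on the aggregate usage reported for mb_id 0; the Stage 2 check (cur_allocated > pg_alloc_sz * (MDTEST_MAX_NEMB_CNT + MDTEST_MIN_SOEMB_CNT)) implies that this query also covers the spill-over evictable buckets. A minimal sketch of such an assertion helper, assuming the umempobj_get_mbusage() call pattern used throughout the test; check_nemb_usage() is a hypothetical name.

        static void
        check_nemb_usage(struct umem_instance *umm, daos_size_t min_expected)
        {
                daos_size_t cur = 0, maxsz = 0;
                int         rc;

                rc = umempobj_get_mbusage(umm->umm_pool, 0, &cur, &maxsz);
                assert_true(rc == 0);
                /* The tests expect maxsz to stay at 80% of MDTEST_VOS_SIZE. */
                assert_true(maxsz == MDTEST_VOS_SIZE * 80 / 100);
                assert_true(cur >= min_expected);
        }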
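
Stages 3, 4, 6, 7, and 8 all follow the same reclaim cycle: free part of a bucket, run garbage collection several times (the comment at the top of this hunk notes a single pass is not enough, hence the repeated calls), then fill the opposite bucket type and check that the freed buckets were handed over. A minimal sketch of that cycle, reusing the helpers already defined in this file; reclaim_and_refill() is a hypothetical name.

        static void
        reclaim_and_refill(struct umem_instance *umm, struct bucket_alloc_info *ai,
                           void (*chkpt_fn)(void *), void *arg)
        {
                /* One GC pass may not release everything, so run it a few times. */
                umem_heap_gc(umm);
                umem_heap_gc(umm);
                umem_heap_gc(umm);

                /* Refill the target bucket using the helper defined earlier in this file. */
                alloc_bucket_to_full(umm, ai, chkpt_fn, arg);
        }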