From 54c018a7dd87a9573c3a7a8d9fb901afe042c642 Mon Sep 17 00:00:00 2001
From: sherintg
Date: Thu, 31 Oct 2024 13:32:13 +0530
Subject: [PATCH] DAOS-16763 common: Tunable to control max NEMB (#15422)

A new tunable, DAOS_MD_ON_SSD_NEMB_PCT, is introduced to define the
percentage of the memory cache that non-evictable memory buckets can
expand to. This tunable is read during pool creation and persisted,
ensuring that each time the pool is reopened, it retains the value set
at creation.

Signed-off-by: Sherin T George
---
 src/common/dav_v2/dav_iface.c   | 54 +++++++++++----------
 src/common/dav_v2/heap.c        | 50 +++++++++++++++++--
 src/common/dav_v2/heap.h        |  4 +-
 src/common/dav_v2/heap_layout.h |  3 +-
 src/vos/tests/vts_wal.c         | 86 +++++++++++++++++++++++++++++++--
 5 files changed, 161 insertions(+), 36 deletions(-)

diff --git a/src/common/dav_v2/dav_iface.c b/src/common/dav_v2/dav_iface.c
index 63b8e86d556..ede29fafc56 100644
--- a/src/common/dav_v2/dav_iface.c
+++ b/src/common/dav_v2/dav_iface.c
@@ -70,7 +70,7 @@ dav_obj_open_internal(int fd, int flags, size_t scm_sz, const char *path, struct
 	struct heap_zone_limits hzl;
 	struct zone            *z0;
 
-	hzl = heap_get_zone_limits(store->stor_size, scm_sz);
+	hzl = heap_get_zone_limits(store->stor_size, scm_sz, 100);
 
 	if (hzl.nzones_heap == 0) {
 		ERR("Insufficient heap size.");
@@ -78,7 +78,7 @@ dav_obj_open_internal(int fd, int flags, size_t scm_sz, const char *path, struct
 		return NULL;
 	}
 
-	if (hzl.nzones_ne_max < 2) {
+	if ((hzl.nzones_cache < 2) && (hzl.nzones_heap > hzl.nzones_cache)) {
 		ERR("Insufficient scm size.");
 		errno = EINVAL;
 		return NULL;
@@ -117,24 +117,15 @@ dav_obj_open_internal(int fd, int flags, size_t scm_sz, const char *path, struct
 		goto out1;
 	}
 
-	rc = umem_cache_alloc(store, ZONE_MAX_SIZE, hzl.nzones_heap, hzl.nzones_cache,
-			      hzl.nzones_ne_max, 4096, mmap_base, is_zone_evictable,
-			      dav_uc_callback, hdl);
-	if (rc != 0) {
-		D_ERROR("Could not allocate page cache: rc=" DF_RC "\n", DP_RC(rc));
-		err = daos_der2errno(rc);
-		goto out1;
-	}
-
-	D_STRNDUP(hdl->do_path, path, strlen(path));
-
 	if (flags & DAV_HEAP_INIT) {
 		rc = heap_init(mmap_base, scm_sz, store);
 		if (rc) {
 			err = errno;
-			goto out2;
+			goto out1;
 		}
 	}
+
+	D_STRNDUP(hdl->do_path, path, strlen(path));
 
 	D_ALLOC_PTR(hdl->do_heap);
 	if (hdl->do_heap == NULL) {
 		err = ENOMEM;
@@ -155,37 +146,46 @@ dav_obj_open_internal(int fd, int flags, size_t scm_sz, const char *path, struct
 	heap_set_root_ptrs(hdl->do_heap, &hdl->do_root_offsetp, &hdl->do_root_sizep);
 	heap_set_stats_ptr(hdl->do_heap, &hdl->do_stats->persistent);
 
+	rc = umem_cache_alloc(store, ZONE_MAX_SIZE, hzl.nzones_heap, hzl.nzones_cache,
+			      heap_get_max_nemb(hdl->do_heap), 4096, mmap_base, is_zone_evictable,
+			      dav_uc_callback, hdl);
+	if (rc != 0) {
+		D_ERROR("Could not allocate page cache: rc=" DF_RC "\n", DP_RC(rc));
+		err = daos_der2errno(rc);
+		goto out3;
+	}
+
 	if (!(flags & DAV_HEAP_INIT)) {
 		rc = heap_zone_load(hdl->do_heap, 0);
 		if (rc) {
 			err = rc;
-			goto out3;
+			goto out4;
 		}
 		D_ASSERT(store != NULL);
 		rc = hdl->do_store->stor_ops->so_wal_replay(hdl->do_store, dav_wal_replay_cb, hdl);
 		if (rc) {
 			err = daos_der2errno(rc);
-			goto out3;
+			goto out4;
 		}
 	}
 
 	rc = dav_create_clogs(hdl);
 	if (rc) {
 		err = rc;
-		goto out3;
+		goto out4;
 	}
 
 	rc = lw_tx_begin(hdl);
 	if (rc) {
 		D_ERROR("lw_tx_begin failed with err %d\n", rc);
 		err = ENOMEM;
-		goto out3;
+		goto out5;
 	}
 	rc = heap_ensure_zone0_initialized(hdl->do_heap);
 	if (rc) {
 		lw_tx_end(hdl, NULL);
 		D_ERROR("Failed to initialize zone0, rc = %d", daos_errno2der(rc));
-		goto out3;
+		goto out5;
 	}
 	lw_tx_end(hdl, NULL);
@@ -198,14 +198,14 @@ dav_obj_open_internal(int fd, int flags, size_t scm_sz, const char *path, struct
 		if (rc) {
 			D_ERROR("Failed to update mbrt with zinfo errno = %d", rc);
 			err = rc;
-			goto out3;
+			goto out5;
 		}
 		rc = heap_load_nonevictable_zones(hdl->do_heap);
 		if (rc) {
 			D_ERROR("Failed to load required zones during boot, errno= %d", rc);
 			err = rc;
-			goto out3;
+			goto out5;
 		}
 	} else {
 		D_ASSERT(z0->header.zone0_zinfo_size == 0);
@@ -213,20 +213,20 @@ dav_obj_open_internal(int fd, int flags, size_t scm_sz, const char *path, struct
 		if (rc) {
 			D_ERROR("lw_tx_begin failed with err %d\n", rc);
 			err = ENOMEM;
-			goto out3;
+			goto out5;
 		}
 		rc = obj_realloc(hdl, &z0->header.zone0_zinfo_off, &z0->header.zone0_zinfo_size,
 				 heap_zinfo_get_size(hzl.nzones_heap));
 		if (rc != 0) {
 			lw_tx_end(hdl, NULL);
 			D_ERROR("Failed to setup zinfo");
-			goto out3;
+			goto out5;
 		}
 		rc = heap_update_mbrt_zinfo(hdl->do_heap, true);
 		if (rc) {
 			D_ERROR("Failed to update mbrt with zinfo errno = %d", rc);
 			err = rc;
-			goto out3;
+			goto out5;
 		}
 		lw_tx_end(hdl, NULL);
 	}
@@ -240,7 +240,10 @@ dav_obj_open_internal(int fd, int flags, size_t scm_sz, const char *path, struct
 	hdl->do_booted = 1;
 
 	return hdl;
-
+out5:
+	dav_destroy_clogs(hdl);
+out4:
+	umem_cache_free(hdl->do_store);
 out3:
 	heap_cleanup(hdl->do_heap);
 out2:
@@ -253,7 +256,6 @@ dav_obj_open_internal(int fd, int flags, size_t scm_sz, const char *path, struct
 		D_FREE(hdl->do_utx);
 	}
 	D_FREE(hdl->do_path);
-	umem_cache_free(hdl->do_store);
 out1:
 	D_FREE(hdl);
 out0:
diff --git a/src/common/dav_v2/heap.c b/src/common/dav_v2/heap.c
index 0a0fa71ea91..d730fed7bc4 100644
--- a/src/common/dav_v2/heap.c
+++ b/src/common/dav_v2/heap.c
@@ -25,6 +25,8 @@
 #include "alloc_class.h"
 #include "meta_io.h"
 
+#define HEAP_NEMB_PCT_DEFAULT 80
+
 static void
 heap_reclaim_zone_garbage(struct palloc_heap *heap, struct bucket *bucket, uint32_t zone_id);
 
@@ -96,6 +98,7 @@ struct heap_rt {
 	unsigned          zinfo_vec_size;
 	unsigned          mb_create_waiters;
 	unsigned          mb_pressure;
+	unsigned          nemb_pct;
 	void             *mb_create_wq;
 	struct zinfo_vec *zinfo_vec;
 	struct mbrt      *default_mb;
@@ -1397,7 +1400,8 @@ heap_create_alloc_class_buckets(struct palloc_heap *heap, struct alloc_class *c)
  * heap_write_header -- (internal) creates a clean header
  */
 static int
-heap_write_header(struct umem_store *store, size_t heap_size, size_t umem_cache_size)
+heap_write_header(struct umem_store *store, size_t heap_size, size_t umem_cache_size,
+		  uint32_t nemb_pct)
 {
 	struct heap_header *newhdr;
 	int                 rc;
@@ -1414,6 +1418,7 @@ heap_write_header(struct umem_store *store, size_t heap_size, size_t umem_cache_
 	newhdr->heap_hdr_size   = sizeof(struct heap_header);
 	newhdr->chunksize       = CHUNKSIZE;
 	newhdr->chunks_per_zone = MAX_CHUNK;
+	newhdr->nemb_pct        = (uint8_t)nemb_pct;
 	newhdr->checksum        = 0;
 
 	util_checksum(newhdr, sizeof(*newhdr), &newhdr->checksum, 1, 0);
@@ -1483,6 +1488,11 @@ heap_verify_header(struct heap_header *hdr, size_t heap_size, size_t cache_size)
 		return -1;
 	}
 
+	if (hdr->nemb_pct > 100) {
+		D_ERROR("nemb pct value (%d) in heap header is incorrect\n", hdr->nemb_pct);
+		return -1;
+	}
+
 	if ((hdr->heap_hdr_size != sizeof(struct heap_header)) || (hdr->chunksize != CHUNKSIZE) ||
 	    (hdr->chunks_per_zone != MAX_CHUNK)) {
 		D_ERROR("incompatible heap layout: hdr_sz=%lu, chunk_sz=%lu, max_chunks=%lu\n",
@@ -1558,6 +1568,7 @@ heap_boot(struct palloc_heap *heap, void *mmap_base, uint64_t heap_size, uint64_
 	struct heap_header     *newhdr;
 	int                     err;
 	struct heap_zone_limits hzl;
+	uint32_t                nemb_pct = HEAP_NEMB_PCT_DEFAULT;
 
 	D_ALLOC_PTR(newhdr);
 	if (!newhdr)
@@ -1575,6 +1586,8 @@ heap_boot(struct palloc_heap *heap, void *mmap_base, uint64_t heap_size, uint64_
 		D_FREE(newhdr);
 		return EINVAL;
 	}
+	if (newhdr->nemb_pct)
+		nemb_pct = newhdr->nemb_pct;
 	D_FREE(newhdr);
 
 	D_ALLOC_PTR_NZ(h);
@@ -1589,7 +1602,7 @@ heap_boot(struct palloc_heap *heap, void *mmap_base, uint64_t heap_size, uint64_
 		goto error_alloc_classes_new;
 	}
 
-	hzl = heap_get_zone_limits(heap_size, cache_size);
+	hzl = heap_get_zone_limits(heap_size, cache_size, nemb_pct);
 
 	h->nzones    = hzl.nzones_heap;
 	h->nzones_ne = hzl.nzones_ne_max;
@@ -1630,6 +1643,28 @@ heap_boot(struct palloc_heap *heap, void *mmap_base, uint64_t heap_size, uint64_
 	return err;
 }
 
+static unsigned int
+heap_get_nemb_pct()
+{
+	unsigned int nemb_pct;
+
+	nemb_pct = HEAP_NEMB_PCT_DEFAULT;
+	d_getenv_uint("DAOS_MD_ON_SSD_NEMB_PCT", &nemb_pct);
+	if ((nemb_pct > 100) || (nemb_pct == 0)) {
+		D_ERROR("Invalid value %d for tunable DAOS_MD_ON_SSD_NEMB_PCT", nemb_pct);
+		nemb_pct = HEAP_NEMB_PCT_DEFAULT;
+	}
+	D_INFO("DAOS_MD_ON_SSD_NEMB_PCT set to %d", nemb_pct);
+
+	return nemb_pct;
+}
+
+int
+heap_get_max_nemb(struct palloc_heap *heap)
+{
+	return heap->rt->nzones_ne;
+}
+
 /*
  * heap_init -- initializes the heap
  *
@@ -1639,6 +1674,7 @@ int
 heap_init(void *heap_start, uint64_t umem_cache_size, struct umem_store *store)
 {
 	int      nzones;
+	uint32_t nemb_pct = heap_get_nemb_pct();
 	uint64_t heap_size = store->stor_size;
 
 	if (heap_size < HEAP_MIN_SIZE)
@@ -1649,7 +1685,7 @@ heap_init(void *heap_start, uint64_t umem_cache_size, struct umem_store *store)
 	nzones = heap_max_zone(heap_size);
 	meta_clear_pages(store, sizeof(struct heap_header), 4096, ZONE_MAX_SIZE, nzones);
 
-	if (heap_write_header(store, heap_size, umem_cache_size))
+	if (heap_write_header(store, heap_size, umem_cache_size, nemb_pct))
 		return ENOMEM;
 
 	return 0;
@@ -1885,6 +1921,8 @@ heap_update_mbrt_zinfo(struct palloc_heap *heap, bool init)
 	heap->rt->zones_exhausted_ne = nemb_cnt;
 	heap->rt->zones_exhausted_e  = emb_cnt;
 
+	D_ASSERT(heap->rt->nzones_e >= heap->rt->zones_exhausted_e);
+	D_ASSERT(heap->rt->nzones_ne >= heap->rt->zones_exhausted_ne);
 	return 0;
 }
 
@@ -2058,10 +2096,12 @@ heap_foreach_object(struct palloc_heap *heap, object_callback cb, void *arg,
 }
 
 struct heap_zone_limits
-heap_get_zone_limits(uint64_t heap_size, uint64_t cache_size)
+heap_get_zone_limits(uint64_t heap_size, uint64_t cache_size, uint32_t nemb_pct)
 {
 	struct heap_zone_limits zd = {0};
 
+	D_ASSERT(nemb_pct <= 100);
+
 	if (heap_size < sizeof(struct heap_header))
 		zd.nzones_heap = 0;
 	else
@@ -2075,7 +2115,7 @@ heap_get_zone_limits(uint64_t heap_size, uint64_t cache_size)
 		if (zd.nzones_heap < (zd.nzones_cache + UMEM_CACHE_MIN_EVICTABLE_PAGES))
 			zd.nzones_ne_max = zd.nzones_cache - UMEM_CACHE_MIN_EVICTABLE_PAGES;
 		else
-			zd.nzones_ne_max = zd.nzones_cache * 8 / 10;
+			zd.nzones_ne_max = ((unsigned long)zd.nzones_cache * nemb_pct) / 100;
 		if (zd.nzones_cache < (zd.nzones_ne_max + UMEM_CACHE_MIN_EVICTABLE_PAGES))
 			zd.nzones_ne_max = zd.nzones_cache - UMEM_CACHE_MIN_EVICTABLE_PAGES;
 	} else
diff --git a/src/common/dav_v2/heap.h b/src/common/dav_v2/heap.h
index 8cddcbda152..8ceeff9a5cd 100644
--- a/src/common/dav_v2/heap.h
+++ b/src/common/dav_v2/heap.h
@@ -36,6 +36,8 @@ heap_cleanup(struct palloc_heap *heap);
 int
 heap_check(void *heap_start, uint64_t heap_size);
 int
+heap_get_max_nemb(struct palloc_heap *heap);
+int
 heap_create_alloc_class_buckets(struct palloc_heap *heap, struct alloc_class *c);
 int
 heap_mbrt_update_alloc_class_buckets(struct palloc_heap *heap, struct mbrt *mb,
@@ -148,5 +150,5 @@ uint32_t
 heap_off2mbid(struct palloc_heap *heap, uint64_t offset);
 
 struct heap_zone_limits
-heap_get_zone_limits(uint64_t heap_size, uint64_t cache_size);
+heap_get_zone_limits(uint64_t heap_size, uint64_t cache_size, uint32_t nemb_pct);
 #endif /* __DAOS_COMMON_HEAP_H */
diff --git a/src/common/dav_v2/heap_layout.h b/src/common/dav_v2/heap_layout.h
index de31044022f..fa65846921e 100644
--- a/src/common/dav_v2/heap_layout.h
+++ b/src/common/dav_v2/heap_layout.h
@@ -133,7 +133,8 @@ struct heap_header {
 	uint64_t heap_hdr_size;
 	uint64_t chunksize;
 	uint64_t chunks_per_zone;
-	uint8_t  reserved[4016];
+	uint8_t  nemb_pct;
+	uint8_t  reserved[4015];
 	uint64_t checksum;
 };
diff --git a/src/vos/tests/vts_wal.c b/src/vos/tests/vts_wal.c
index bd7f4926396..7506c7ffa79 100644
--- a/src/vos/tests/vts_wal.c
+++ b/src/vos/tests/vts_wal.c
@@ -629,9 +629,9 @@ setup_wal_io(void **state)
 
 static struct io_test_args test_args;
 
-#define MDTEST_META_BLOB_SIZE (256 * 1024 * 1024)
-#define MDTEST_VOS_SIZE (160 * 1024 * 1024)
-#define MDTEST_MB_SIZE (16 * 1024 * 1024)
+#define MDTEST_META_BLOB_SIZE (256 * 1024 * 1024UL)
+#define MDTEST_VOS_SIZE       (160 * 1024 * 1024UL)
+#define MDTEST_MB_SIZE        (16 * 1024 * 1024UL)
 #define MDTEST_MB_CNT (MDTEST_META_BLOB_SIZE / MDTEST_MB_SIZE)
 #define MDTEST_MB_VOS_CNT (MDTEST_VOS_SIZE / MDTEST_MB_SIZE)
 #define MDTEST_MAX_NEMB_CNT (MDTEST_MB_VOS_CNT * 8 / 10)
@@ -657,6 +657,20 @@ teardown_mb_io(void **state)
 	return 0;
 }
 
+static int
+setup_mb_io_nembpct(void **state)
+{
+	d_setenv("DAOS_MD_ON_SSD_NEMB_PCT", "40", true);
+	return setup_mb_io(state);
+}
+
+static int
+teardown_mb_io_nembpct(void **state)
+{
+	d_unsetenv("DAOS_MD_ON_SSD_NEMB_PCT");
+	return teardown_mb_io(state);
+}
+
 /* refill:true - perform the pool re-load and refill after every key update/punch */
 static int
 wal_update_and_fetch_dkey(struct io_test_args *arg, daos_epoch_t update_epoch,
@@ -1669,6 +1683,71 @@ wal_mb_nemb_evicts_emb(void **state)
 		free_bucket_by_pct(umm, &ainfo[j], 100, checkpoint_fn, &arg->ctx.tc_po_hdl);
 }
 
+static void
+wal_mb_nemb_pct(void **state)
+{
+	struct io_test_args     *arg = *state;
+	struct vos_container    *cont;
+	struct umem_instance    *umm;
+	int                      i, j, rc, found = 0;
+	struct bucket_alloc_info ainfo[MDTEST_MB_CNT + 1];
+	daos_size_t              maxsz, cur_allocated1, cur_allocated;
+
+	cont = vos_hdl2cont(arg->ctx.tc_co_hdl);
+	umm  = vos_cont2umm(cont);
+
+	/*
+	 * The setup for this test would have set environment variable
+	 * DAOS_MD_ON_SSD_NEMB_PCT to 40 before creating the pool.
+	 */
+	ainfo[0].mb_id       = 0;
+	ainfo[0].num_allocs  = 0;
+	ainfo[0].start_umoff = UMOFF_NULL;
+	ainfo[0].alloc_size  = 2048;
+	alloc_bucket_to_full(umm, &ainfo[0], checkpoint_fn, &arg->ctx.tc_po_hdl);
+	rc = umempobj_get_mbusage(umm->umm_pool, 0, &cur_allocated, &maxsz);
+	assert_true(rc == 0);
+	print_message("nemb space utilization is %lu max is %lu\n", cur_allocated, maxsz);
+	assert_true(maxsz == MDTEST_VOS_SIZE * 40 / 100);
+
+	/* Reopen the pool after setting DAOS_MD_ON_SSD_NEMB_PCT to 80%.
+	 * It should not impact the already created vos pool.
+	 */
+	d_setenv("DAOS_MD_ON_SSD_NEMB_PCT", "80", true);
+	wal_pool_refill(arg);
+	cont = vos_hdl2cont(arg->ctx.tc_co_hdl);
+	umm  = vos_cont2umm(cont);
+	alloc_bucket_to_full(umm, &ainfo[0], checkpoint_fn, &arg->ctx.tc_po_hdl);
+	rc = umempobj_get_mbusage(umm->umm_pool, 0, &cur_allocated1, &maxsz);
+	assert_true(rc == 0);
+	print_message("nemb space utilization is %lu max is %lu\n", cur_allocated1, maxsz);
+	assert_true(maxsz == MDTEST_VOS_SIZE * 40 / 100);
+	assert_true(cur_allocated == cur_allocated1);
+
+	/* Allocate from Evictable Buckets. */
+	for (i = 1; i <= MDTEST_MB_CNT; i++) {
+		/* Create an MB and fill it with allocs */
+		ainfo[i].mb_id = umem_allot_mb_evictable(umm, 0);
+		for (j = 1; j < i; j++) {
+			if (ainfo[i].mb_id == ainfo[j].mb_id) {
+				found = 1;
+				break;
+			}
+		}
+		if (found)
+			break;
+		ainfo[i].num_allocs  = 0;
+		ainfo[i].start_umoff = UMOFF_NULL;
+		ainfo[i].alloc_size  = 2048;
+		assert_true(ainfo[i].mb_id != 0);
+		alloc_bucket_to_full(umm, &ainfo[i], checkpoint_fn, &arg->ctx.tc_po_hdl);
+	}
+	i--;
+	print_message("Created %d evictable buckets, expected = %ld\n", i,
+		      (MDTEST_META_BLOB_SIZE - maxsz) / MDTEST_MB_SIZE);
+	assert_true(i == (MDTEST_META_BLOB_SIZE - maxsz) / MDTEST_MB_SIZE);
+}
+
 static int
 umoff_in_freelist(umem_off_t *free_list, int cnt, umem_off_t umoff, bool clear)
 {
@@ -2434,6 +2513,7 @@ static const struct CMUnitTest wal_MB_tests[] = {
     {"WAL37: UMEM MB stats test ", wal_umempobj_mbusage_test, setup_mb_io, teardown_mb_io},
     {"WAL38: P2 basic", p2_basic_test, setup_mb_io, teardown_mb_io},
     {"WAL39: P2 fill evictable buckets", p2_fill_test, setup_mb_io, teardown_mb_io},
+    {"WAL40: nemb pct test", wal_mb_nemb_pct, setup_mb_io_nembpct, teardown_mb_io_nembpct},
 };
 
 int
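
---

Note on the sizing arithmetic (illustrative, not part of the applied patch):
heap_get_zone_limits() now caps non-evictable zones at nemb_pct of the cache
zones instead of the former hard-coded "* 8 / 10", so the default of
HEAP_NEMB_PCT_DEFAULT (80) preserves the old behavior, and a zero nemb_pct
byte in headers written before this change falls back to the same default.
The standalone C sketch below mirrors the arithmetic visible in the
heap_get_zone_limits() hunk; the 16 MiB value assumed for ZONE_MAX_SIZE and
the floor assumed for UMEM_CACHE_MIN_EVICTABLE_PAGES are illustrative, since
neither constant's definition appears in this diff.

#include <stdint.h>
#include <stdio.h>

#define ZONE_MAX_SIZE                  (16UL << 20) /* assumed: 16 MiB zones */
#define UMEM_CACHE_MIN_EVICTABLE_PAGES 2UL          /* assumed minimum */

/* Mirror of the nemb_pct cap applied in heap_get_zone_limits(). */
static uint64_t
max_nemb_zones(uint64_t heap_size, uint64_t cache_size, uint32_t nemb_pct)
{
	uint64_t nzones_heap  = heap_size / ZONE_MAX_SIZE;
	uint64_t nzones_cache = cache_size / ZONE_MAX_SIZE;
	uint64_t ne_max;

	if (nzones_heap < nzones_cache + UMEM_CACHE_MIN_EVICTABLE_PAGES)
		ne_max = nzones_cache - UMEM_CACHE_MIN_EVICTABLE_PAGES;
	else
		ne_max = nzones_cache * nemb_pct / 100;
	/* Always keep room for the minimum number of evictable pages. */
	if (nzones_cache < ne_max + UMEM_CACHE_MIN_EVICTABLE_PAGES)
		ne_max = nzones_cache - UMEM_CACHE_MIN_EVICTABLE_PAGES;
	return ne_max;
}

int
main(void)
{
	/* WAL40 geometry: 256 MiB meta blob, 160 MiB VOS cache, pct = 40. */
	uint64_t heap  = 256UL << 20;
	uint64_t cache = 160UL << 20;
	uint64_t ne    = max_nemb_zones(heap, cache, 40);

	/* 10 cache zones * 40% = 4 non-evictable zones = 64 MiB. */
	printf("max NEMB zones: %lu (%lu MiB)\n", ne, (ne * ZONE_MAX_SIZE) >> 20);
	/* Remaining heap zones host evictable buckets: 16 - 4 = 12. */
	printf("evictable buckets: %lu\n", heap / ZONE_MAX_SIZE - ne);
	return 0;
}

With these numbers the sketch reproduces the WAL40 assertions above:
maxsz == MDTEST_VOS_SIZE * 40 / 100 (64 MiB), and
(MDTEST_META_BLOB_SIZE - maxsz) / MDTEST_MB_SIZE == 12 evictable buckets.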