From c2ce88213061c2ab8e45ec0a3067a2a064711085 Mon Sep 17 00:00:00 2001 From: Alexander Oganezov Date: Wed, 21 Aug 2024 08:53:20 -0700 Subject: [PATCH 1/7] DAOS-16340 cart: Fix order of finalize in cart_ctl (#14969) - Fix finalization order in the cart_ctl tool, where now a group is destroyed only after the progress has been stopped. This avoids a segfault that can happen if an RPC is cancelled as part of the 'progress stop' sequence, which then references a group. Signed-off-by: Alexander A Oganezov --- src/utils/ctl/cart_ctl.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/utils/ctl/cart_ctl.c b/src/utils/ctl/cart_ctl.c index b251f5e9d79..3bdf65b2bbb 100644 --- a/src/utils/ctl/cart_ctl.c +++ b/src/utils/ctl/cart_ctl.c @@ -760,6 +760,13 @@ ctl_init() rc); } + /* Stop the progress thread before destroying the group */ + crtu_progress_stop(); + + rc = pthread_join(ctl_gdata.cg_tid, NULL); + if (rc != 0) + error_warn("Failed to join the threads; rc=%d\n", rc); + d_rank_list_free(rank_list); if (ctl_gdata.cg_save_cfg) { @@ -772,12 +779,6 @@ ctl_init() error_warn("Failed to destroy the view; rc=%d\n", rc); } - crtu_progress_stop(); - - rc = pthread_join(ctl_gdata.cg_tid, NULL); - if (rc != 0) - error_warn("Failed to join the threads; rc=%d\n", rc); - rc = sem_destroy(&ctl_gdata.cg_num_reply); if (rc != 0) error_warn("Failed to destroy a semaphore; rc=%d\n", rc); From 61074752516335c72657d9be729a2c5c80eaa843 Mon Sep 17 00:00:00 2001 From: Kris Jacque Date: Wed, 21 Aug 2024 11:48:57 -0600 Subject: [PATCH 2/7] DAOS-16352 control: Handle cases with static ifaces (#14953) Fabric interfaces defined statically in the daos_agent config file are fundamentally different from those detected via hardware scanning. They don't include information derived from the hardware such as their true device class or fabric provider(s). This patch adds some rigor to what is ignored regarding these manually-defined interfaces. 
- Ignore provider for statically-defined fabric interfaces, as we do not bother detecting it. They are assumed to be compatible with whatever provider the agent is using. - Silence confusing "no interfaces requested" error from WaitFabricReady by not calling it if there are no interfaces to check. - Remove some defunct logic related to detecting the provider in the multi-provider case. The agent may only use a single provider. Signed-off-by: Kris Jacque --- src/control/cmd/daos_agent/fabric.go | 4 +- src/control/cmd/daos_agent/fabric_test.go | 30 +++++++++++ src/control/cmd/daos_agent/infocache.go | 5 ++ src/control/cmd/daos_agent/infocache_test.go | 1 + src/control/cmd/daos_agent/mgmt_rpc.go | 42 ++++++--------- src/control/cmd/daos_agent/mgmt_rpc_test.go | 55 +++++++++++++++++--- 6 files changed, 101 insertions(+), 36 deletions(-) diff --git a/src/control/cmd/daos_agent/fabric.go b/src/control/cmd/daos_agent/fabric.go index fbdf6faabac..84657061c3b 100644 --- a/src/control/cmd/daos_agent/fabric.go +++ b/src/control/cmd/daos_agent/fabric.go @@ -362,7 +362,7 @@ func (n *NUMAFabric) Find(name string) ([]*FabricInterface, error) { } // FindDevice looks up a fabric device with a given name, domain, and provider. -// NB: The domain and provider are optional. All other parameters are required. If there is more +// NB: The name is required. All other parameters are optional. If there is more // than one match, all of them are returned. 
func (n *NUMAFabric) FindDevice(params *FabricIfaceParams) ([]*FabricInterface, error) { if params == nil { @@ -406,7 +406,7 @@ func filterDomain(domain string, fiList []*FabricInterface) []*FabricInterface { func filterProvider(provider string, fiList []*FabricInterface) []*FabricInterface { result := make([]*FabricInterface, 0, len(fiList)) for _, fi := range fiList { - if fi.HasProvider(provider) { + if fi.HasProvider(provider) || fi.NetDevClass == FabricDevClassManual { result = append(result, fi) } } diff --git a/src/control/cmd/daos_agent/fabric_test.go b/src/control/cmd/daos_agent/fabric_test.go index 06f26e9896d..c70b68ac8bf 100644 --- a/src/control/cmd/daos_agent/fabric_test.go +++ b/src/control/cmd/daos_agent/fabric_test.go @@ -1100,6 +1100,36 @@ func TestAgent_NUMAFabric_FindDevice(t *testing.T) { }, }, }, + "success with manual interfaces": { + nf: &NUMAFabric{ + numaMap: map[int][]*FabricInterface{ + 0: { + { + Name: "t1", + Domain: "t1", + NetDevClass: FabricDevClassManual, + }, + { + Name: "t2", + Domain: "t2", + NetDevClass: FabricDevClassManual, + }, + }, + }, + }, + params: &FabricIfaceParams{ + Interface: "t2", + Domain: "t2", + Provider: "p2", + }, + expResult: []*FabricInterface{ + { + Name: "t2", + Domain: "t2", + NetDevClass: FabricDevClassManual, + }, + }, + }, "success with no domain": { nf: &NUMAFabric{ numaMap: map[int][]*FabricInterface{ diff --git a/src/control/cmd/daos_agent/infocache.go b/src/control/cmd/daos_agent/infocache.go index 512a2a4e512..317ed72f755 100644 --- a/src/control/cmd/daos_agent/infocache.go +++ b/src/control/cmd/daos_agent/infocache.go @@ -561,6 +561,11 @@ func (c *InfoCache) waitFabricReady(ctx context.Context, netDevClass hardware.Ne } } + if len(needIfaces) == 0 { + c.log.Debugf("no interfaces with device class %s to wait for", netDevClass) + return nil + } + return hardware.WaitFabricReady(ctx, c.log, hardware.WaitFabricReadyParams{ StateProvider: c.devStateGetter, FabricIfaces: needIfaces, diff --git 
a/src/control/cmd/daos_agent/infocache_test.go b/src/control/cmd/daos_agent/infocache_test.go index a6c4688002b..1f658055115 100644 --- a/src/control/cmd/daos_agent/infocache_test.go +++ b/src/control/cmd/daos_agent/infocache_test.go @@ -1581,6 +1581,7 @@ func TestAgent_InfoCache_waitFabricReady(t *testing.T) { netDevClass: hardware.Infiniband, expChecked: []string{"t0", "t1"}, }, + "nothing to wait for": {}, } { t.Run(name, func(t *testing.T) { log, buf := logging.NewTestLogger(t.Name()) diff --git a/src/control/cmd/daos_agent/mgmt_rpc.go b/src/control/cmd/daos_agent/mgmt_rpc.go index 8945022ae29..25cbd5d820e 100644 --- a/src/control/cmd/daos_agent/mgmt_rpc.go +++ b/src/control/cmd/daos_agent/mgmt_rpc.go @@ -7,6 +7,7 @@ package main import ( + "fmt" "net" "strings" @@ -240,42 +241,28 @@ func (mod *mgmtModule) getAttachInfoResp(ctx context.Context, sys string) (*mgmt } func (mod *mgmtModule) selectAttachInfo(ctx context.Context, srvResp *mgmtpb.GetAttachInfoResp, iface, domain string) (*mgmtpb.GetAttachInfoResp, error) { - reqProviders := mod.getIfaceProviders(ctx, iface, domain) - + resp := srvResp if mod.providerIdx > 0 { + mod.log.Debugf("using secondary provider idx %d", mod.providerIdx) + + var err error // Secondary provider indices begin at 1 - resp, err := mod.selectSecondaryAttachInfo(srvResp, mod.providerIdx) + resp, err = mod.selectSecondaryAttachInfo(srvResp, mod.providerIdx) if err != nil { return nil, err } - - if len(reqProviders) != 0 && !reqProviders.Has(resp.ClientNetHint.Provider) { - mod.log.Errorf("requested fabric interface %q (domain: %q) does not report support for configured provider %q (idx %d)", - iface, domain, resp.ClientNetHint.Provider, mod.providerIdx) - } - - return resp, nil } - if len(reqProviders) == 0 || reqProviders.Has(srvResp.ClientNetHint.Provider) { - return srvResp, nil - } - - mod.log.Debugf("primary provider is not supported by requested interface %q domain %q (supports: %s)", iface, domain, 
strings.Join(reqProviders.ToSlice(), ", ")) - - // We can try to be smart about choosing a provider if the client requested a specific interface - for _, hint := range srvResp.SecondaryClientNetHints { - if reqProviders.Has(hint.Provider) { - mod.log.Tracef("found secondary provider supported by requested interface: %q (idx %d)", hint.Provider, hint.ProviderIdx) - return mod.selectSecondaryAttachInfo(srvResp, uint(hint.ProviderIdx)) - } + reqProviders := mod.getIfaceProviders(ctx, iface, domain, hardware.NetDevClass(resp.ClientNetHint.NetDevClass)) + if len(reqProviders) == 0 || reqProviders.Has(resp.ClientNetHint.Provider) { + return resp, nil } - mod.log.Errorf("no supported provider for requested interface %q domain %q, using primary by default", iface, domain) - return srvResp, nil + return nil, fmt.Errorf("provider %s is not supported by requested interface %q domain %q (supports: %s)", + resp.ClientNetHint.Provider, iface, domain, strings.Join(reqProviders.ToSlice(), ", ")) } -func (mod *mgmtModule) getIfaceProviders(ctx context.Context, iface, domain string) common.StringSet { +func (mod *mgmtModule) getIfaceProviders(ctx context.Context, iface, domain string, ndc hardware.NetDevClass) common.StringSet { providers := common.NewStringSet() if iface == "" { return providers @@ -288,9 +275,10 @@ func (mod *mgmtModule) getIfaceProviders(ctx context.Context, iface, domain stri if fis, err := mod.getFabricInterface(ctx, &FabricIfaceParams{ Interface: iface, Domain: domain, + DevClass: ndc, }); err != nil { mod.log.Errorf("requested fabric interface %q (domain %q) may not function as desired: %s", iface, domain, err) - } else { + } else if fis.NetDevClass != FabricDevClassManual { providers.Add(fis.Providers()...) 
} @@ -362,7 +350,7 @@ func (mod *mgmtModule) populateNUMAFabricMap(ctx context.Context, resp *mgmtpb.G if exists { pbFIs.Ifaces = make([]*mgmtpb.FabricInterface, 0, len(fis)) for _, fi := range fis { - if fi.HasProvider(resp.ClientNetHint.Provider) { + if fi.HasProvider(resp.ClientNetHint.Provider) || fi.NetDevClass == FabricDevClassManual { pbFIs.Ifaces = append(pbFIs.Ifaces, &mgmtpb.FabricInterface{ NumaNode: uint32(numaNode), Interface: fi.Name, diff --git a/src/control/cmd/daos_agent/mgmt_rpc_test.go b/src/control/cmd/daos_agent/mgmt_rpc_test.go index 7e57e518d5f..1dccde639dc 100644 --- a/src/control/cmd/daos_agent/mgmt_rpc_test.go +++ b/src/control/cmd/daos_agent/mgmt_rpc_test.go @@ -109,6 +109,7 @@ func TestAgent_mgmtModule_getAttachInfo(t *testing.T) { mockFabricScan fabricScanFn mockGetNetIfaces func() ([]net.Interface, error) numaGetter *mockNUMAProvider + fabricCfg []*NUMAFabricConfig reqBytes []byte expResp *mgmtpb.GetAttachInfoResp expErr error @@ -266,6 +267,34 @@ func TestAgent_mgmtModule_getAttachInfo(t *testing.T) { }, }), }, + "req interface with cfg ifaces": { + reqBytes: reqBytes(&mgmtpb.GetAttachInfoReq{ + Sys: testSys, + Interface: "test0", + }), + fabricCfg: []*NUMAFabricConfig{ + { + NUMANode: 0, + Interfaces: []*FabricInterfaceConfig{ + { + Interface: "test0", + Domain: "test0", + }, + }, + }, + }, + expResp: respWith(testResp, "test0", "test0", []*mgmtpb.FabricInterfaces{ + { + Ifaces: []*mgmtpb.FabricInterface{ + { + Interface: "test0", + Domain: "test0", + Provider: "ofi+tcp", // automatically set to the same as server requested + }, + }, + }, + }), + }, "incompatible error": { reqBytes: reqBytes(&mgmtpb.GetAttachInfoReq{}), mockGetAttachInfo: func(_ context.Context, _ control.UnaryInvoker, _ *control.GetAttachInfoReq) (*control.GetAttachInfoResp, error) { @@ -320,14 +349,26 @@ func TestAgent_mgmtModule_getAttachInfo(t *testing.T) { } } + ic := newTestInfoCache(t, log, testInfoCacheParams{ + mockGetAttachInfo: tc.mockGetAttachInfo, + 
mockScanFabric: tc.mockFabricScan, + mockNetIfaces: tc.mockGetNetIfaces, + mockNetDevClassGetter: &hardware.MockNetDevClassProvider{ + GetNetDevClassReturn: []hardware.MockGetNetDevClassResult{ + { + NDC: hardware.Ether, + }, + }, + }, + }) + if tc.fabricCfg != nil { + nf := NUMAFabricFromConfig(log, tc.fabricCfg) + ic.EnableStaticFabricCache(test.Context(t), nf) + } mod := &mgmtModule{ - log: log, - sys: testSys, - cache: newTestInfoCache(t, log, testInfoCacheParams{ - mockGetAttachInfo: tc.mockGetAttachInfo, - mockScanFabric: tc.mockFabricScan, - mockNetIfaces: tc.mockGetNetIfaces, - }), + log: log, + sys: testSys, + cache: ic, numaGetter: tc.numaGetter, } From 4361a3e4d66dcfaa9a4d84c1c4354dc9f5b2b958 Mon Sep 17 00:00:00 2001 From: Makito Kano Date: Thu, 22 Aug 2024 02:56:29 +0900 Subject: [PATCH 3/7] DAOS-16381 test: Run IOR with HDF5-VOL with multiple object classes (#14964) Currently, the logic in util/file_count_test_base.py runs IOR with HDF5-VOL with only one object class. The second (and beyond) object class will not run with HDF5-VOL (it runs as a normal IOR). Update line 120 to: if api == 'HDF5-VOL': so that if multiple object classes are defined in the test yaml, all of them will run with HDF5-VOL. 
Signed-off-by: Makito Kano --- src/tests/ftest/util/file_count_test_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tests/ftest/util/file_count_test_base.py b/src/tests/ftest/util/file_count_test_base.py index ef6c83dd275..58b1a49f0a5 100644 --- a/src/tests/ftest/util/file_count_test_base.py +++ b/src/tests/ftest/util/file_count_test_base.py @@ -117,7 +117,7 @@ def run_file_count(self): try: self.processes = ior_np self.ppn = ior_ppn - if self.ior_cmd.api.value == 'HDF5-VOL': + if api == 'HDF5-VOL': self.ior_cmd.api.update('HDF5') self.run_ior_with_pool( create_pool=False, plugin_path=hdf5_plugin_path, mount_dir=mount_dir) From 68c3a0df0fba6477c7a5ed0fc45e60c7db29982c Mon Sep 17 00:00:00 2001 From: wiliamhuang Date: Wed, 21 Aug 2024 13:15:19 -0500 Subject: [PATCH 4/7] DAOS-16131 client: intercept mmap() with trampoline method (#14742) Adopt trampoline method to intercept mmap() in libioil to avoid the dead lock when jemalloc library is loaded. Signed-off-by: Lei Huang --- src/client/dfuse/SConscript | 13 ++++++++++++- src/client/dfuse/il/int_posix.c | 30 ++++++++++++++++++++++++++---- src/client/dfuse/il/intercept.h | 6 +++--- src/client/dfuse/il/ioil_io.h | 3 +-- src/client/dfuse/pil4dfs/hook.c | 8 +++++++- src/client/dfuse/pil4dfs/int_dfs.c | 13 +++++-------- 6 files changed, 54 insertions(+), 19 deletions(-) diff --git a/src/client/dfuse/SConscript b/src/client/dfuse/SConscript index d04afac38bf..f9d1d7c01a1 100644 --- a/src/client/dfuse/SConscript +++ b/src/client/dfuse/SConscript @@ -33,7 +33,7 @@ OPS_SRC = ['create', 'statfs'] IOIL_SRC = ['int_posix.c', 'int_read.c', 'int_write.c'] -PIL4DFS_SRC = ['int_dfs.c', 'hook.c', 'dfs_dcache', 'aio.c'] +PIL4DFS_SRC = ['int_dfs.c', 'dfs_dcache.c', 'aio.c'] def build_common(env, files, is_shared): @@ -52,18 +52,28 @@ def build_common(env, files, is_shared): def build_client_libs_shared(env, prereqs): """build the shared interception library""" + # Both libioil and libpil4dfs need hook.c. 
Compile hook.c first. + hookenv = env.Clone() + if '-fvisibility=hidden' in hookenv['CFLAGS']: + hookenv['CFLAGS'].remove('-fvisibility=hidden') + hookenv.AppendUnique(CFLAGS=['-fPIC']) + hookenv.require('capstone') + hook_obj = hookenv.SharedObject(os.path.join('pil4dfs', 'hook.c'), SHOBJPREFIX='s_') + ilenv = env.Clone() ilenv.AppendUnique(CFLAGS=['-fPIC']) ilenv.AppendUnique(CPPDEFINES=['IOIL_PRELOAD']) ilenv.AppendUnique(LIBPATH=[Dir('../dfs')]) ilenv.AppendUnique(LIBPATH=[Dir('../api')]) ilenv.AppendUnique(LIBS=['dfs']) + ilenv.require('capstone') penv = ilenv.Clone() penv.AppendUnique(CPPDEFINES=['_FILE_OFFSET_BITS=64']) il_obj = [] for src in IOIL_SRC: il_obj += ilenv.SharedObject(os.path.join('il', src), SHOBJPREFIX='s_') + il_obj += hook_obj pil4dfsenv = env.Clone() if '-fvisibility=hidden' in pil4dfsenv['CFLAGS']: @@ -78,6 +88,7 @@ def build_client_libs_shared(env, prereqs): pil4dfs_obj = [] for src in PIL4DFS_SRC: pil4dfs_obj += pil4dfsenv.SharedObject(os.path.join('pil4dfs', src)) + pil4dfs_obj += hook_obj common = build_common(penv, COMMON_SRC, True) diff --git a/src/client/dfuse/il/int_posix.c b/src/client/dfuse/il/int_posix.c index 220bd890796..71d0726dda6 100644 --- a/src/client/dfuse/il/int_posix.c +++ b/src/client/dfuse/il/int_posix.c @@ -34,6 +34,7 @@ #include "dfuse_common.h" #include "ioil.h" +#include "../pil4dfs/hook.h" FOREACH_INTERCEPT(IOIL_FORWARD_DECL) @@ -62,6 +63,7 @@ struct ioil_global { bool iog_daos_init; bool iog_show_summary; /**< Should a summary be shown at teardown */ + bool iog_fini_done; /**< Whether destructor function is finished */ unsigned iog_report_count; /**< Number of operations that should be logged */ ATOMIC uint64_t iog_file_count; /**< Number of file opens intercepted */ @@ -76,6 +78,10 @@ static struct ioil_global ioil_iog; static __thread int saved_errno; +static void *(*real_mmap)(void *addr, size_t length, int prot, int flags, int fd, off_t offset); +static void * +dfuse_mmap(void *addr, size_t length, int 
prot, int flags, int fd, off_t offset); + #define SAVE_ERRNO(is_error) \ do { \ if (is_error) \ @@ -344,6 +350,9 @@ ioil_init(void) ioil_iog.iog_eq_count_max = IOIL_MAX_EQ; } + register_a_hook("libc", "mmap", (void *)dfuse_mmap, (long int *)(&real_mmap)); + install_hook(); + ioil_iog.iog_initialized = true; } @@ -370,11 +379,17 @@ ioil_fini(void) int rc; pid_t tid = syscall(SYS_gettid); + if (ioil_iog.iog_fini_done) + return; if (tid != ioil_iog.iog_init_tid) { DFUSE_TRA_INFO(&ioil_iog, "Ignoring destructor from alternate thread"); return; } + if (ioil_iog.iog_initialized) + uninstall_hook(); + else + free_memory_in_hook(); ioil_iog.iog_initialized = false; DFUSE_TRA_DOWN(&ioil_iog); @@ -415,6 +430,7 @@ ioil_fini(void) } ioil_iog.iog_daos_init = false; daos_debug_fini(); + ioil_iog.iog_fini_done = true; } int @@ -1754,9 +1770,8 @@ dfuse_pwritev(int fd, const struct iovec *vector, int iovcnt, off_t offset) return __real_pwritev(fd, vector, iovcnt, offset); } -DFUSE_PUBLIC void * -dfuse_mmap(void *address, size_t length, int prot, int flags, int fd, - off_t offset) +static void * +dfuse_mmap(void *address, size_t length, int prot, int flags, int fd, off_t offset) { struct fd_entry *entry; int rc; @@ -1783,7 +1798,7 @@ dfuse_mmap(void *address, size_t length, int prot, int flags, int fd, return MAP_FAILED; } - return __real_mmap(address, length, prot, flags, fd, offset); + return real_mmap(address, length, prot, flags, fd, offset); } DFUSE_PUBLIC int @@ -3007,5 +3022,12 @@ dfuse_get_bypass_status(int fd) return rc; } +DFUSE_PUBLIC void +dfuse_exit(int rc) +{ + ioil_fini(); + return __real_exit(rc); +} + FOREACH_INTERCEPT(IOIL_DECLARE_ALIAS) FOREACH_ALIASED_INTERCEPT(IOIL_DECLARE_ALIAS64) diff --git a/src/client/dfuse/il/intercept.h b/src/client/dfuse/il/intercept.h index a9e0b076892..a5f9ac60141 100644 --- a/src/client/dfuse/il/intercept.h +++ b/src/client/dfuse/il/intercept.h @@ -1,5 +1,5 @@ /** - * (C) Copyright 2017-2022 Intel Corporation. 
+ * (C) Copyright 2017-2024 Intel Corporation. * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -39,7 +39,6 @@ ACTION(int, fseeko, (FILE *, off_t, int)) \ ACTION(ssize_t, preadv, (int, const struct iovec *, int, off_t)) \ ACTION(ssize_t, pwritev, (int, const struct iovec *, int, off_t)) \ - ACTION(void *, mmap, (void *, size_t, int, int, int, off_t)) \ ACTION(off_t, ftello, (FILE *)) \ ACTION(int, ftruncate, (int, off_t)) @@ -85,7 +84,8 @@ ACTION(wint_t, getwc, (FILE * stream)) \ ACTION(wint_t, getwc_unlocked, (FILE * stream)) \ ACTION(wint_t, fgetwc, (FILE * stream)) \ - ACTION(wint_t, fgetwc_unlocked, (FILE * stream)) + ACTION(wint_t, fgetwc_unlocked, (FILE * stream)) \ + ACTION(void, exit, (int)) #define FOREACH_INTERCEPT(ACTION) \ FOREACH_SINGLE_INTERCEPT(ACTION) \ diff --git a/src/client/dfuse/il/ioil_io.h b/src/client/dfuse/il/ioil_io.h index babe6ae34d8..9bc9ad6141d 100644 --- a/src/client/dfuse/il/ioil_io.h +++ b/src/client/dfuse/il/ioil_io.h @@ -1,5 +1,5 @@ /** - * (C) Copyright 2017-2021 Intel Corporation. + * (C) Copyright 2017-2024 Intel Corporation. 
* * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -21,7 +21,6 @@ DFUSE_PUBLIC ssize_t dfuse_pwrite(int, const void *, size_t, off_t); DFUSE_PUBLIC off_t dfuse_lseek(int, off_t, int); DFUSE_PUBLIC ssize_t dfuse_preadv(int, const struct iovec *, int, off_t); DFUSE_PUBLIC ssize_t dfuse_pwritev(int, const struct iovec *, int, off_t); -DFUSE_PUBLIC void *dfuse_mmap(void *, size_t, int, int, int, off_t); DFUSE_PUBLIC int dfuse_close(int); DFUSE_PUBLIC ssize_t dfuse_read(int, void *, size_t); DFUSE_PUBLIC ssize_t dfuse_write(int, const void *, size_t); diff --git a/src/client/dfuse/pil4dfs/hook.c b/src/client/dfuse/pil4dfs/hook.c index 0ec1a0b5374..4af38d885db 100644 --- a/src/client/dfuse/pil4dfs/hook.c +++ b/src/client/dfuse/pil4dfs/hook.c @@ -104,7 +104,8 @@ static char path_libpil4dfs[PATH_MAX]; static void quit_hook_init(void) { - D_FATAL("pil4dfs failed to initialize, aborting."); + /* print to stdout instead of stderr to avoid fault injection errors */ + printf("pil4dfs failed to initialize, aborting.\n"); exit(1); } @@ -314,6 +315,11 @@ determine_lib_path(void) } D_FREE(lib_dir_str); + if (strstr(read_buff_map, "libioil.so")) { + D_FREE(read_buff_map); + return; + } + pos = strstr(read_buff_map, "libpil4dfs.so"); if (pos == NULL) { D_ERROR("Failed to find the path of libpil4dfs.so.\n"); diff --git a/src/client/dfuse/pil4dfs/int_dfs.c b/src/client/dfuse/pil4dfs/int_dfs.c index eaecdfabc9a..7e009c70d11 100644 --- a/src/client/dfuse/pil4dfs/int_dfs.c +++ b/src/client/dfuse/pil4dfs/int_dfs.c @@ -438,7 +438,7 @@ static int (*next_symlink)(const char *symvalue, const char *path); static int (*next_symlinkat)(const char *symvalue, int dirfd, const char *path); -static ssize_t (*next_readlink)(const char *path, char *buf, size_t size); +static ssize_t (*libc_readlink)(const char *path, char *buf, size_t size); static ssize_t (*next_readlinkat)(int dirfd, const char *path, char *buf, size_t size); static void * (*next_mmap)(void *addr, size_t length, int prot, int 
flags, int fd, off_t offset); @@ -4738,7 +4738,7 @@ symlinkat(const char *symvalue, int dirfd, const char *path) } ssize_t -readlink(const char *path, char *buf, size_t size) +new_readlink(const char *path, char *buf, size_t size) { int is_target_path, rc, rc2; dfs_obj_t *obj; @@ -4749,12 +4749,8 @@ readlink(const char *path, char *buf, size_t size) char *parent_dir = NULL; char *full_path = NULL; - if (next_readlink == NULL) { - next_readlink = dlsym(RTLD_NEXT, "readlink"); - D_ASSERT(next_readlink != NULL); - } if (!d_hook_enabled) - return next_readlink(path, buf, size); + return libc_readlink(path, buf, size); rc = query_path(path, &is_target_path, &parent, item_name, &parent_dir, &full_path, &dfs_mt); @@ -4783,7 +4779,7 @@ readlink(const char *path, char *buf, size_t size) if (parent != NULL) drec_decref(dfs_mt->dcache, parent); FREE(parent_dir); - return next_readlink(path, buf, size); + return libc_readlink(path, buf, size); out_release: rc2 = dfs_release(obj); @@ -6845,6 +6841,7 @@ init_myhook(void) register_a_hook("libc", "exit", (void *)new_exit, (long int *)(&next_exit)); register_a_hook("libc", "dup3", (void *)new_dup3, (long int *)(&libc_dup3)); + register_a_hook("libc", "readlink", (void *)new_readlink, (long int *)(&libc_readlink)); init_fd_dup2_list(); From ec85733be3bfe50050e5eeea50476e311bb239e7 Mon Sep 17 00:00:00 2001 From: paul356 Date: Thu, 22 Aug 2024 03:01:59 +0800 Subject: [PATCH 5/7] DAOS-16098 doc: Add a guide for setting up DAOS using QEMU (#14648) Add a guide for setting up DAOS using QEMU. 
Signed-off-by: Pan Hao --- docs/QSG/qemu-vms.md | 292 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 292 insertions(+) create mode 100644 docs/QSG/qemu-vms.md diff --git a/docs/QSG/qemu-vms.md b/docs/QSG/qemu-vms.md new file mode 100644 index 00000000000..108d89295d9 --- /dev/null +++ b/docs/QSG/qemu-vms.md @@ -0,0 +1,292 @@ +# Set Up DAOS Using QEMU + +## Goal + +This document gives the basic steps to set up an experimental DAOS system using QEMU VMs. This system has the minimal configuration of two VMs. One VM acts as the DAOS server and admin. The other one acts as the DAOS client. You can create pools, containers, and access containers through FUSE. You can experiment with DAOS even if you don't have a spare NVMe disk. + +## Prerequisites + +1. A modern laptop with CPU virtualization support + - Check your CPU by running `cat /proc/cpuinfo` and finding the `vmx` `sse4_2` flags. Your CPU should have at least 4 cores. +2. 16GB RAM or more +3. QEMU + - It is suggested to use a Linux based operating system. In principle you can also use other OSes, but the network setup is quite different from what this document describes. +4. libvirt and QEMU driver +5. dnsmasq +6. Rocky Linux 8.9 minimal ISO +7. Root privilege on the host machine + +## Steps + +Assume you have both hardware and software prepared. + +1. Prepare disks for QEMU VMs. +``` +# the disk image for daos-server +qemu-img create -f qcow2 daos-server.qcow2 40G +# the disk image for the emulated nvme disk for DAOS Tier 1 +qemu-img create -f qcow2 qemu-nvm-disk1.qcow2 16G +# the disk image for daos-client +qemu-img create -f qcow2 daos-client.qcow2 20G +``` +2. Set up the network. +We use tap networks to connect the VMs. The guest network can be set up automatically with `/etc/qemu-ifup`. This script will be called by QEMU. Put these contents in this script; more info is available on [this page](https://wiki.qemu.org/Documentation/Networking/NAT).
It basically creates a network bridge if non-existent, and runs a dnsmasq service on this bridge to assign IP addresses and act as a gateway. Make sure that there is no other dnsmasq running as it may conflict with our dnsmasq. If you have virt-manager installed, make sure dnsmasq processes started by virt-manager are closed first. +``` +#!/bin/sh +# +# Copyright IBM, Corp. 2010 +# +# Authors: +# Anthony Liguori +# +# This work is licensed under the terms of the GNU GPL, version 2. See +# the COPYING file in the top-level directory. + +# Set to the name of your bridge +BRIDGE=br0 + +# Network information +NETWORK=192.168.53.0 +NETMASK=255.255.255.0 +GATEWAY=192.168.53.1 +DHCPRANGE=192.168.53.2,192.168.53.254 + +# Optionally parameters to enable PXE support +TFTPROOT= +BOOTP= + +do_brctl() { + brctl "$@" +} + +do_ifconfig() { + ifconfig "$@" +} + +do_dd() { + dd "$@" +} + +do_iptables_restore() { + iptables-restore "$@" +} + +do_dnsmasq() { + dnsmasq "$@" +} + +check_bridge() { + if do_brctl show | grep "^$1" > /dev/null 2> /dev/null; then + return 1 + else + return 0 + fi +} + +create_bridge() { + do_brctl addbr "$1" + do_brctl stp "$1" off + do_brctl setfd "$1" 0 + do_ifconfig "$1" "$GATEWAY" netmask "$NETMASK" up +} + +enable_ip_forward() { + echo 1 | do_dd of=/proc/sys/net/ipv4/ip_forward > /dev/null +} + +add_filter_rules() { +do_iptables_restore < blackbox=(root) NOPASSWD: /usr/bin/qemu-system-x86_64 +``` +3. Install guest operating systems. + +Run these two commands. Select "boot from cdrom". Follow the installation guide to install the OSes. I use Rocky Linux 8.9 in this guide. 
+``` +sudo qemu-system-x86_64 -M q35,accel=kvm,kernel-irqchip=split -cpu host -smp 3 -m 12288 -device intel-iommu,intremap=on -drive file=/daos-server.qcow2,if=virtio -device virtio-net,netdev=mynet0,mac=52:54:00:12:34:56 -netdev tap,id=mynet0 -drive file=/qemu-nvm-disk1.qcow2,if=none,id=nvm1 -device nvme,serial=deadbeef,drive=nvm1 -cdrom /Rocky-8.9-x86_64-minimal.iso -boot menu=on & +sudo qemu-system-x86_64 -M q35,accel=kvm,kernel-irqchip=split -cpu host -smp 1 -m 2048 -device intel-iommu,intremap=on -drive file=/daos-client.qcow2,if=virtio -device virtio-net,netdev=mynet1,mac=52:54:00:12:34:57 -netdev tap,id=mynet1 -cdrom /Rocky-8.9-x86_64-minimal.iso -boot menu=on +``` +After OS installation is finished, remove the `-cdrom` and `-boot` options from the QEMU commands to boot the VMs normally. The next step is to install the DAOS software. +``` +sudo qemu-system-x86_64 -M q35,accel=kvm,kernel-irqchip=split -cpu host -smp 3 -m 12288 -device intel-iommu,intremap=on -drive file=/daos-server.qcow2,if=virtio -device virtio-net,netdev=mynet0,mac=52:54:00:12:34:56 -netdev tap,id=mynet0 -drive file=/qemu-nvm-disk1.qcow2,if=none,id=nvm1 -device nvme,serial=deadbeef,drive=nvm1 & +sudo qemu-system-x86_64 -M q35,accel=kvm,kernel-irqchip=split -cpu host -smp 1 -m 2048 -device intel-iommu,intremap=on -drive file=/daos-client.qcow2,if=virtio -device virtio-net,netdev=mynet1,mac=52:54:00:12:34:57 -netdev tap,id=mynet1 +``` +By default the network won't automatically connect in Rocky Linux 8.9, and the ssh service is not enabled either. Run these commands to enable autoconnect behavior and the sshd service. You can use ssh to connect to your VMs next time. +``` +# enable the default connection +nmcli c up enp0s2 +# enable autoconnect for this connection +nmcli c modify enp0s2 connection.autoconnect "yes" +# check if connection.autoconnect is enabled +nmcli c show enp0s2 +# enable sshd service +systemctl enable sshd.service +# start sshd service +systemctl start sshd.service +``` + +4.
Install DAOS software. + +I follow these [steps](https://docs.daos.io/latest/QSG/setup_rhel/) to install the DAOS server, DAOS admin, and DAOS client. I install daos-server and daos-admin on the first VM, and install daos-client on the second VM. You can follow the steps before Step "Create Configuration Files" to install DAOS software. Then come back to update config files. + +5. Update config files. + +Update the daos-server config file `/etc/daos/daos_server.yml` on daos-server. You may need to update "access\_points", "fabric\_iface" and "bdev\_list". Update "access\_points" accordingly if you name daos-server differently. Check if the network device has the same name as listed under "fabric\_iface". Look in the output of `lspci` for "bdev\_list". The info for our NVMe controller is like *??:??.? Non-Volatile memory controller: Red Hat, Inc. QEMU NVM Express Controller (rev 02)*. The prefix *??:??.?* is the address of the NVMe device. +``` +name: daos_server +access_points: +- daos-server +port: 10001 + +nr_hugepages: 1024 +system_ram_reserved: 4 +disable_vfio: true + +transport_config: + allow_insecure: false + client_cert_dir: /etc/daos/certs/clients + ca_cert: /etc/daos/certs/daosCA.crt + cert: /etc/daos/certs/server.crt + key: /etc/daos/certs/server.key +# Haven't got ofi+tcp to work with QEMU. Need further investigation. +provider: ofi+sockets +control_log_mask: DEBUG +control_log_file: /tmp/daos_server.log +helper_log_file: /tmp/daos_server_helper.log +engines: +- + targets: 1 + pinned_numa_node: 0 + nr_xs_helpers: 0 + fabric_iface: enp0s2 + fabric_iface_port: 31316 + log_mask: DEBUG + log_file: /tmp/daos_engine_0.log + storage: + - + class: ram + scm_mount: /mnt/daos0 + scm_size: 4 + - + class: nvme + bdev_list: + - "0000:00:03.0" +``` +Because we disable VFIO and use UIO, we have to run daos\_server.service as root. Open `/usr/lib/systemd/system/daos_server.service`, then change User and Group to root.
Then run `systemctl daemon-reload` to reload systemd scripts. +``` +... +[Service] +Type=simple +User=root +Group=root +... +``` +Add the following line to `/etc/sysctl.conf` to preallocate hugepages. You need to reboot daos-server after updating `/etc/sysctl.conf`. +``` +vm.nr_hugepages = 1024 +``` +Update the control config file `/etc/daos/daos_control.yml` on daos-server. You may need to update "hostlist" if the hostname of daos-server is different. +``` +name: daos_server +port: 10001 +hostlist: +- daos-server + +transport_config: + allow_insecure: false + ca_cert: /etc/daos/certs/daosCA.crt + cert: /etc/daos/certs/admin.crt + key: /etc/daos/certs/admin.key +``` +Then update the agent config file `/etc/daos/daos_agent.yml` on daos-client. You may need to update "access\_points" if the hostname of daos-server is different. +``` +name: daos_server +access_points: +- daos-server + +port: 10001 + +transport_config: + allow_insecure: false + ca_cert: /etc/daos/certs/daosCA.crt + cert: /etc/daos/certs/agent.crt + key: /etc/daos/certs/agent.key +log_file: /tmp/daos_agent.log +control_log_mask: DEBUG +``` +6. Start services. + On daos-server VM run `systemctl enable daos_server` to enable daos\_server service. Run `systemctl start daos_server` to start it. On daos-client VM run `systemctl enable daos_agent` to enable daos\_agent service. Run `systemctl start daos_agent` to start it. If everything is correct, you should be able to experiment with this minimal DAOS setup. Try to create pools, containers, and mount POSIX FSes. + +## Possible Issues + +1. If `dmg` returns an error complaining that it does not know daos-server, you can add the daos-server IP and name to /etc/hosts. +2. daos-server fails. Check the `/tmp/daos_server.log` for ERROR messages. +3. daos-server is up, but connections are rejected. Check if you have firewalld still turned on. Turn off firewalld with `systemctl stop firewalld; systemctl disable firewalld`.
From a1b2df2811f3dc616c3c7c8850b79744a5293a86 Mon Sep 17 00:00:00 2001 From: Niu Yawei Date: Thu, 22 Aug 2024 10:59:45 +0800 Subject: [PATCH 6/7] DAOS-14317 vos: vos_obj_hold() rework (#14701) To support object pre-loading in md-on-ssd phase 2, vos_obj_hold() is split into three functions: 1. vos_obj_hold() This function finds object in cache and holds it, in md-on-ssd phase 2 mode, it also needs to load the object from meta blob and pin it in memory, since object loading could yield, this function has to be called before starting a local transaction. If the object doesn't exist, a negative object cache entry (vos_object with NULL obj_df) will be returned when VOS_OBJ_CREATE is specified, otherwise, -DER_NONEXIST is returned. 2. vos_obj_incarnate() This function creates object to make the negative object cache entry incarnated, if the object is already existing, only some update checks will be performed. This function is called within a local transaction for INTENT_PUNCH and INTENT_UPDATE cases. 3. vos_obj_check_discard() This function checks if current operation will be conflicting with ongoing operations over the same object. (aggregate, discard, obj discard, update, etc.). It will be deprecated once the 'obj discard' is deprecated. Some other minor cleanup: - Removed the 'occ' (which is a TLS) parameter from bunch of vos object APIs. - Removed VOS_OBJ_KILL_DKEY and VOS_OBJ_NO_HOLD flags. 
Signed-off-by: Niu Yawei --- src/vos/tests/vts_aggregate.c | 5 +- src/vos/tests/vts_io.c | 123 +++++---- src/vos/vos_common.c | 4 +- src/vos/vos_container.c | 4 +- src/vos/vos_dtx.c | 9 +- src/vos/vos_internal.h | 10 - src/vos/vos_io.c | 20 +- src/vos/vos_obj.c | 80 +++--- src/vos/vos_obj.h | 99 ++++--- src/vos/vos_obj_cache.c | 470 +++++++++++++++++++--------------- src/vos/vos_obj_index.c | 83 +++--- src/vos/vos_query.c | 5 +- 12 files changed, 491 insertions(+), 421 deletions(-) diff --git a/src/vos/tests/vts_aggregate.c b/src/vos/tests/vts_aggregate.c index 1b33fe171e0..aa183a41b52 100644 --- a/src/vos/tests/vts_aggregate.c +++ b/src/vos/tests/vts_aggregate.c @@ -208,11 +208,10 @@ lookup_object(struct io_test_args *arg, daos_unit_oid_t oid) * tree. If this returns 0, we need to release the object though * this is only presently used to check existence */ - rc = vos_obj_hold(vos_obj_cache_current(true), - vos_hdl2cont(arg->ctx.tc_co_hdl), oid, &epr, 0, + rc = vos_obj_hold(vos_hdl2cont(arg->ctx.tc_co_hdl), oid, &epr, 0, VOS_OBJ_VISIBLE, DAOS_INTENT_DEFAULT, &obj, 0); if (rc == 0) - vos_obj_release(vos_obj_cache_current(true), obj, 0, false); + vos_obj_release(obj, 0, false); return rc; } diff --git a/src/vos/tests/vts_io.c b/src/vos/tests/vts_io.c index 5ec1bb08f76..93bb20d4906 100644 --- a/src/vos/tests/vts_io.c +++ b/src/vos/tests/vts_io.c @@ -896,9 +896,27 @@ io_update_and_fetch_dkey(struct io_test_args *arg, daos_epoch_t update_epoch, } static inline int -hold_objects(struct vos_object **objs, struct daos_lru_cache *occ, - daos_handle_t *coh, daos_unit_oid_t *oid, int start, int end, - bool no_create, int exp_rc) +hold_obj(struct vos_container *cont, daos_unit_oid_t oid, daos_epoch_range_t *epr, + daos_epoch_t bound, uint64_t flags, uint32_t intent, struct vos_object **obj_p, + struct vos_ts_set *ts_set) +{ + int rc; + + rc = vos_obj_hold(cont, oid, epr, bound, flags, intent, obj_p, ts_set); + if (rc) + return rc; + + if (flags & VOS_OBJ_CREATE) { + 
assert_ptr_not_equal(*obj_p, NULL); + rc = vos_obj_incarnate(*obj_p, epr, bound, flags, intent, ts_set); + } + + return rc; +} + +static inline int +hold_objects(struct vos_object **objs, daos_handle_t *coh, daos_unit_oid_t *oid, + int start, int end, bool no_create, int exp_rc) { int i = 0, rc = 0; daos_epoch_range_t epr = {0, 1}; @@ -907,9 +925,8 @@ hold_objects(struct vos_object **objs, struct daos_lru_cache *occ, hold_flags = no_create ? 0 : VOS_OBJ_CREATE; hold_flags |= VOS_OBJ_VISIBLE; for (i = start; i < end; i++) { - rc = vos_obj_hold(occ, vos_hdl2cont(*coh), *oid, &epr, 0, - hold_flags, no_create ? DAOS_INTENT_DEFAULT : - DAOS_INTENT_UPDATE, &objs[i], 0); + rc = hold_obj(vos_hdl2cont(*coh), *oid, &epr, 0, hold_flags, + no_create ? DAOS_INTENT_DEFAULT : DAOS_INTENT_UPDATE, &objs[i], 0); if (rc != exp_rc) return 1; } @@ -991,109 +1008,105 @@ io_obj_cache_test(void **state) rc = umem_tx_begin(ummg, NULL); assert_rc_equal(rc, 0); - rc = vos_obj_hold(occ, vos_hdl2cont(ctx->tc_co_hdl), oids[0], &epr, 0, - VOS_OBJ_CREATE | VOS_OBJ_VISIBLE, DAOS_INTENT_DEFAULT, - &objs[0], 0); + rc = hold_obj(vos_hdl2cont(ctx->tc_co_hdl), oids[0], &epr, 0, + VOS_OBJ_CREATE | VOS_OBJ_VISIBLE, DAOS_INTENT_UPDATE, &objs[0], 0); assert_rc_equal(rc, 0); /** Hold object for discard */ - rc = vos_obj_hold(occ, vos_hdl2cont(ctx->tc_co_hdl), oids[0], &epr, 0, VOS_OBJ_DISCARD, - DAOS_INTENT_DISCARD, &obj1, 0); + rc = hold_obj(vos_hdl2cont(ctx->tc_co_hdl), oids[0], &epr, 0, VOS_OBJ_DISCARD, + DAOS_INTENT_DISCARD, &obj1, 0); assert_rc_equal(rc, 0); /** Second discard should fail */ - rc = vos_obj_hold(occ, vos_hdl2cont(ctx->tc_co_hdl), oids[0], &epr, 0, VOS_OBJ_DISCARD, - DAOS_INTENT_DISCARD, &obj2, 0); + rc = hold_obj(vos_hdl2cont(ctx->tc_co_hdl), oids[0], &epr, 0, VOS_OBJ_DISCARD, + DAOS_INTENT_DISCARD, &obj2, 0); assert_rc_equal(rc, -DER_BUSY); /** Should prevent simultaneous aggregation */ - rc = vos_obj_hold(occ, vos_hdl2cont(ctx->tc_co_hdl), oids[0], &epr, 0, VOS_OBJ_AGGREGATE, - 
DAOS_INTENT_PURGE, &obj2, 0); + rc = hold_obj(vos_hdl2cont(ctx->tc_co_hdl), oids[0], &epr, 0, VOS_OBJ_AGGREGATE, + DAOS_INTENT_PURGE, &obj2, 0); assert_rc_equal(rc, -DER_BUSY); /** Should prevent simultaneous hold for create as well */ - rc = vos_obj_hold(occ, vos_hdl2cont(ctx->tc_co_hdl), oids[0], &epr, 0, - VOS_OBJ_CREATE | VOS_OBJ_VISIBLE, DAOS_INTENT_DEFAULT, - &obj2, 0); + rc = hold_obj(vos_hdl2cont(ctx->tc_co_hdl), oids[0], &epr, 0, + VOS_OBJ_CREATE | VOS_OBJ_VISIBLE, DAOS_INTENT_UPDATE, &obj2, 0); assert_rc_equal(rc, -DER_UPDATE_AGAIN); /** Need to be able to hold for read though or iteration won't work */ - rc = vos_obj_hold(occ, vos_hdl2cont(ctx->tc_co_hdl), oids[0], &epr, 0, - VOS_OBJ_VISIBLE, DAOS_INTENT_DEFAULT, &obj2, 0); - vos_obj_release(occ, obj2, 0, false); - vos_obj_release(occ, obj1, VOS_OBJ_DISCARD, false); + rc = hold_obj(vos_hdl2cont(ctx->tc_co_hdl), oids[0], &epr, 0, VOS_OBJ_VISIBLE, + DAOS_INTENT_DEFAULT, &obj2, 0); + vos_obj_release(obj2, 0, false); + vos_obj_release(obj1, VOS_OBJ_DISCARD, false); /** Hold object for aggregation */ - rc = vos_obj_hold(occ, vos_hdl2cont(ctx->tc_co_hdl), oids[0], &epr, 0, VOS_OBJ_AGGREGATE, - DAOS_INTENT_PURGE, &obj1, 0); + rc = hold_obj(vos_hdl2cont(ctx->tc_co_hdl), oids[0], &epr, 0, VOS_OBJ_AGGREGATE, + DAOS_INTENT_PURGE, &obj1, 0); assert_rc_equal(rc, 0); /** Discard should fail */ - rc = vos_obj_hold(occ, vos_hdl2cont(ctx->tc_co_hdl), oids[0], &epr, 0, VOS_OBJ_DISCARD, - DAOS_INTENT_DISCARD, &obj2, 0); + rc = hold_obj(vos_hdl2cont(ctx->tc_co_hdl), oids[0], &epr, 0, VOS_OBJ_DISCARD, + DAOS_INTENT_DISCARD, &obj2, 0); assert_rc_equal(rc, -DER_BUSY); /** Second aggregation should fail */ - rc = vos_obj_hold(occ, vos_hdl2cont(ctx->tc_co_hdl), oids[0], &epr, 0, VOS_OBJ_AGGREGATE, - DAOS_INTENT_PURGE, &obj2, 0); + rc = hold_obj(vos_hdl2cont(ctx->tc_co_hdl), oids[0], &epr, 0, VOS_OBJ_AGGREGATE, + DAOS_INTENT_PURGE, &obj2, 0); assert_rc_equal(rc, -DER_BUSY); /** Simultaneous create should work */ - rc = 
vos_obj_hold(occ, vos_hdl2cont(ctx->tc_co_hdl), oids[0], &epr, 0, - VOS_OBJ_CREATE | VOS_OBJ_VISIBLE, DAOS_INTENT_DEFAULT, &obj2, 0); + rc = hold_obj(vos_hdl2cont(ctx->tc_co_hdl), oids[0], &epr, 0, + VOS_OBJ_CREATE | VOS_OBJ_VISIBLE, DAOS_INTENT_UPDATE, &obj2, 0); assert_rc_equal(rc, 0); - vos_obj_release(occ, obj2, 0, false); + vos_obj_release(obj2, 0, false); /** Need to be able to hold for read though or iteration won't work */ - rc = vos_obj_hold(occ, vos_hdl2cont(ctx->tc_co_hdl), oids[0], &epr, 0, VOS_OBJ_VISIBLE, - DAOS_INTENT_DEFAULT, &obj2, 0); - vos_obj_release(occ, obj2, 0, false); - vos_obj_release(occ, obj1, VOS_OBJ_AGGREGATE, false); + rc = hold_obj(vos_hdl2cont(ctx->tc_co_hdl), oids[0], &epr, 0, VOS_OBJ_VISIBLE, + DAOS_INTENT_DEFAULT, &obj2, 0); + vos_obj_release(obj2, 0, false); + vos_obj_release(obj1, VOS_OBJ_AGGREGATE, false); /** Now that other one is done, this should work */ - rc = vos_obj_hold(occ, vos_hdl2cont(ctx->tc_co_hdl), oids[0], &epr, 0, VOS_OBJ_DISCARD, - DAOS_INTENT_DISCARD, &obj2, 0); + rc = hold_obj(vos_hdl2cont(ctx->tc_co_hdl), oids[0], &epr, 0, VOS_OBJ_DISCARD, + DAOS_INTENT_DISCARD, &obj2, 0); assert_rc_equal(rc, 0); - vos_obj_release(occ, obj2, VOS_OBJ_DISCARD, false); + vos_obj_release(obj2, VOS_OBJ_DISCARD, false); rc = umem_tx_end(ummg, 0); assert_rc_equal(rc, 0); - vos_obj_release(occ, objs[0], 0, false); + vos_obj_release(objs[0], 0, false); rc = umem_tx_begin(umml, NULL); assert_rc_equal(rc, 0); - rc = vos_obj_hold(occ, vos_hdl2cont(l_coh), oids[1], &epr, 0, - VOS_OBJ_CREATE | VOS_OBJ_VISIBLE, DAOS_INTENT_DEFAULT, - &objs[0], 0); + rc = hold_obj(vos_hdl2cont(l_coh), oids[1], &epr, 0, + VOS_OBJ_CREATE | VOS_OBJ_VISIBLE, DAOS_INTENT_UPDATE, &objs[0], 0); assert_rc_equal(rc, 0); - vos_obj_release(occ, objs[0], 0, false); + vos_obj_release(objs[0], 0, false); rc = umem_tx_end(umml, 0); assert_rc_equal(rc, 0); - rc = hold_objects(objs, occ, &ctx->tc_co_hdl, &oids[0], 0, 10, true, 0); + rc = hold_objects(objs, &ctx->tc_co_hdl, 
&oids[0], 0, 10, true, 0); assert_int_equal(rc, 0); - rc = hold_objects(objs, occ, &ctx->tc_co_hdl, &oids[1], 10, 15, true, - -DER_NONEXIST); + rc = hold_objects(objs, &ctx->tc_co_hdl, &oids[1], 10, 15, true, -DER_NONEXIST); assert_int_equal(rc, 0); - rc = hold_objects(objs, occ, &l_coh, &oids[1], 10, 15, true, 0); + rc = hold_objects(objs, &l_coh, &oids[1], 10, 15, true, 0); assert_int_equal(rc, 0); - rc = vos_obj_hold(occ, vos_hdl2cont(l_coh), oids[1], &epr, 0, - VOS_OBJ_VISIBLE, DAOS_INTENT_DEFAULT, &objs[16], 0); + rc = hold_obj(vos_hdl2cont(l_coh), oids[1], &epr, 0, VOS_OBJ_VISIBLE, + DAOS_INTENT_DEFAULT, &objs[16], 0); assert_rc_equal(rc, 0); - vos_obj_release(occ, objs[16], 0, false); + vos_obj_release(objs[16], 0, false); for (i = 0; i < 5; i++) - vos_obj_release(occ, objs[i], 0, false); + vos_obj_release(objs[i], 0, false); for (i = 10; i < 15; i++) - vos_obj_release(occ, objs[i], 0, false); + vos_obj_release(objs[i], 0, false); - rc = hold_objects(objs, occ, &l_coh, &oids[1], 15, 20, true, 0); + rc = hold_objects(objs, &l_coh, &oids[1], 15, 20, true, 0); assert_int_equal(rc, 0); for (i = 5; i < 10; i++) - vos_obj_release(occ, objs[i], 0, false); + vos_obj_release(objs[i], 0, false); for (i = 15; i < 20; i++) - vos_obj_release(occ, objs[i], 0, false); + vos_obj_release(objs[i], 0, false); rc = vos_cont_close(l_coh); assert_rc_equal(rc, 0); diff --git a/src/vos/vos_common.c b/src/vos/vos_common.c index fb8461e2931..e9389deea2f 100644 --- a/src/vos/vos_common.c +++ b/src/vos/vos_common.c @@ -241,7 +241,6 @@ static inline void vos_local_tx_abort(struct dtx_handle *dth) { struct dtx_local_oid_record *record = NULL; - struct daos_lru_cache *occ = NULL; if (dth->dth_local_oid_cnt == 0) return; @@ -251,7 +250,6 @@ vos_local_tx_abort(struct dtx_handle *dth) * can be used to access the pool. 
*/ record = &dth->dth_local_oid_array[0]; - occ = vos_obj_cache_current(record->dor_cont->vc_pool->vp_sysdb); /** * Evict all objects touched by the aborted transaction from the object cache to make sure @@ -260,7 +258,7 @@ vos_local_tx_abort(struct dtx_handle *dth) */ for (int i = 0; i < dth->dth_local_oid_cnt; ++i) { record = &dth->dth_local_oid_array[i]; - (void)vos_obj_evict_by_oid(occ, record->dor_cont, record->dor_oid); + (void)vos_obj_evict_by_oid(record->dor_cont, record->dor_oid); } } diff --git a/src/vos/vos_container.c b/src/vos/vos_container.c index 95fd533ec7a..6e6cbeeeb2a 100644 --- a/src/vos/vos_container.c +++ b/src/vos/vos_container.c @@ -1,5 +1,5 @@ /** - * (C) Copyright 2016-2023 Intel Corporation. + * (C) Copyright 2016-2024 Intel Corporation. * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -488,7 +488,7 @@ vos_cont_close(daos_handle_t coh) cont->vc_open_count--; if (cont->vc_open_count == 0) - vos_obj_cache_evict(vos_obj_cache_current(cont->vc_pool->vp_sysdb), cont); + vos_obj_cache_evict(cont); D_DEBUG(DB_TRACE, "Close cont "DF_UUID", open count: %d\n", DP_UUID(cont->vc_id), cont->vc_open_count); diff --git a/src/vos/vos_dtx.c b/src/vos/vos_dtx.c index 0e70133629f..697b943ecf0 100644 --- a/src/vos/vos_dtx.c +++ b/src/vos/vos_dtx.c @@ -199,8 +199,7 @@ dtx_act_ent_cleanup(struct vos_container *cont, struct vos_dtx_act_ent *dae, } for (i = 0; i < count; i++) - vos_obj_evict_by_oid(vos_obj_cache_current(cont->vc_pool->vp_sysdb), - cont, oids[i]); + vos_obj_evict_by_oid(cont, oids[i]); } if (dae->dae_oids != NULL && dae->dae_oids != &dae->dae_oid_inline && @@ -2646,14 +2645,12 @@ int vos_dtx_mark_sync(daos_handle_t coh, daos_unit_oid_t oid, daos_epoch_t epoch) { struct vos_container *cont; - struct daos_lru_cache *occ; struct vos_object *obj; daos_epoch_range_t epr = {0, epoch}; int rc; cont = vos_hdl2cont(coh); - occ = vos_obj_cache_current(cont->vc_pool->vp_sysdb); - rc = vos_obj_hold(occ, cont, oid, &epr, 0, VOS_OBJ_VISIBLE, + rc = 
vos_obj_hold(cont, oid, &epr, 0, VOS_OBJ_VISIBLE, DAOS_INTENT_DEFAULT, &obj, 0); if (rc != 0) { D_ERROR(DF_UOID" fail to mark sync: rc = "DF_RC"\n", @@ -2672,7 +2669,7 @@ vos_dtx_mark_sync(daos_handle_t coh, daos_unit_oid_t oid, daos_epoch_t epoch) sizeof(obj->obj_df->vo_sync), UMEM_COMMIT_IMMEDIATE); } - vos_obj_release(occ, obj, 0, false); + vos_obj_release(obj, 0, false); return 0; } diff --git a/src/vos/vos_internal.h b/src/vos/vos_internal.h index 7d4dd3ac166..f6a74fce7e6 100644 --- a/src/vos/vos_internal.h +++ b/src/vos/vos_internal.h @@ -619,16 +619,6 @@ vos_pool_hash_del(struct vos_pool *pool) d_uhash_link_delete(vos_pool_hhash_get(pool->vp_sysdb), &pool->vp_hlink); } -/** - * Getting object cache - * Wrapper for TLS and standalone mode - */ -static inline struct daos_lru_cache * -vos_get_obj_cache(void) -{ - return vos_tls_get(false)->vtl_ocache; -} - /** * Register btree class for container table, it is called within vos_init() * diff --git a/src/vos/vos_io.c b/src/vos/vos_io.c index 663e1c2c4d0..4d452f50d6a 100644 --- a/src/vos/vos_io.c +++ b/src/vos/vos_io.c @@ -571,8 +571,7 @@ vos_ioc_destroy(struct vos_io_context *ioc, bool evict) dcs_csum_info_list_fini(&ioc->ic_csum_list); if (ioc->ic_obj) - vos_obj_release(vos_obj_cache_current(ioc->ic_cont->vc_pool->vp_sysdb), ioc->ic_obj, - 0, evict); + vos_obj_release(ioc->ic_obj, 0, evict); vos_ioc_reserve_fini(ioc); vos_ilog_fetch_finish(&ioc->ic_dkey_info); @@ -1562,8 +1561,7 @@ vos_fetch_begin(daos_handle_t coh, daos_unit_oid_t oid, daos_epoch_t epoch, rc = vos_ts_set_add(ioc->ic_ts_set, ioc->ic_cont->vc_ts_idx, NULL, 0); D_ASSERT(rc == 0); - rc = vos_obj_hold(vos_obj_cache_current(ioc->ic_cont->vc_pool->vp_sysdb), - ioc->ic_cont, oid, &ioc->ic_epr, ioc->ic_bound, VOS_OBJ_VISIBLE, + rc = vos_obj_hold(ioc->ic_cont, oid, &ioc->ic_epr, ioc->ic_bound, VOS_OBJ_VISIBLE, DAOS_INTENT_DEFAULT, &ioc->ic_obj, ioc->ic_ts_set); if (stop_check(ioc, VOS_COND_FETCH_MASK | VOS_OF_COND_PER_AKEY, NULL, &rc, false)) { @@ 
-2421,7 +2419,8 @@ vos_update_end(daos_handle_t ioh, uint32_t pm_ver, daos_key_t *dkey, int err, struct vos_io_context *ioc = vos_ioh2ioc(ioh); struct umem_instance *umem; bool tx_started = false; - uint16_t minor_epc; + uint16_t minor_epc; + uint64_t flags = VOS_OBJ_CREATE | VOS_OBJ_VISIBLE; D_ASSERT(ioc->ic_update); vos_dedup_verify_fini(ioh); @@ -2434,6 +2433,11 @@ vos_update_end(daos_handle_t ioh, uint32_t pm_ver, daos_key_t *dkey, int err, err = vos_ts_set_add(ioc->ic_ts_set, ioc->ic_cont->vc_ts_idx, NULL, 0); D_ASSERT(err == 0); + err = vos_obj_hold(ioc->ic_cont, ioc->ic_oid, &ioc->ic_epr, ioc->ic_bound, + flags, DAOS_INTENT_UPDATE, &ioc->ic_obj, ioc->ic_ts_set); + if (err != 0) + goto abort; + err = vos_tx_begin(dth, umem, ioc->ic_cont->vc_pool->vp_sysdb); if (err != 0) goto abort; @@ -2460,10 +2464,8 @@ vos_update_end(daos_handle_t ioh, uint32_t pm_ver, daos_key_t *dkey, int err, D_FREE(daes); } - err = vos_obj_hold(vos_obj_cache_current(ioc->ic_cont->vc_pool->vp_sysdb), - ioc->ic_cont, ioc->ic_oid, &ioc->ic_epr, ioc->ic_bound, - VOS_OBJ_CREATE | VOS_OBJ_VISIBLE, DAOS_INTENT_UPDATE, - &ioc->ic_obj, ioc->ic_ts_set); + err = vos_obj_incarnate(ioc->ic_obj, &ioc->ic_epr, ioc->ic_bound, flags, + DAOS_INTENT_UPDATE, ioc->ic_ts_set); if (err != 0) goto abort; diff --git a/src/vos/vos_obj.c b/src/vos/vos_obj.c index 00219138bd4..ee56a01b629 100644 --- a/src/vos/vos_obj.c +++ b/src/vos/vos_obj.c @@ -368,13 +368,11 @@ static int obj_punch(daos_handle_t coh, struct vos_object *obj, daos_epoch_t epoch, daos_epoch_t bound, uint64_t flags, struct vos_ts_set *ts_set) { - struct daos_lru_cache *occ; struct vos_container *cont; struct vos_ilog_info *info; int rc; cont = vos_hdl2cont(coh); - occ = vos_obj_cache_current(cont->vc_pool->vp_sysdb); D_ALLOC_PTR(info); if (info == NULL) return -DER_NOMEM; @@ -387,7 +385,7 @@ obj_punch(daos_handle_t coh, struct vos_object *obj, daos_epoch_t epoch, /* evict it from cache, because future fetch should only see empty * object (without 
obj_df) */ - vos_obj_evict(occ, obj); + vos_obj_evict(obj); failed: vos_ilog_fetch_finish(info); D_FREE(info); @@ -487,6 +485,13 @@ vos_obj_punch(daos_handle_t coh, daos_unit_oid_t oid, daos_epoch_t epoch, if (rc != 0) goto reset; + hold_flags = (flags & VOS_OF_COND_PUNCH) ? 0 : VOS_OBJ_CREATE; + hold_flags |= VOS_OBJ_VISIBLE; + + rc = vos_obj_hold(cont, oid, &epr, bound, hold_flags, DAOS_INTENT_PUNCH, &obj, ts_set); + if (rc != 0) + goto reset; + rc = vos_tx_begin(dth, vos_cont2umm(cont), cont->vc_pool->vp_sysdb); if (rc != 0) goto reset; @@ -510,11 +515,8 @@ vos_obj_punch(daos_handle_t coh, daos_unit_oid_t oid, daos_epoch_t epoch, D_FREE(daes); } - hold_flags = (flags & VOS_OF_COND_PUNCH) ? 0 : VOS_OBJ_CREATE; - hold_flags |= VOS_OBJ_VISIBLE; /* NB: punch always generate a new incarnation of the object */ - rc = vos_obj_hold(vos_obj_cache_current(cont->vc_pool->vp_sysdb), vos_hdl2cont(coh), - oid, &epr, bound, hold_flags, DAOS_INTENT_PUNCH, &obj, ts_set); + rc = vos_obj_incarnate(obj, &epr, bound, hold_flags, DAOS_INTENT_PUNCH, ts_set); if (rc == 0) { if (dkey) { /* key punch */ rc = key_punch(obj, epr.epr_hi, bound, pm_ver, dkey, @@ -528,22 +530,17 @@ vos_obj_punch(daos_handle_t coh, daos_unit_oid_t oid, daos_epoch_t epoch, if (punch_obj) rc = obj_punch(coh, obj, epr.epr_hi, bound, flags, ts_set); - if (obj != NULL) { - if (rc == 0 && epr.epr_hi > obj->obj_df->vo_max_write) { - rc = umem_tx_xadd_ptr( - vos_cont2umm(cont), &obj->obj_df->vo_max_write, - sizeof(obj->obj_df->vo_max_write), UMEM_XADD_NO_SNAPSHOT); - if (rc == 0) - obj->obj_df->vo_max_write = epr.epr_hi; - } - + D_ASSERT(obj != NULL); + if (rc == 0 && epr.epr_hi > obj->obj_df->vo_max_write) { + rc = umem_tx_xadd_ptr(vos_cont2umm(cont), &obj->obj_df->vo_max_write, + sizeof(obj->obj_df->vo_max_write), UMEM_XADD_NO_SNAPSHOT); if (rc == 0) - rc = vos_mark_agg(cont, &obj->obj_df->vo_tree, - &cont->vc_cont_df->cd_obj_root, epoch); - - vos_obj_release(vos_obj_cache_current(cont->vc_pool->vp_sysdb), obj, 0, - 
rc != 0); + obj->obj_df->vo_max_write = epr.epr_hi; } + + if (rc == 0) + rc = vos_mark_agg(cont, &obj->obj_df->vo_tree, + &cont->vc_cont_df->cd_obj_root, epoch); } reset: @@ -587,6 +584,9 @@ vos_obj_punch(daos_handle_t coh, daos_unit_oid_t oid, daos_epoch_t epoch, false, rc != 0); } + if (obj != NULL) + vos_obj_release(obj, 0, rc != 0); + D_FREE(daes); D_FREE(dces); vos_ts_set_free(ts_set); @@ -606,7 +606,6 @@ vos_obj_key2anchor(daos_handle_t coh, daos_unit_oid_t oid, daos_key_t *dkey, dao { struct vos_container *cont; struct vos_krec_df *krec = NULL; - struct daos_lru_cache *occ; int rc; int flags = 0; struct vos_object *obj; @@ -618,9 +617,8 @@ vos_obj_key2anchor(daos_handle_t coh, daos_unit_oid_t oid, daos_key_t *dkey, dao D_ERROR("Container is not open\n"); return -DER_INVAL; } - occ = vos_obj_cache_current(cont->vc_pool->vp_sysdb); - rc = vos_obj_hold(occ, cont, oid, &epr, DAOS_EPOCH_MAX, 0, DAOS_INTENT_DEFAULT, &obj, NULL); + rc = vos_obj_hold(cont, oid, &epr, DAOS_EPOCH_MAX, 0, DAOS_INTENT_DEFAULT, &obj, NULL); if (rc != 0) { if (rc == -DER_NONEXIST) { daos_anchor_set_eof(anchor); @@ -677,7 +675,7 @@ vos_obj_key2anchor(daos_handle_t coh, daos_unit_oid_t oid, daos_key_t *dkey, dao key_tree_release(toh, (krec->kr_bmap & KREC_BF_EVT) != 0); out: - vos_obj_release(occ, obj, 0, false); + vos_obj_release(obj, 0, false); return rc; } @@ -686,14 +684,12 @@ static int vos_obj_delete_internal(daos_handle_t coh, daos_unit_oid_t oid, bool only_delete_entry) { struct vos_container *cont = vos_hdl2cont(coh); - struct daos_lru_cache *occ = vos_obj_cache_current(cont->vc_pool->vp_sysdb); struct umem_instance *umm = vos_cont2umm(cont); struct vos_object *obj; daos_epoch_range_t epr = {0, DAOS_EPOCH_MAX}; int rc; - rc = vos_obj_hold(occ, cont, oid, &epr, 0, VOS_OBJ_VISIBLE, - DAOS_INTENT_KILL, &obj, NULL); + rc = vos_obj_hold(cont, oid, &epr, 0, VOS_OBJ_VISIBLE, DAOS_INTENT_KILL, &obj, NULL); if (rc == -DER_NONEXIST) return 0; @@ -702,6 +698,13 @@ 
vos_obj_delete_internal(daos_handle_t coh, daos_unit_oid_t oid, bool only_delete return rc; } + if (!daos_lru_is_last_user(&obj->obj_llink)) { + rc = -DER_BUSY; + goto out; + } + /* no one else can hold it */ + obj->obj_zombie = true; + rc = umem_tx_begin(umm, NULL); if (rc) goto out; @@ -713,7 +716,7 @@ vos_obj_delete_internal(daos_handle_t coh, daos_unit_oid_t oid, bool only_delete rc = umem_tx_end(umm, rc); out: - vos_obj_release(occ, obj, 0, true); + vos_obj_release(obj, 0, rc == 0); return rc; } @@ -738,7 +741,6 @@ vos_obj_del_key(daos_handle_t coh, daos_unit_oid_t oid, daos_key_t *dkey, daos_key_t *akey) { struct vos_container *cont = vos_hdl2cont(coh); - struct daos_lru_cache *occ = vos_obj_cache_current(cont->vc_pool->vp_sysdb); struct umem_instance *umm = vos_cont2umm(cont); struct vos_object *obj; daos_key_t *key; @@ -746,8 +748,7 @@ vos_obj_del_key(daos_handle_t coh, daos_unit_oid_t oid, daos_key_t *dkey, daos_handle_t toh; int rc; - rc = vos_obj_hold(occ, cont, oid, &epr, 0, VOS_OBJ_VISIBLE | VOS_OBJ_KILL_DKEY, - DAOS_INTENT_KILL, &obj, NULL); + rc = vos_obj_hold(cont, oid, &epr, 0, VOS_OBJ_VISIBLE, DAOS_INTENT_KILL, &obj, NULL); if (rc == -DER_NONEXIST) return 0; @@ -794,7 +795,7 @@ vos_obj_del_key(daos_handle_t coh, daos_unit_oid_t oid, daos_key_t *dkey, out_tx: rc = umem_tx_end(umm, rc); out: - vos_obj_release(occ, obj, 0, true); + vos_obj_release(obj, 0, true); return rc; } @@ -1738,9 +1739,7 @@ vos_obj_iter_prep(vos_iter_type_t type, vos_iter_param_t *param, * the object/key if it's punched more than once. However, rebuild * system should guarantee this will never happen. */ - rc = vos_obj_hold(vos_obj_cache_current(is_sysdb), cont, - param->ip_oid, &oiter->it_epr, - oiter->it_iter.it_bound, + rc = vos_obj_hold(cont, param->ip_oid, &oiter->it_epr, oiter->it_iter.it_bound, (oiter->it_flags & VOS_IT_PUNCHED) ? 
0 : VOS_OBJ_VISIBLE, vos_iter_intent(&oiter->it_iter), &oiter->it_obj, ts_set); @@ -1896,8 +1895,7 @@ dkey_nested_iter_init(struct vos_obj_iter *oiter, struct vos_iter_info *info) * the object/key if it's punched more than once. However, rebuild * system should guarantee this will never happen. */ - rc = vos_obj_hold(vos_obj_cache_current(cont->vc_pool->vp_sysdb), cont, info->ii_oid, - &info->ii_epr, oiter->it_iter.it_bound, flags, + rc = vos_obj_hold(cont, info->ii_oid, &info->ii_epr, oiter->it_iter.it_bound, flags, vos_iter_intent(&oiter->it_iter), &oiter->it_obj, NULL); D_ASSERTF(rc != -DER_NONEXIST, @@ -1925,8 +1923,7 @@ dkey_nested_iter_init(struct vos_obj_iter *oiter, struct vos_iter_info *info) return 0; failed: - vos_obj_release(vos_obj_cache_current(cont->vc_pool->vp_sysdb), oiter->it_obj, flags, - false); + vos_obj_release(oiter->it_obj, flags, false); return rc; } @@ -2231,8 +2228,7 @@ vos_obj_iter_fini(struct vos_iterator *iter) else if (iter->it_for_agg) flags = VOS_OBJ_AGGREGATE; } - vos_obj_release(vos_obj_cache_current(object->obj_cont->vc_pool->vp_sysdb), object, - flags, false); + vos_obj_release(object, flags, false); } vos_ilog_fetch_finish(&oiter->it_ilog_info); diff --git a/src/vos/vos_obj.h b/src/vos/vos_obj.h index c687cd77f9c..2ccc8d71988 100644 --- a/src/vos/vos_obj.h +++ b/src/vos/vos_obj.h @@ -64,21 +64,18 @@ enum { VOS_OBJ_DISCARD = (1 << 2), /** Hold for VOS or EC aggregation */ VOS_OBJ_AGGREGATE = (1 << 3), - /** Hold the object for delete dkey */ - VOS_OBJ_KILL_DKEY = (1 << 4), - /** Don't actually complete the hold, just check for conflicts */ - VOS_OBJ_NO_HOLD = (1 << 5), }; /** * Find an object in the cache \a occ and take its reference. If the object is - * not in cache, this function will load it from PMEM pool or create it, then - * add it to the cache. + * not in cache, this function will load it from PMEM pool, then add it to the + * cache. 
If the object doesn't exist on PMEM pool, a negative cache entry will + * be returned when VOS_OBJ_CREATE flag is specified, otherwise, -DER_NONEXIST + * will be returned. This function should be called outside of local transaction. * - * \param occ [IN] Object cache, can be per cpu * \param cont [IN] Open container. * \param oid [IN] VOS object ID. - * \param epr [IN,OUT] Epoch range. High epoch should be set + * \param epr [IN] Epoch range. High epoch should be set * to requested epoch. The lower epoch * can be 0 or bounded. * \param bound [IN] Epoch uncertainty bound @@ -96,9 +93,8 @@ enum { * other Another error occurred */ int -vos_obj_hold(struct daos_lru_cache *occ, struct vos_container *cont, - daos_unit_oid_t oid, daos_epoch_range_t *epr, daos_epoch_t bound, - uint64_t flags, uint32_t intent, struct vos_object **obj_p, +vos_obj_hold(struct vos_container *cont, daos_unit_oid_t oid, daos_epoch_range_t *epr, + daos_epoch_t bound, uint64_t flags, uint32_t intent, struct vos_object **obj_p, struct vos_ts_set *ts_set); /** @@ -107,13 +103,12 @@ vos_obj_hold(struct daos_lru_cache *occ, struct vos_container *cont, * \param obj [IN] Reference to be released. */ void -vos_obj_release(struct daos_lru_cache *occ, struct vos_object *obj, uint64_t flags, bool evict); +vos_obj_release(struct vos_object *obj, uint64_t flags, bool evict); /** Evict an object reference from the cache */ -void vos_obj_evict(struct daos_lru_cache *occ, struct vos_object *obj); +void vos_obj_evict(struct vos_object *obj); -int vos_obj_evict_by_oid(struct daos_lru_cache *occ, struct vos_container *cont, - daos_unit_oid_t oid); +int vos_obj_evict_by_oid(struct vos_container *cont, daos_unit_oid_t oid); /** * Create an object cache. @@ -133,13 +128,7 @@ void vos_obj_cache_destroy(struct daos_lru_cache *occ); /** evict cached objects for the specified container */ -void vos_obj_cache_evict(struct daos_lru_cache *occ, - struct vos_container *cont); - -/** - * Return object cache for the current IO. 
- */ -struct daos_lru_cache *vos_obj_cache_current(bool standalone); +void vos_obj_cache_evict(struct vos_container *cont); /** * Object Index API and handles @@ -200,6 +189,23 @@ int vos_oi_find(struct vos_container *cont, daos_unit_oid_t oid, struct vos_obj_df **obj, struct vos_ts_set *ts_set); +/** + * Create a new object for the @oid and return the direct pointer of the + * new allocated object. + * + * \param cont [IN] Open container + * \param oid [IN] DAOS object ID + * \param epoch [IN] Epoch for the lookup + * \param obj [OUT] Direct pointer to VOS object + * \param ts_set[IN] Timestamp sets + * + * \return 0 on success and negative on + * failure + */ +int +vos_oi_alloc(struct vos_container *cont, daos_unit_oid_t oid, daos_epoch_t epoch, + struct vos_obj_df **obj_p, struct vos_ts_set *ts_set); + /** * Punch an object from the OI table */ @@ -214,28 +220,45 @@ vos_oi_punch(struct vos_container *cont, daos_unit_oid_t oid, int vos_oi_delete(struct vos_container *cont, daos_unit_oid_t oid, bool only_delete_entry); -/** Hold object for range discard +/** + * When the passed in 'obj' is a negative cache entry, create object on PMEM pool + * and associate it with the negative entry, otherwise, perform some update check + * only. This function should be called within local transaction. * - * \param[in] occ Object cache, can be per cpu - * \param[in] cont Open container - * \param[in] oid The object id - * \param[out] objp Returned object + * \param obj [IN] Object cache entry + * \param epr [IN] Epoch range. High epoch should be set + * to requested epoch. The lower epoch + * can be 0 or bounded. + * \param bound [IN] Epoch uncertainty bound + * \param flags [IN] Object flags + * \param intent [IN] The request intent. + * \param ts_set [IN] Timestamp set * - * \return -DER_NONEXIST object doesn't exist - * -DER_BUSY Object is already in discard - * -DER_AGAIN Object is being destroyed - * 0 Success + * \return 0 Object is successfully incarnated. 
+ * \return -DER_NONEXIST The conditions for success don't apply + * -DER_INPROGRESS The local target doesn't have the + * definitive state of the object. + * other Another error occurred */ int -vos_obj_discard_hold(struct daos_lru_cache *occ, struct vos_container *cont, daos_unit_oid_t oid, - struct vos_object **objp); +vos_obj_incarnate(struct vos_object *obj, daos_epoch_range_t *epr, daos_epoch_t bound, + uint64_t flags, uint32_t intent, struct vos_ts_set *ts_set); -/** Release object held for range discard +/** + * Check if an operation will be conflicting with other ongoing operations over the + * same object. (aggregate, discard, obj discard, update, etc.) + * + * FIXME: This function could be deprecated once the obj discard is deprecated. * - * \param[in] occ Object cache, can be per cpu - * \param[in] obj Object to release + * \param cont [IN] VOS container + * \param oid [IN] VOS object ID. + * \param flags [IN] Object flags for the operation to be checked + * + * \return 0 No conflicting operations + * \return -DER_BUSY Read is conflicting with ongoing operations + * -DER_UPDATE_AGAIN Write is conflicting with ongoing operations */ -void -vos_obj_discard_release(struct daos_lru_cache *occ, struct vos_object *obj); +int +vos_obj_check_discard(struct vos_container *cont, daos_unit_oid_t oid, uint64_t flags); #endif diff --git a/src/vos/vos_obj_cache.c b/src/vos/vos_obj_cache.c index e1f99c10e22..8845eae0085 100644 --- a/src/vos/vos_obj_cache.c +++ b/src/vos/vos_obj_cache.c @@ -189,42 +189,94 @@ obj_cache_evict_cond(struct daos_llink *llink, void *args) } void -vos_obj_cache_evict(struct daos_lru_cache *cache, struct vos_container *cont) +vos_obj_cache_evict(struct vos_container *cont) { - daos_lru_cache_evict(cache, obj_cache_evict_cond, cont); + struct daos_lru_cache *occ; + + occ = vos_obj_cache_get(cont->vc_pool->vp_sysdb); + D_ASSERT(occ != NULL); + + daos_lru_cache_evict(occ, obj_cache_evict_cond, cont); } -/** - * Return object cache for the current 
IO. - */ -struct daos_lru_cache * -vos_obj_cache_current(bool standalone) +static __thread struct vos_object obj_local = {0}; + +static inline void +obj_put(struct daos_lru_cache *occ, struct vos_object *obj, bool evict) { - return vos_obj_cache_get(standalone); + if (evict) + daos_lru_ref_evict(occ, &obj->obj_llink); + daos_lru_ref_release(occ, &obj->obj_llink); } -static __thread struct vos_object obj_local = {0}; +static int +obj_get(struct daos_lru_cache *occ, struct vos_container *cont, daos_unit_oid_t oid, + bool create, struct vos_object **obj_p) +{ + struct vos_object *obj; + struct daos_llink *lret; + struct obj_lru_key lkey; + int rc; + void *create_flag; -void -vos_obj_release(struct daos_lru_cache *occ, struct vos_object *obj, uint64_t flags, bool evict) + if (cont->vc_pool->vp_dying) + D_GOTO(out, rc = -DER_SHUTDOWN); + + create_flag = create ? cont : NULL; + lkey.olk_cont = cont; + lkey.olk_oid = oid; + + rc = daos_lru_ref_hold(occ, &lkey, sizeof(lkey), create_flag, &lret); + if (rc == 0) { + obj = container_of(lret, struct vos_object, obj_llink); + *obj_p = obj; + return 0; + } +out: + if (rc == -DER_NONEXIST) { + D_ASSERT(create_flag == NULL); + D_DEBUG(DB_TRACE, DF_CONT": Object "DF_UOID" doesn't exist.\n", + DP_CONT(cont->vc_pool->vp_id, cont->vc_id), DP_UOID(oid)); + } else if (rc) { + DL_ERROR(rc, DF_CONT": Failed to find object "DF_UOID".", + DP_CONT(cont->vc_pool->vp_id, cont->vc_id), DP_UOID(oid)); + } + + return rc; +} + +static inline void +obj_release(struct daos_lru_cache *occ, struct vos_object *obj, bool evict) { + D_ASSERT(obj != NULL); + /* TODO: Unpin the object in md-on-ssd phase II */ + if (obj == &obj_local) { clean_object(obj); memset(obj, 0, sizeof(*obj)); return; } - D_ASSERT((occ != NULL) && (obj != NULL)); + obj_put(occ, obj, evict); +} + +void +vos_obj_release(struct vos_object *obj, uint64_t flags, bool evict) +{ + struct daos_lru_cache *occ; + + D_ASSERT(obj != &obj_local); + + occ = 
vos_obj_cache_get(obj->obj_cont->vc_pool->vp_sysdb); + D_ASSERT(occ != NULL); + if (flags & VOS_OBJ_AGGREGATE) obj->obj_aggregate = 0; else if (flags & VOS_OBJ_DISCARD) obj->obj_discard = 0; - if (evict) - daos_lru_ref_evict(occ, &obj->obj_llink); - - daos_lru_ref_release(occ, &obj->obj_llink); + obj_release(occ, obj, evict); } /** Move local object to the lru cache */ @@ -232,27 +284,15 @@ static inline int cache_object(struct daos_lru_cache *occ, struct vos_object **objp) { struct vos_object *obj_new; - struct daos_llink *lret; - struct obj_lru_key lkey; int rc; - *objp = NULL; - D_ASSERT(obj_local.obj_cont != NULL); - - lkey.olk_cont = obj_local.obj_cont; - lkey.olk_oid = obj_local.obj_id; - - rc = daos_lru_ref_hold(occ, &lkey, sizeof(lkey), obj_local.obj_cont, &lret); - if (rc != 0) { - clean_object(&obj_local); - memset(&obj_local, 0, sizeof(obj_local)); + rc = obj_get(occ, obj_local.obj_cont, obj_local.obj_id, true, &obj_new); + if (rc != 0) return rc; /* Can't cache new object */ - } - /** Object is in cache */ - obj_new = container_of(lret, struct vos_object, obj_llink); /* This object should not be cached */ + D_ASSERT(obj_new != NULL); D_ASSERT(obj_new->obj_df == NULL); vos_ilog_fetch_move(&obj_new->obj_ilog_info, &obj_local.obj_ilog_info); @@ -263,6 +303,7 @@ cache_object(struct daos_lru_cache *occ, struct vos_object **objp) obj_new->obj_zombie = obj_local.obj_zombie; obj_local.obj_toh = DAOS_HDL_INVAL; obj_local.obj_ih = DAOS_HDL_INVAL; + clean_object(&obj_local); memset(&obj_local, 0, sizeof(obj_local)); @@ -272,10 +313,11 @@ cache_object(struct daos_lru_cache *occ, struct vos_object **objp) } static bool -vos_obj_op_conflict(struct vos_object *obj, uint64_t flags, uint32_t intent, bool create) +check_discard(struct vos_object *obj, uint64_t flags) { bool discard = flags & VOS_OBJ_DISCARD; bool agg = flags & VOS_OBJ_AGGREGATE; + bool create = flags & VOS_OBJ_CREATE; /* VOS aggregation is mutually exclusive with VOS discard. 
* Object discard is mutually exclusive with VOS discard. @@ -303,232 +345,233 @@ vos_obj_op_conflict(struct vos_object *obj, uint64_t flags, uint32_t intent, boo } int -vos_obj_hold(struct daos_lru_cache *occ, struct vos_container *cont, - daos_unit_oid_t oid, daos_epoch_range_t *epr, daos_epoch_t bound, - uint64_t flags, uint32_t intent, struct vos_object **obj_p, - struct vos_ts_set *ts_set) +vos_obj_check_discard(struct vos_container *cont, daos_unit_oid_t oid, uint64_t flags) { struct vos_object *obj; - struct daos_llink *lret; - struct obj_lru_key lkey; - int rc = 0; - int tmprc; - uint32_t cond_mask = 0; - bool create; - void *create_flag = NULL; - bool visible_only; + struct daos_lru_cache *occ; + int rc; D_ASSERT(cont != NULL); D_ASSERT(cont->vc_pool); - if (obj_p != NULL) - *obj_p = NULL; + occ = vos_obj_cache_get(cont->vc_pool->vp_sysdb); + D_ASSERT(occ != NULL); - if (cont->vc_pool->vp_dying) - return -DER_SHUTDOWN; + rc = obj_get(occ, cont, oid, false, &obj); + if (rc == -DER_NONEXIST) + return 0; + if (rc) + return rc; - create = flags & VOS_OBJ_CREATE; - visible_only = flags & VOS_OBJ_VISIBLE; - /** Pass NULL as the create_args if we are not creating the object so we avoid - * evicting an entry until we need to - */ - if (create) - create_flag = cont; + /* TODO: Pin object in memory */ - D_DEBUG(DB_TRACE, "Try to hold cont="DF_UUID", obj="DF_UOID - " layout %u create=%s epr="DF_X64"-"DF_X64"\n", - DP_UUID(cont->vc_id), DP_UOID(oid), oid.id_layout_ver, - create ? "true" : "false", epr->epr_lo, epr->epr_hi); + if (check_discard(obj, flags)) + /* Update request will retry with this error */ + rc = (flags & VOS_OBJ_CREATE) ? 
-DER_UPDATE_AGAIN : -DER_BUSY; - /* Create the key for obj cache */ - lkey.olk_cont = cont; - lkey.olk_oid = oid; + obj_release(occ, obj, false); + return rc; +} - rc = daos_lru_ref_hold(occ, &lkey, sizeof(lkey), create_flag, &lret); - if (rc == -DER_NONEXIST) { - D_ASSERT(obj_local.obj_cont == NULL); - if (flags & VOS_OBJ_NO_HOLD) { - /** Object is not cached, so there can be no other holders */ - return 0; +int +vos_obj_incarnate(struct vos_object *obj, daos_epoch_range_t *epr, daos_epoch_t bound, + uint64_t flags, uint32_t intent, struct vos_ts_set *ts_set) +{ + struct vos_container *cont = obj->obj_cont; + uint32_t cond_mask = 0; + int rc; + + D_ASSERT((flags & (VOS_OBJ_AGGREGATE | VOS_OBJ_DISCARD)) == 0); + D_ASSERT(intent == DAOS_INTENT_PUNCH || intent == DAOS_INTENT_UPDATE); + + if (obj->obj_df == NULL) { + rc = vos_oi_alloc(cont, obj->obj_id, epr->epr_hi, &obj->obj_df, ts_set); + if (rc) { + DL_ERROR(rc, DF_CONT": Failed to allocate OI "DF_UOID".", + DP_CONT(cont->vc_pool->vp_id, cont->vc_id), + DP_UOID(obj->obj_id)); + return rc; } - obj = &obj_local; - init_object(obj, oid, cont); - } else if (rc != 0) { - D_GOTO(failed_2, rc); + D_ASSERT(obj->obj_df); } else { - /** Object is in cache */ - obj = container_of(lret, struct vos_object, obj_llink); + vos_ilog_ts_ignore(vos_obj2umm(obj), &obj->obj_df->vo_ilog); } - if (obj->obj_zombie) - D_GOTO(failed, rc = -DER_AGAIN); + /* Check again since it could yield since vos_obj_hold() */ + if (check_discard(obj, flags)) + return -DER_UPDATE_AGAIN; - if (intent == DAOS_INTENT_KILL && !(flags & VOS_OBJ_KILL_DKEY)) { - if (obj != &obj_local) { - if (!daos_lru_is_last_user(&obj->obj_llink)) - D_GOTO(failed, rc = -DER_BUSY); + /* Check the sync epoch */ + if (epr->epr_hi <= obj->obj_sync_epoch && + vos_dth_get(obj->obj_cont->vc_pool->vp_sysdb) != NULL) { + /* If someone has synced the object against the + * obj->obj_sync_epoch, then we do not allow to modify the + * object with old epoch. 
Let's ask the caller to retry with + * newer epoch. + * + * For rebuild case, the @dth will be NULL. + */ + D_ASSERT(obj->obj_sync_epoch > 0); - vos_obj_evict(occ, obj); - } - /* no one else can hold it */ - obj->obj_zombie = true; - if (obj->obj_df) - goto out; /* Ok to delete */ + D_INFO("Refuse %s obj "DF_UOID" because of the epoch "DF_U64 + " is not newer than the sync epoch "DF_U64"\n", + intent == DAOS_INTENT_PUNCH ? "punch" : "update", + DP_UOID(obj->obj_id), epr->epr_hi, obj->obj_sync_epoch); + return -DER_TX_RESTART; } - if (obj->obj_df) { - D_DEBUG(DB_TRACE, "looking up object ilog"); - if (create || intent == DAOS_INTENT_PUNCH) - vos_ilog_ts_ignore(vos_obj2umm(obj), - &obj->obj_df->vo_ilog); - tmprc = vos_ilog_ts_add(ts_set, &obj->obj_df->vo_ilog, - &oid, sizeof(oid)); - D_ASSERT(tmprc == 0); /* Non-zero only valid for akey */ - goto check_object; - } + /* It's done for DAOS_INTENT_PUNCH case */ + if (intent == DAOS_INTENT_PUNCH) + return 0; - /* newly cached object */ - D_DEBUG(DB_TRACE, "%s Got empty obj "DF_UOID" epr="DF_X64"-"DF_X64"\n", - create ? "find/create" : "find", DP_UOID(oid), epr->epr_lo, - epr->epr_hi); + /** If it's a conditional update, we need to preserve the -DER_NONEXIST + * for the caller. 
+ */ + if (ts_set && ts_set->ts_flags & VOS_COND_UPDATE_OP_MASK) + cond_mask = VOS_ILOG_COND_UPDATE; + rc = vos_ilog_update(cont, &obj->obj_df->vo_ilog, epr, bound, NULL, + &obj->obj_ilog_info, cond_mask, ts_set); + if (rc == -DER_NONEXIST && !cond_mask) + rc = 0; - obj->obj_sync_epoch = 0; - if (!create) { - rc = vos_oi_find(cont, oid, &obj->obj_df, ts_set); - if (rc == -DER_NONEXIST) { - D_DEBUG(DB_TRACE, "non exist oid "DF_UOID"\n", - DP_UOID(oid)); - goto failed; - } - } else { + if (rc != 0) + VOS_TX_LOG_FAIL(rc, "Could not update object "DF_UOID" at "DF_U64": "DF_RC"\n", + DP_UOID(obj->obj_id), epr->epr_hi, DP_RC(rc)); + return rc; +} - rc = vos_oi_find_alloc(cont, oid, epr->epr_hi, false, - &obj->obj_df, ts_set); - D_ASSERT(rc || obj->obj_df); - } +int +vos_obj_hold(struct vos_container *cont, daos_unit_oid_t oid, daos_epoch_range_t *epr, + daos_epoch_t bound, uint64_t flags, uint32_t intent, struct vos_object **obj_p, + struct vos_ts_set *ts_set) +{ + struct vos_object *obj; + struct daos_lru_cache *occ; + int rc, tmprc; + bool create = false; - if (rc != 0) - goto failed; + D_ASSERT(cont != NULL); + D_ASSERT(cont->vc_pool); + D_ASSERT(obj_p != NULL); + *obj_p = NULL; - if (!obj->obj_df) { - D_DEBUG(DB_TRACE, "nonexistent obj "DF_UOID"\n", - DP_UOID(oid)); - D_GOTO(failed, rc = -DER_NONEXIST); - } + occ = vos_obj_cache_get(cont->vc_pool->vp_sysdb); + D_ASSERT(occ != NULL); -check_object: - if (vos_obj_op_conflict(obj, flags, intent, create)) { - /** Cleanup so unit test that triggers doesn't corrupt the state */ - vos_obj_release(occ, obj, 0, false); - /* Update request will retry with this error */ - if (create) - rc = -DER_UPDATE_AGAIN; - else - rc = -DER_BUSY; - goto failed_2; + if (flags & VOS_OBJ_CREATE) { + D_ASSERT(intent == DAOS_INTENT_UPDATE || intent == DAOS_INTENT_PUNCH); + create = true; } - if (flags & VOS_OBJ_NO_HOLD) { - /** Just checking for conflicts, so we are done */ - vos_obj_release(occ, obj, 0, false); - return 0; + D_DEBUG(DB_TRACE, 
"Try to hold cont="DF_UUID", obj="DF_UOID" " + "layout=%u create=%d epr="DF_X64"-"DF_X64"\n", DP_UUID(cont->vc_id), + DP_UOID(oid), oid.id_layout_ver, create, epr->epr_lo, epr->epr_hi); + + /* Lookup object cache */ + rc = obj_get(occ, cont, oid, create, &obj); + if (rc == -DER_NONEXIST) { + D_ASSERT(obj_local.obj_cont == NULL); + obj = &obj_local; + init_object(obj, oid, cont); + rc = 0; + } else if (rc) { + D_GOTO(fail_log, rc); } - D_ASSERT(obj_p != NULL); + if (obj->obj_zombie) + D_GOTO(failed, rc = -DER_AGAIN); - if ((flags & VOS_OBJ_DISCARD) || intent == DAOS_INTENT_KILL || intent == DAOS_INTENT_PUNCH) - goto out; + if (check_discard(obj, flags)) { + /** Cleanup so unit test that triggers doesn't corrupt the state */ + obj_release(occ, obj, false); + /* Update request will retry with this error */ + rc = create ? -DER_UPDATE_AGAIN : -DER_BUSY; + goto fail_log; + } - if (!create) { - rc = vos_ilog_fetch(vos_cont2umm(cont), vos_cont2hdl(cont), - intent, &obj->obj_df->vo_ilog, epr->epr_hi, - bound, false, /* has_cond: no object level condition. 
*/ - NULL, NULL, &obj->obj_ilog_info); - if (rc != 0) { - if (vos_has_uncertainty(ts_set, &obj->obj_ilog_info, - epr->epr_hi, bound)) - rc = -DER_TX_RESTART; - D_DEBUG(DB_TRACE, "Object "DF_UOID" not found at " - DF_U64"\n", DP_UOID(oid), epr->epr_hi); + /* Lookup OI table if the cached object is negative */ + if (obj->obj_df == NULL) { + obj->obj_sync_epoch = 0; + rc = vos_oi_find(cont, oid, &obj->obj_df, ts_set); + if (rc == 0) { + obj->obj_sync_epoch = obj->obj_df->vo_sync; + } else if (rc == -DER_NONEXIST) { + if (!create) + goto failed; + rc = 0; + } else if (rc) { goto failed; } + } else { + tmprc = vos_ilog_ts_add(ts_set, &obj->obj_df->vo_ilog, &oid, sizeof(oid)); + D_ASSERT(tmprc == 0); /* Non-zero only valid for akey */ + } - rc = vos_ilog_check(&obj->obj_ilog_info, epr, epr, - visible_only); - if (rc != 0) { - D_DEBUG(DB_TRACE, "Object "DF_UOID" not visible at " - DF_U64"-"DF_U64"\n", DP_UOID(oid), epr->epr_lo, - epr->epr_hi); - if (!vos_has_uncertainty(ts_set, &obj->obj_ilog_info, - epr->epr_hi, bound)) + /* TODO: Pin the object in memory in md-on-ssd phase II. Revise the 'obj_local' implementation + * then, since this function could yield. */ + + /* It's done for DAOS_INTENT_UPDATE or DAOS_INTENT_PUNCH or DAOS_INTENT_KILL */ + if (intent == DAOS_INTENT_UPDATE || intent == DAOS_INTENT_PUNCH || + intent == DAOS_INTENT_KILL) { + D_ASSERT((flags & (VOS_OBJ_AGGREGATE | VOS_OBJ_DISCARD)) == 0); + if (obj == &obj_local) { + D_ASSERT(create == false); + rc = cache_object(occ, &obj); + if (rc != 0) goto failed; - - /** If the creation is uncertain, go ahead and fall - * through as if the object exists so we can do - * actual uncertainty check. - */ } - goto out; + *obj_p = obj; + return 0; } + D_ASSERT(obj->obj_df != NULL); - /** If it's a conditional update, we need to preserve the -DER_NONEXIST - * for the caller. 
- */ - if (ts_set && ts_set->ts_flags & VOS_COND_UPDATE_OP_MASK) - cond_mask = VOS_ILOG_COND_UPDATE; - rc = vos_ilog_update(cont, &obj->obj_df->vo_ilog, epr, bound, NULL, - &obj->obj_ilog_info, cond_mask, ts_set); - if (rc == -DER_TX_RESTART) - goto failed; - if (rc == -DER_NONEXIST && !cond_mask) + /* It's done for obj discard */ + if ((flags & VOS_OBJ_DISCARD)) goto out; + + /* Object ilog check */ + rc = vos_ilog_fetch(vos_cont2umm(cont), vos_cont2hdl(cont), intent, &obj->obj_df->vo_ilog, + epr->epr_hi, bound, false, /* has_cond: no object level condition. */ + NULL, NULL, &obj->obj_ilog_info); if (rc != 0) { - VOS_TX_LOG_FAIL(rc, "Could not update object "DF_UOID" at " - DF_U64 ": "DF_RC"\n", DP_UOID(oid), epr->epr_hi, - DP_RC(rc)); + if (vos_has_uncertainty(ts_set, &obj->obj_ilog_info, epr->epr_hi, bound)) + rc = -DER_TX_RESTART; + D_DEBUG(DB_TRACE, "Object "DF_UOID" not found at " DF_U64"\n", + DP_UOID(oid), epr->epr_hi); goto failed; } -out: - if (obj->obj_df != NULL) - obj->obj_sync_epoch = obj->obj_df->vo_sync; + rc = vos_ilog_check(&obj->obj_ilog_info, epr, epr, flags & VOS_OBJ_VISIBLE); + if (rc != 0) { + D_DEBUG(DB_TRACE, "Object "DF_UOID" not visible at "DF_U64"-"DF_U64"\n", + DP_UOID(oid), epr->epr_lo, epr->epr_hi); + if (!vos_has_uncertainty(ts_set, &obj->obj_ilog_info, epr->epr_hi, bound)) + goto failed; - if (obj->obj_df != NULL && epr->epr_hi <= obj->obj_sync_epoch && - vos_dth_get(obj->obj_cont->vc_pool->vp_sysdb) != NULL && - (intent == DAOS_INTENT_PUNCH || intent == DAOS_INTENT_UPDATE)) { - /* If someone has synced the object against the - * obj->obj_sync_epoch, then we do not allow to modify the - * object with old epoch. Let's ask the caller to retry with - * newer epoch. - * - * For rebuild case, the @dth will be NULL. + /** If the creation is uncertain, go ahead and fall + * through as if the object exists so we can do + * actual uncertainty check. 
*/ - D_ASSERT(obj->obj_sync_epoch > 0); - - D_INFO("Refuse %s obj "DF_UOID" because of the epoch "DF_U64 - " is not newer than the sync epoch "DF_U64"\n", - intent == DAOS_INTENT_PUNCH ? "punch" : "update", - DP_UOID(oid), epr->epr_hi, obj->obj_sync_epoch); - D_GOTO(failed, rc = -DER_TX_RESTART); } - +out: if (obj == &obj_local) { /** Ok, it's successful, go ahead and cache the object. */ rc = cache_object(occ, &obj); if (rc != 0) - goto failed_2; + goto failed; } + *obj_p = obj; if (flags & VOS_OBJ_AGGREGATE) obj->obj_aggregate = 1; else if (flags & VOS_OBJ_DISCARD) obj->obj_discard = 1; - *obj_p = obj; return 0; + failed: - vos_obj_release(occ, obj, 0, true); -failed_2: + obj_release(occ, obj, true); +fail_log: VOS_TX_LOG_FAIL(rc, "failed to hold object " DF_UOID ", rc=" DF_RC "\n", DP_UOID(oid), DP_RC(rc)); @@ -536,29 +579,32 @@ vos_obj_hold(struct daos_lru_cache *occ, struct vos_container *cont, } void -vos_obj_evict(struct daos_lru_cache *occ, struct vos_object *obj) +vos_obj_evict(struct vos_object *obj) { - if (obj == &obj_local) - return; + struct daos_lru_cache *occ; + + D_ASSERT(obj != &obj_local); + occ = vos_obj_cache_get(obj->obj_cont->vc_pool->vp_sysdb); + D_ASSERT(occ != NULL); + daos_lru_ref_evict(occ, &obj->obj_llink); } int -vos_obj_evict_by_oid(struct daos_lru_cache *occ, struct vos_container *cont, - daos_unit_oid_t oid) +vos_obj_evict_by_oid(struct vos_container *cont, daos_unit_oid_t oid) { - struct obj_lru_key lkey; - struct daos_llink *lret; + struct daos_lru_cache *occ; + struct vos_object *obj; int rc; - lkey.olk_cont = cont; - lkey.olk_oid = oid; + occ = vos_obj_cache_get(cont->vc_pool->vp_sysdb); + D_ASSERT(occ != NULL); - rc = daos_lru_ref_hold(occ, &lkey, sizeof(lkey), NULL, &lret); + rc = obj_get(occ, cont, oid, false, &obj); if (rc == 0) { - daos_lru_ref_evict(occ, lret); - daos_lru_ref_release(occ, lret); + obj_put(occ, obj, true); + return 0; } - return rc == -DER_NONEXIST ? 
0 : rc; + return (rc == -DER_NONEXIST || rc == -DER_SHUTDOWN)? 0 : rc; } diff --git a/src/vos/vos_obj_index.c b/src/vos/vos_obj_index.c index ea47cf4454c..72870cfd76e 100644 --- a/src/vos/vos_obj_index.c +++ b/src/vos/vos_obj_index.c @@ -241,33 +241,17 @@ vos_oi_find(struct vos_container *cont, daos_unit_oid_t oid, } /** - * Locate a durable object in OI table, or create it if it's not found + * Create a durable object in OI table. */ int -vos_oi_find_alloc(struct vos_container *cont, daos_unit_oid_t oid, - daos_epoch_t epoch, bool log, struct vos_obj_df **obj_p, - struct vos_ts_set *ts_set) +vos_oi_alloc(struct vos_container *cont, daos_unit_oid_t oid, daos_epoch_t epoch, + struct vos_obj_df **obj_p, struct vos_ts_set *ts_set) { - struct dtx_handle *dth = vos_dth_get(cont->vc_pool->vp_sysdb); struct vos_obj_df *obj = NULL; - d_iov_t key_iov; - d_iov_t val_iov; - daos_handle_t loh; - struct ilog_desc_cbs cbs; + d_iov_t key_iov, val_iov; int rc; - D_DEBUG(DB_TRACE, "Lookup obj "DF_UOID" in the OI table.\n", - DP_UOID(oid)); - - rc = vos_oi_find(cont, oid, &obj, ts_set); - if (rc == 0) - goto do_log; - if (rc != -DER_NONEXIST) - return rc; - - /* Object ID not found insert it to the OI tree */ - D_DEBUG(DB_TRACE, "Object "DF_UOID" not found adding it..\n", - DP_UOID(oid)); + D_DEBUG(DB_TRACE, "Adding object "DF_UOID"\n", DP_UOID(oid)); d_iov_set(&val_iov, NULL, 0); d_iov_set(&key_iov, &oid, sizeof(oid)); @@ -284,19 +268,46 @@ vos_oi_find_alloc(struct vos_container *cont, daos_unit_oid_t oid, vos_ilog_ts_ignore(vos_cont2umm(cont), &obj->vo_ilog); vos_ilog_ts_mark(ts_set, &obj->vo_ilog); -do_log: - if (!log) - goto skip_log; - vos_ilog_desc_cbs_init(&cbs, vos_cont2hdl(cont)); - rc = ilog_open(vos_cont2umm(cont), &obj->vo_ilog, &cbs, dth == NULL, &loh); - if (rc != 0) + *obj_p = obj; + + return 0; +} + +/** + * Locate a durable object in OI table, or create it if it's not found + */ +int +vos_oi_find_alloc(struct vos_container *cont, daos_unit_oid_t oid, + daos_epoch_t 
epoch, bool log, struct vos_obj_df **obj_p, + struct vos_ts_set *ts_set) +{ + struct dtx_handle *dth = vos_dth_get(cont->vc_pool->vp_sysdb); + struct vos_obj_df *obj = NULL; + daos_handle_t loh; + struct ilog_desc_cbs cbs; + int rc; + + D_DEBUG(DB_TRACE, "Lookup obj "DF_UOID" in the OI table.\n", DP_UOID(oid)); + + rc = vos_oi_find(cont, oid, &obj, ts_set); + if (rc == -DER_NONEXIST) { + rc = vos_oi_alloc(cont, oid, epoch, &obj, ts_set); + if (rc) + return rc; + } else if (rc) { return rc; + } - rc = ilog_update(loh, NULL, epoch, - dtx_is_valid_handle(dth) ? dth->dth_op_seq : 1, false); + if (log) { + vos_ilog_desc_cbs_init(&cbs, vos_cont2hdl(cont)); + rc = ilog_open(vos_cont2umm(cont), &obj->vo_ilog, &cbs, dth == NULL, &loh); + if (rc != 0) + return rc; - ilog_close(loh); -skip_log: + rc = ilog_update(loh, NULL, epoch, dtx_is_valid_handle(dth) ? dth->dth_op_seq : 1, + false); + ilog_close(loh); + } if (rc == 0) *obj_p = obj; @@ -824,8 +835,7 @@ oi_iter_check_punch(daos_handle_t ih) D_DEBUG(DB_IO, "Moving object "DF_UOID" to gc heap\n", DP_UOID(oid)); /* Evict the object from cache */ - rc = vos_obj_evict_by_oid(vos_obj_cache_current(oiter->oit_cont->vc_pool->vp_sysdb), - oiter->oit_cont, oid); + rc = vos_obj_evict_by_oid(oiter->oit_cont, oid); if (rc != 0) D_ERROR("Could not evict object "DF_UOID" "DF_RC"\n", DP_UOID(oid), DP_RC(rc)); @@ -867,9 +877,7 @@ oi_iter_aggregate(daos_handle_t ih, bool range_discard) obj = (struct vos_obj_df *)rec_iov.iov_buf; oid = obj->vo_id; - rc = vos_obj_hold(vos_obj_cache_current(cont->vc_pool->vp_sysdb), cont, oid, - &oiter->oit_epr, iter->it_bound, base_flag | VOS_OBJ_NO_HOLD, - DAOS_INTENT_PURGE, NULL, NULL); + rc = vos_obj_check_discard(cont, oid, base_flag); if (rc != 0) { /** -DER_BUSY means the object is in-use already. We will after a yield in this * case. 
@@ -897,8 +905,7 @@ oi_iter_aggregate(daos_handle_t ih, bool range_discard) */ /* Evict the object from cache */ - rc = vos_obj_evict_by_oid(vos_obj_cache_current(oiter->oit_cont->vc_pool->vp_sysdb), - oiter->oit_cont, oid); + rc = vos_obj_evict_by_oid(oiter->oit_cont, oid); if (rc != 0) D_ERROR("Could not evict object "DF_UOID" "DF_RC"\n", DP_UOID(oid), DP_RC(rc)); diff --git a/src/vos/vos_query.c b/src/vos/vos_query.c index 3c06bb69de1..e924e4016b6 100644 --- a/src/vos/vos_query.c +++ b/src/vos/vos_query.c @@ -676,8 +676,7 @@ vos_obj_query_key(daos_handle_t coh, daos_unit_oid_t oid, uint32_t flags, D_ASSERT(rc == 0); query->qt_bound = MAX(obj_epr.epr_hi, bound); - rc = vos_obj_hold(vos_obj_cache_current(is_sysdb), vos_hdl2cont(coh), oid, - &obj_epr, query->qt_bound, VOS_OBJ_VISIBLE, + rc = vos_obj_hold(cont, oid, &obj_epr, query->qt_bound, VOS_OBJ_VISIBLE, DAOS_INTENT_DEFAULT, &obj, query->qt_ts_set); if (rc != 0) { LOG_RC(rc, "Could not hold object: " DF_RC "\n", DP_RC(rc)); @@ -794,7 +793,7 @@ vos_obj_query_key(daos_handle_t coh, daos_unit_oid_t oid, uint32_t flags, *max_write = obj->obj_df->vo_max_write; if (obj != NULL) - vos_obj_release(vos_obj_cache_current(is_sysdb), obj, 0, false); + vos_obj_release(obj, 0, false); if (rc == 0 || rc == -DER_NONEXIST) { if (vos_ts_wcheck(query->qt_ts_set, obj_epr.epr_hi, From 4fb7d87b751c35422742e794528fc87055e1b14d Mon Sep 17 00:00:00 2001 From: sherintg Date: Thu, 22 Aug 2024 12:00:49 +0530 Subject: [PATCH 7/7] DAOS-14416 umem: Handle scm_sz ~ meta_sz with the v2 allocator (#14977) - The 80% rule for NE buckets will not be applied if the scm_sz is almost equal to meta_sz. - Corrected the check for toggling between V1 and V2 store type when scm_sz passed is zero. - Added assert to catch incorrect computation of chunk_id if zone counts are not set during boot correctly. 
Signed-off-by: Sherin T George --- src/common/dav_v2/container_ravl.c | 3 +++ src/common/dav_v2/heap.c | 7 +++++-- src/common/tests/umem_test_bmem.c | 6 +++--- src/vos/vos_pool.c | 28 ++++++++++++++-------------- 4 files changed, 25 insertions(+), 19 deletions(-) diff --git a/src/common/dav_v2/container_ravl.c b/src/common/dav_v2/container_ravl.c index f76dcbca233..af542c3c744 100644 --- a/src/common/dav_v2/container_ravl.c +++ b/src/common/dav_v2/container_ravl.c @@ -56,6 +56,9 @@ container_ravl_insert_block(struct block_container *bc, struct block_container_ravl *c = (struct block_container_ravl *)bc; + ASSERT(m->chunk_id < MAX_CHUNK); + ASSERT(m->zone_id < UINT32_MAX); + c->m = *m; return ravl_emplace_copy(c->tree, m); diff --git a/src/common/dav_v2/heap.c b/src/common/dav_v2/heap.c index 06626dd20ce..2f71cf4af0b 100644 --- a/src/common/dav_v2/heap.c +++ b/src/common/dav_v2/heap.c @@ -593,7 +593,7 @@ zone_calc_size_idx(uint32_t zone_id, unsigned max_zone, size_t heap_size) size_t zone_size_idx = zone_raw_size / CHUNKSIZE; - ASSERT(zone_size_idx <= UINT32_MAX); + ASSERT(zone_size_idx <= MAX_CHUNK); return (uint32_t)zone_size_idx; } @@ -1915,7 +1915,10 @@ heap_get_zone_limits(uint64_t heap_size, uint64_t cache_size) return zd; if (zd.nzones_heap > zd.nzones_cache) { - zd.nzones_ne_max = zd.nzones_cache * 8 / 10; + if (zd.nzones_heap < (zd.nzones_cache + UMEM_CACHE_MIN_EVICTABLE_PAGES)) + zd.nzones_ne_max = zd.nzones_cache - UMEM_CACHE_MIN_EVICTABLE_PAGES; + else + zd.nzones_ne_max = zd.nzones_cache * 8 / 10; if (zd.nzones_cache < (zd.nzones_ne_max + UMEM_CACHE_MIN_EVICTABLE_PAGES)) zd.nzones_ne_max = zd.nzones_cache - UMEM_CACHE_MIN_EVICTABLE_PAGES; } else diff --git a/src/common/tests/umem_test_bmem.c b/src/common/tests/umem_test_bmem.c index b24731ca246..81960032b24 100644 --- a/src/common/tests/umem_test_bmem.c +++ b/src/common/tests/umem_test_bmem.c @@ -2414,7 +2414,7 @@ test_umempobj_nemb_usage(void **state) umem_class_init(&uma, &umm); - /* Do allocation and 
verify that only 10 zones allotted to non evictable MBs */ + /* Do allocation and verify that only 13 zones allotted to non evictable MBs */ for (num = 0;; num++) { /* do an allocation that takes more than half the zone size */ umoff = umem_atomic_alloc(&umm, alloc_size, UMEM_TYPE_ANY); @@ -2425,7 +2425,7 @@ test_umempobj_nemb_usage(void **state) prev_umoff = umoff; } /* 80% nemb when heap size greater than cache size */ - assert_int_equal(num, 12); + assert_int_equal(num, 13); print_message("Number of allocations is %d\n", num); for (--num;; num--) { @@ -2451,7 +2451,7 @@ test_umempobj_nemb_usage(void **state) umem_class_init(&uma, &umm); - /* Do allocation and verify that only 10 zones allotted to non evictable MBs */ + /* Do allocation and verify that all 16 zones are allotted to non evictable MBs */ for (num = 0;; num++) { /* do an allocation that takes more than half the zone size */ umoff = umem_atomic_alloc(&umm, alloc_size, UMEM_TYPE_ANY); diff --git a/src/vos/vos_pool.c b/src/vos/vos_pool.c index 135b5aafff0..ea385c16af9 100644 --- a/src/vos/vos_pool.c +++ b/src/vos/vos_pool.c @@ -764,6 +764,7 @@ vos_pmemobj_create(const char *path, uuid_t pool_id, const char *layout, struct umem_pool *pop = NULL; enum bio_mc_flags mc_flags = vos2mc_flags(flags); int rc, ret; + size_t scm_sz_actual; *ph = NULL; /* always use PMEM mode for SMD */ @@ -779,23 +780,22 @@ vos_pmemobj_create(const char *path, uuid_t pool_id, const char *layout, goto umem_create; } - /* Is meta_sz is set then use it, otherwise derive from VOS file size or scm_sz */ - if (!meta_sz) { - if (!scm_sz) { - struct stat lstat; + if (!scm_sz) { + struct stat lstat; - rc = stat(path, &lstat); - if (rc != 0) - return daos_errno2der(errno); - meta_sz = lstat.st_size; - } else { - /* Custom scm_sz specified so use it (not regular DAOS pool case) */ - meta_sz = scm_sz; - } - } + rc = stat(path, &lstat); + if (rc != 0) + return daos_errno2der(errno); + scm_sz_actual = lstat.st_size; + } else + scm_sz_actual = 
scm_sz; + + /* If meta_sz is set then use it, otherwise derive from VOS file size or scm_sz */ + if (!meta_sz) + meta_sz = scm_sz_actual; store.store_type = umempobj_get_backend_type(); - if (store.store_type == DAOS_MD_BMEM && meta_sz > scm_sz) + if (store.store_type == DAOS_MD_BMEM && meta_sz > scm_sz_actual) store.store_type = DAOS_MD_BMEM_V2; D_DEBUG(DB_MGMT, "Create BIO meta context for xs:%p pool:"DF_UUID" "