From 3807de4328322470958e48922613cc9ed98d7a1c Mon Sep 17 00:00:00 2001 From: Liu Xuezhao Date: Tue, 25 Jul 2023 16:20:55 +0800 Subject: [PATCH 1/4] Add single-engine, multi-socket support Backport for the following patches DAOS-13380 engine: refine tgt_nr check DAOS-15739 engine: Add multi-socket support (#14234) * DAOS-13380 engine: refine tgt_nr check 1. for non-DAOS_TARGET_OVERSUBSCRIBE case fail to start engine if #cores is not enough 2. for DAOS_TARGET_OVERSUBSCRIBE case allow to force start engine The #nr_xs_helpers possibly be reduced for either case. * DAOS-15739 engine: Add multi-socket support (#14234) Add a simple multi-socket mode for use cases where a single engine must be used. Avoids the issue of having all helper xstreams automatically assigned to a single NUMA node thus increasing efficiency of synchronizations between I/O and helper xstreams. It is the default behavior if all of the following are true Neither pinned_numa_node nor first_core are used. No oversubscription is requested NUMA has uniform number of cores targets and helpers divide evenly among numa nodes There is more than one numa node Update server config logic to ensure first_core is passed on to engine if it's set while keeping existing behavior when both first_core: 0 and pinned_numa_node are set. Signed-off-by: Jeff Olivier Signed-off-by: Xuezhao Liu Signed-off-by: Tom Nabarro --- docs/admin/deployment.md | 6 + docs/admin/env_variables.md | 1 + src/control/cmd/daos_server/start.go | 8 +- src/control/cmd/dmg/auto_test.go | 2 - src/control/server/config/server.go | 14 +- src/control/server/ctl_storage_rpc_test.go | 6 +- src/control/server/engine/config.go | 27 +- src/control/server/instance.go | 8 - src/control/server/instance_exec.go | 6 - src/engine/init.c | 275 +++++++++++------- src/engine/srv.c | 85 ++++-- src/engine/srv_internal.h | 47 +-- src/engine/ult.c | 37 ++- .../ftest/control/daos_system_query.yaml | 3 +- .../ftest/control/dmg_pool_query_test.yaml | 6 +- .../control/dmg_server_set_logmasks.yaml | 3 + src/tests/ftest/harness/core_files.yaml | 3 + src/tests/ftest/pool/create_all_vm.yaml | 2 +- src/tests/ftest/pool/query_attribute.yaml | 2 +- src/tests/ftest/server/daos_server_dump.yaml | 1 + .../ftest/telemetry/dkey_akey_enum_punch.yaml | 2 +- utils/config/daos_server.yml | 4 +- utils/nlt_server.yaml | 1 + 23 files changed, 366 insertions(+), 183 deletions(-) diff --git a/docs/admin/deployment.md b/docs/admin/deployment.md index d83980a121c..5fa7533cefa 100644 --- a/docs/admin/deployment.md +++ b/docs/admin/deployment.md @@ -1377,6 +1377,12 @@ per four target threads, for example `targets: 16` and `nr_xs_helpers: 4`. The server should have sufficiently many physical cores to support the number of targets plus the additional service threads. +The 'targets:' and 'nr_xs_helpers:' requirement are mandatory, if the number +of physical cores are not enough it will fail the starting of the daos engine +(notes that 2 cores reserved for system service), or configures with ENV +"DAOS_TARGET_OVERSUBSCRIBE=1" to force starting daos engine (possibly hurts +performance as multiple XS compete on same core). + ## Storage Formatting diff --git a/docs/admin/env_variables.md b/docs/admin/env_variables.md index 48ee91ba1b3..2f5c2053683 100644 --- a/docs/admin/env_variables.md +++ b/docs/admin/env_variables.md @@ -52,6 +52,7 @@ Environment variables in this section only apply to the server side. |DAOS\_DTX\_AGG\_THD\_AGE|DTX aggregation age threshold in seconds. The valid range is [210, 1830]. 
The default value is 630.| |DAOS\_DTX\_RPC\_HELPER\_THD|DTX RPC helper threshold. The valid range is [18, unlimited). The default value is 513.| |DAOS\_DTX\_BATCHED\_ULT\_MAX|The max count of DTX batched commit ULTs. The valid range is [0, unlimited). 0 means to commit DTX synchronously. The default value is 32.| +|DAOS\_FORWARD\_NEIGHBOR|Set to enable I/O forwarding on neighbor xstream in the absence of helper threads.| ## Server and Client environment variables diff --git a/src/control/cmd/daos_server/start.go b/src/control/cmd/daos_server/start.go index 962d370db96..a64777baacf 100644 --- a/src/control/cmd/daos_server/start.go +++ b/src/control/cmd/daos_server/start.go @@ -1,5 +1,5 @@ // -// (C) Copyright 2019-2023 Intel Corporation. +// (C) Copyright 2019-2024 Intel Corporation. // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -29,7 +29,7 @@ type startCmd struct { Modules *string `short:"m" long:"modules" description:"List of server modules to load"` Targets uint16 `short:"t" long:"targets" description:"Number of targets to use (default use all cores)"` NrXsHelpers *uint16 `short:"x" long:"xshelpernr" description:"Number of helper XS per VOS target"` - FirstCore uint16 `short:"f" long:"firstcore" default:"0" description:"Index of first core for service thread"` + FirstCore *uint16 `short:"f" long:"firstcore" description:"Index of first core for service thread"` Group string `short:"g" long:"group" description:"Server group name"` SocketDir string `short:"d" long:"socket_dir" description:"Location for all daos_server & daos_engine sockets"` Insecure bool `short:"i" long:"insecure" description:"Allow for insecure connections"` @@ -76,8 +76,8 @@ func (cmd *startCmd) setCLIOverrides() error { if cmd.NrXsHelpers != nil { srv.WithHelperStreamCount(int(*cmd.NrXsHelpers)) } - if cmd.FirstCore > 0 { - srv.WithServiceThreadCore(int(cmd.FirstCore)) + if cmd.FirstCore != nil { + srv.WithServiceThreadCore(int(*cmd.FirstCore)) } } diff --git a/src/control/cmd/dmg/auto_test.go b/src/control/cmd/dmg/auto_test.go index 35b63b246f7..73119b52aa9 100644 --- a/src/control/cmd/dmg/auto_test.go +++ b/src/control/cmd/dmg/auto_test.go @@ -580,7 +580,6 @@ transport_config: engines: - targets: 12 nr_xs_helpers: 2 - first_core: 0 log_file: /tmp/daos_engine.0.log storage: - class: dcpm @@ -599,7 +598,6 @@ engines: pinned_numa_node: 0 - targets: 6 nr_xs_helpers: 0 - first_core: 0 log_file: /tmp/daos_engine.1.log storage: - class: dcpm diff --git a/src/control/server/config/server.go b/src/control/server/config/server.go index cd5ecda8cd9..87ae27bb101 100644 --- a/src/control/server/config/server.go +++ b/src/control/server/config/server.go @@ -863,7 +863,11 @@ func (cfg *Server) SetEngineAffinities(log logging.Logger, affSources ...EngineA // Detect legacy mode by checking if first_core is being used. legacyMode := false for _, engineCfg := range cfg.Engines { - if engineCfg.ServiceThreadCore != 0 { + if engineCfg.ServiceThreadCore != nil { + if *engineCfg.ServiceThreadCore == 0 && engineCfg.PinnedNumaNode != nil { + // Both are set but we don't know yet which to use + continue + } legacyMode = true break } @@ -872,9 +876,15 @@ func (cfg *Server) SetEngineAffinities(log logging.Logger, affSources ...EngineA // Fail if any engine has an explicit pin and non-zero first_core. 
for idx, engineCfg := range cfg.Engines { if legacyMode { + if engineCfg.PinnedNumaNode != nil { + log.Infof("pinned_numa_node setting ignored on engine %d", idx) + engineCfg.PinnedNumaNode = nil + } log.Debugf("setting legacy core allocation algorithm on engine %d", idx) - engineCfg.PinnedNumaNode = nil continue + } else if engineCfg.ServiceThreadCore != nil { + log.Infof("first_core setting ignored on engine %d", idx) + engineCfg.ServiceThreadCore = nil } numaAffinity, err := detectEngineAffinity(log, engineCfg, affSources...) diff --git a/src/control/server/ctl_storage_rpc_test.go b/src/control/server/ctl_storage_rpc_test.go index 7fb4aa9ea87..ecd6c5f66a6 100644 --- a/src/control/server/ctl_storage_rpc_test.go +++ b/src/control/server/ctl_storage_rpc_test.go @@ -1569,7 +1569,10 @@ func TestServer_CtlSvc_StorageScan_PostEngineStart(t *testing.T) { var engineCfgs []*engine.Config for i, sc := range tc.storageCfgs { log.Debugf("storage cfg contains bdevs %v for engine %d", sc.Bdevs(), i) - engineCfgs = append(engineCfgs, engine.MockConfig().WithStorage(sc...)) + engineCfgs = append(engineCfgs, + engine.MockConfig(). + WithStorage(sc...). + WithTargetCount(tc.engineTargetCount[i])) } sCfg := config.DefaultServer().WithEngines(engineCfgs...) cs := mockControlService(t, log, sCfg, csbmbc, tc.smbc, tc.smsc) @@ -1625,7 +1628,6 @@ func TestServer_CtlSvc_StorageScan_PostEngineStart(t *testing.T) { } te.setDrpcClient(newMockDrpcClient(dcc)) te._superblock.Rank = ranklist.NewRankPtr(uint32(idx + 1)) - te.setTargetCount(tc.engineTargetCount[idx]) for _, tc := range te.storage.GetBdevConfigs() { tc.Bdev.DeviceRoles.OptionBits = storage.OptionBits(storage.BdevRoleAll) } diff --git a/src/control/server/engine/config.go b/src/control/server/engine/config.go index b7fe03e4cf7..9926616e08e 100644 --- a/src/control/server/engine/config.go +++ b/src/control/server/engine/config.go @@ -115,7 +115,7 @@ type Config struct { Modules string `yaml:"modules,omitempty" cmdLongFlag:"--modules" cmdShortFlag:"-m"` TargetCount int `yaml:"targets,omitempty" cmdLongFlag:"--targets,nonzero" cmdShortFlag:"-t,nonzero"` HelperStreamCount int `yaml:"nr_xs_helpers" cmdLongFlag:"--xshelpernr" cmdShortFlag:"-x"` - ServiceThreadCore int `yaml:"first_core" cmdLongFlag:"--firstcore,nonzero" cmdShortFlag:"-f,nonzero"` + ServiceThreadCore *int `yaml:"first_core,omitempty" cmdLongFlag:"--firstcore" cmdShortFlag:"-f"` SystemName string `yaml:"-" cmdLongFlag:"--group" cmdShortFlag:"-g"` SocketDir string `yaml:"-" cmdLongFlag:"--socket_dir" cmdShortFlag:"-d"` LogMask string `yaml:"log_mask,omitempty" cmdEnv:"D_LOG_MASK"` @@ -160,10 +160,29 @@ func (c *Config) ReadLogSubsystems() (string, error) { // Validate ensures that the configuration meets minimum standards. 
func (c *Config) Validate() error { - if c.PinnedNumaNode != nil && c.ServiceThreadCore != 0 { + if c.PinnedNumaNode != nil && c.ServiceThreadCore != nil && *c.ServiceThreadCore != 0 { return errors.New("cannot specify both pinned_numa_node and first_core") } + errNegative := func(s string) error { + return errors.Errorf("%s must not be negative", s) + } + if c.TargetCount < 0 { + return errNegative("target count") + } + if c.HelperStreamCount < 0 { + return errNegative("helper stream count") + } + if c.ServiceThreadCore != nil && *c.ServiceThreadCore < 0 { + return errNegative("service thread core index") + } + if c.MemSize < 0 { + return errNegative("mem size") + } + if c.HugepageSz < 0 { + return errNegative("hugepage size") + } + if c.TargetCount == 0 { return errors.New("target count must be nonzero") } @@ -222,7 +241,7 @@ func IsNUMAMismatch(err error) bool { // SetNUMAAffinity sets the NUMA affinity for the engine, // if not already set in the configuration. func (c *Config) SetNUMAAffinity(node uint) error { - if c.PinnedNumaNode != nil && c.ServiceThreadCore != 0 { + if c.PinnedNumaNode != nil && c.ServiceThreadCore != nil && *c.ServiceThreadCore != 0 { return errors.New("cannot set both NUMA node and service core") } @@ -464,7 +483,7 @@ func (c *Config) WithHelperStreamCount(count int) *Config { // WithServiceThreadCore sets the core index to be used for running DAOS service threads. func (c *Config) WithServiceThreadCore(idx int) *Config { - c.ServiceThreadCore = idx + c.ServiceThreadCore = &idx return c } diff --git a/src/control/server/instance.go b/src/control/server/instance.go index 3837860fabd..14f53cf3b5b 100644 --- a/src/control/server/instance.go +++ b/src/control/server/instance.go @@ -338,14 +338,6 @@ func (ei *EngineInstance) setHugepageSz(hpSizeMb int) { ei.runner.GetConfig().HugepageSz = hpSizeMb } -// setTargetCount updates target count in engine config. -func (ei *EngineInstance) setTargetCount(numTargets int) { - ei.Lock() - defer ei.Unlock() - - ei.runner.GetConfig().TargetCount = numTargets -} - // GetTargetCount returns the target count set for this instance. func (ei *EngineInstance) GetTargetCount() int { ei.RLock() diff --git a/src/control/server/instance_exec.go b/src/control/server/instance_exec.go index 61f25e40b71..19143782ec3 100644 --- a/src/control/server/instance_exec.go +++ b/src/control/server/instance_exec.go @@ -90,12 +90,6 @@ func (ei *EngineInstance) finishStartup(ctx context.Context, ready *srvpb.Notify if err := ei.handleReady(ctx, ready); err != nil { return err } - // update engine target count to reflect allocated number of targets, not number requested - // when starting - // NOTE: Engine mem_size passed on engine invocation is based on the number of targets - // requested in config so if number of targets allocated doesn't match the number of - // targets requested the mem_size value may be inappropriate. - ei.setTargetCount(int(ready.GetNtgts())) ei.ready.SetTrue() diff --git a/src/engine/init.c b/src/engine/init.c index d639456eeb1..c8b4672b7da 100644 --- a/src/engine/init.c +++ b/src/engine/init.c @@ -37,8 +37,7 @@ static char modules[MAX_MODULE_OPTIONS + 1]; /** - * Number of target threads the user would like to start - * 0 means default value, see dss_tgt_nr_get(); + * Number of target threads the user would like to start. 
*/ static unsigned int nr_threads; @@ -72,15 +71,18 @@ hwloc_topology_t dss_topo; int dss_core_depth; /** number of physical cores, w/o hyperthreading */ int dss_core_nr; -/** start offset index of the first core for service XS */ -unsigned int dss_core_offset; +/** start offset index of the first core for service XS. Init to -1 so we can + * detect when it is explicitly set and disable multi-socket mode. + */ +unsigned int dss_core_offset = -1; /** NUMA node to bind to */ int dss_numa_node = -1; -hwloc_bitmap_t core_allocation_bitmap; -/** a copy of the NUMA node object in the topology */ -hwloc_obj_t numa_obj; -/** number of cores in the given NUMA node */ -int dss_num_cores_numa_node; +/** Forward I/O work to neighbor */ +bool dss_forward_neighbor; +/** Cached numa information */ +struct dss_numa_info *dss_numa; +/** Number of active numa nodes, multi-socket mode only */ +int dss_numa_nr = 1; /** Module facility bitmask */ static uint64_t dss_mod_facs; /** Number of storage tiers: 2 for SCM and NVMe */ @@ -247,68 +249,112 @@ modules_load(void) return rc; } +static unsigned int +ncores_needed(unsigned int tgt_nr, unsigned int nr_helpers) +{ + return DAOS_TGT0_OFFSET + tgt_nr + nr_helpers; +} + /** - * Get the appropriate number of main XS based on the number of cores and - * passed in preferred number of threads. + * Check if the #targets and #nr_xs_helpers is valid to start server, the #nr_xs_helpers possibly + * be reduced. */ static int -dss_tgt_nr_get(unsigned int ncores, unsigned int nr, bool oversubscribe) +dss_tgt_nr_check(unsigned int ncores, unsigned int tgt_nr, bool oversubscribe) { - int tgt_nr; - D_ASSERT(ncores >= 1); /* at most 2 helper XS per target */ - if (dss_tgt_offload_xs_nr > 2 * nr) - dss_tgt_offload_xs_nr = 2 * nr; - else if (dss_tgt_offload_xs_nr == 0) + if (dss_tgt_offload_xs_nr > 2 * tgt_nr) { + D_PRINT("#nr_xs_helpers(%d) cannot exceed 2 times #targets (2 x %d = %d).\n", + dss_tgt_offload_xs_nr, tgt_nr, 2 * tgt_nr); + dss_tgt_offload_xs_nr = 2 * tgt_nr; + } else if (dss_tgt_offload_xs_nr == 0) { D_WARN("Suggest to config at least 1 helper XS per DAOS engine\n"); + } - /* Each system XS uses one core, and with dss_tgt_offload_xs_nr - * offload XS. Calculate the tgt_nr as the number of main XS based - * on number of cores. - */ -retry: - tgt_nr = ncores - DAOS_TGT0_OFFSET - dss_tgt_offload_xs_nr; - if (tgt_nr <= 0) - tgt_nr = 1; - - /* If user requires less target threads then set it as dss_tgt_nr, - * if user oversubscribes, then: - * . if oversubscribe is enabled, use the required number - * . if oversubscribe is disabled(default), - * use the number calculated above - * Note: oversubscribing may hurt performance. 
- */ - if (nr >= 1 && ((nr < tgt_nr) || oversubscribe)) { - tgt_nr = nr; - if (dss_tgt_offload_xs_nr > 2 * tgt_nr) - dss_tgt_offload_xs_nr = 2 * tgt_nr; - } else if (dss_tgt_offload_xs_nr > 2 * tgt_nr) { - dss_tgt_offload_xs_nr--; - goto retry; + if (oversubscribe) { + if (ncores_needed(tgt_nr, dss_tgt_offload_xs_nr) > ncores) + D_PRINT("Force to start engine with %d targets %d xs_helpers on %d cores(" + "%d cores reserved for system service).\n", + tgt_nr, dss_tgt_offload_xs_nr, ncores, DAOS_TGT0_OFFSET); + goto out; } - if (tgt_nr != nr) - D_PRINT("%d target XS(xstream) requested (#cores %d); " - "use (%d) target XS\n", nr, ncores, tgt_nr); + if (ncores_needed(tgt_nr, dss_tgt_offload_xs_nr) > ncores) { + D_ERROR("cannot start engine with %d targets %d xs_helpers on %d cores, may try " + "with DAOS_TARGET_OVERSUBSCRIBE=1 or reduce #targets/#nr_xs_helpers(" + "%d cores reserved for system service).\n", + tgt_nr, dss_tgt_offload_xs_nr, ncores, DAOS_TGT0_OFFSET); + return -DER_INVAL; + } +out: if (dss_tgt_offload_xs_nr % tgt_nr != 0) dss_helper_pool = true; - return tgt_nr; + return 0; +} + +static bool +dss_multi_socket_check(bool oversub, int numa_nr) +{ + /** Keep this simple and disallow some configurations */ + if (oversub) { + D_INFO("Oversubscription requested, bypassing multi-socket mode\n"); + return false; + } + + if (dss_numa_node != -1) { + D_INFO("Numa node specified, running in single socket mode\n"); + return false; + } + + if (numa_nr < 2) { + D_INFO("No NUMA found, bypassing multi-socket mode\n"); + return false; + } + + if ((dss_tgt_offload_xs_nr % numa_nr) != 0) { + D_INFO("Uneven split of helpers on sockets, bypassing multi-socket mode\n"); + return false; + } + + if ((dss_tgt_nr % numa_nr) != 0) { + D_INFO("Uneven split of targets on sockets, bypassing multi-socket mode\n"); + return false; + } + + return true; } static int -dss_topo_init() +dss_legacy_mode(bool oversub) +{ + D_PRINT("Using legacy core allocation algorithm\n"); + if (dss_core_offset >= dss_core_nr) { + D_ERROR("invalid dss_core_offset %u (set by \"-f\" option), should within " + "range [0, %u]\n", + dss_core_offset, dss_core_nr - 1); + return -DER_INVAL; + } + + return dss_tgt_nr_check(dss_core_nr, dss_tgt_nr, oversub); +} + +static int +dss_topo_init(void) { int depth; int numa_node_nr; - int num_cores_visited; - char *cpuset; + int num_cores_visited; int k; + int numa_node; + int rc = 0; + hwloc_obj_t numa_obj; hwloc_obj_t corenode; bool tgt_oversub = false; + bool multi_socket = false; hwloc_topology_init(&dss_topo); hwloc_topology_load(dss_topo); @@ -318,24 +364,26 @@ dss_topo_init() depth = hwloc_get_type_depth(dss_topo, HWLOC_OBJ_NUMANODE); numa_node_nr = hwloc_get_nbobjs_by_depth(dss_topo, depth); d_getenv_bool("DAOS_TARGET_OVERSUBSCRIBE", &tgt_oversub); + d_getenv_bool("DAOS_FORWARD_NEIGHBOR", &dss_forward_neighbor); + dss_tgt_nr = nr_threads; - /* if no NUMA node was specified, or NUMA data unavailable */ - /* fall back to the legacy core allocation algorithm */ - if (dss_numa_node == -1 || numa_node_nr <= 0) { - D_PRINT("Using legacy core allocation algorithm\n"); - dss_tgt_nr = dss_tgt_nr_get(dss_core_nr, nr_threads, - tgt_oversub); - - if (dss_core_offset >= dss_core_nr) { - D_ERROR("invalid dss_core_offset %u " - "(set by \"-f\" option)," - " should within range [0, %u]", - dss_core_offset, dss_core_nr - 1); - return -DER_INVAL; - } - return 0; + /** Set to -1 initially so we can detect when it's set explicitly to + * maintain mode consistency between engines where one sets it to 0. 
+ */ + if (dss_core_offset == -1) { + dss_core_offset = 0; + if (dss_multi_socket_check(tgt_oversub, numa_node_nr)) + multi_socket = true; + } else { + D_INFO("Core offset specified, running in single socket mode\n"); } + /* Fall back to legacy mode if no socket was specified and + * multi-socket mode is not possible or NUMA data is unavailable + */ + if ((!multi_socket && dss_numa_node == -1) || numa_node_nr <= 0) + return dss_legacy_mode(tgt_oversub); + if (dss_numa_node > numa_node_nr) { D_ERROR("Invalid NUMA node selected. " "Must be no larger than %d\n", @@ -343,52 +391,76 @@ dss_topo_init() return -DER_INVAL; } - numa_obj = hwloc_get_obj_by_depth(dss_topo, depth, dss_numa_node); - if (numa_obj == NULL) { - D_ERROR("NUMA node %d was not found in the topology", - dss_numa_node); - return -DER_INVAL; - } + D_ALLOC_ARRAY(dss_numa, numa_node_nr); + if (dss_numa == NULL) + return -DER_NOMEM; - /* create an empty bitmap, then set each bit as we */ - /* find a core that matches */ - core_allocation_bitmap = hwloc_bitmap_alloc(); - if (core_allocation_bitmap == NULL) { - D_ERROR("Unable to allocate core allocation bitmap\n"); - return -DER_INVAL; - } + for (numa_node = 0; numa_node < numa_node_nr; numa_node++) { + dss_numa[numa_node].ni_idx = numa_node; + numa_obj = hwloc_get_obj_by_depth(dss_topo, depth, numa_node); + if (numa_obj == NULL) { + D_ERROR("NUMA node %d was not found in the topology\n", numa_node); + D_GOTO(failed, rc = -DER_INVAL); + } - dss_num_cores_numa_node = 0; - num_cores_visited = 0; + /* create an empty bitmap, then set each bit as we */ + /* find a core that matches */ + dss_numa[numa_node].ni_coremap = hwloc_bitmap_alloc(); + if (dss_numa[numa_node].ni_coremap == NULL) { + D_ERROR("Unable to allocate core allocation bitmap\n"); + D_GOTO(failed, rc = -DER_INVAL); + } - for (k = 0; k < dss_core_nr; k++) { - corenode = hwloc_get_obj_by_depth(dss_topo, dss_core_depth, k); - if (corenode == NULL) - continue; - if (hwloc_bitmap_isincluded(corenode->cpuset, - numa_obj->cpuset) != 0) { - if (num_cores_visited++ >= dss_core_offset) { - hwloc_bitmap_set(core_allocation_bitmap, k); - hwloc_bitmap_asprintf(&cpuset, - corenode->cpuset); + dss_numa[numa_node].ni_core_nr = 0; + num_cores_visited = 0; + + for (k = 0; k < dss_core_nr; k++) { + corenode = hwloc_get_obj_by_depth(dss_topo, dss_core_depth, k); + if (corenode == NULL) + continue; + if (hwloc_bitmap_isincluded(corenode->cpuset, numa_obj->cpuset) != 0) { + if (num_cores_visited++ >= dss_core_offset) + hwloc_bitmap_set(dss_numa[numa_node].ni_coremap, k); + dss_numa[numa_node].ni_core_nr++; } - dss_num_cores_numa_node++; + } + if (multi_socket && numa_node > 0 && + dss_numa[numa_node].ni_core_nr != dss_numa[numa_node - 1].ni_core_nr) { + D_INFO("Non-uniform numa nodes, bypassing multi-socket mode\n"); + D_FREE(dss_numa); + return dss_legacy_mode(false); } } - hwloc_bitmap_asprintf(&cpuset, core_allocation_bitmap); - free(cpuset); - - dss_tgt_nr = dss_tgt_nr_get(dss_num_cores_numa_node, nr_threads, - tgt_oversub); - if (dss_core_offset >= dss_num_cores_numa_node) { - D_ERROR("invalid dss_core_offset %d (set by \"-f\" option), " - "should within range [0, %d]", dss_core_offset, - dss_num_cores_numa_node - 1); - return -DER_INVAL; + + if (multi_socket) { + /** In this mode, we simply save the topology for later use but + * still use all of the cores. 
+ */ + D_PRINT("Using Multi-socket NUMA core allocation algorithm\n"); + dss_numa_nr = numa_node_nr; + dss_offload_per_numa_nr = dss_tgt_offload_xs_nr / dss_numa_nr; + dss_tgt_per_numa_nr = dss_tgt_nr / dss_numa_nr; + return dss_tgt_nr_check(dss_core_nr, dss_tgt_nr, tgt_oversub); } + if (dss_core_offset >= dss_numa[dss_numa_node].ni_core_nr) { + D_ERROR("invalid dss_core_offset %d (set by \"-f\" option), should within range " + "[0, %d]\n", + dss_core_offset, dss_numa[dss_numa_node].ni_core_nr - 1); + return -DER_INVAL; + } D_PRINT("Using NUMA core allocation algorithm\n"); - return 0; + + return dss_tgt_nr_check(dss_numa[dss_numa_node].ni_core_nr, dss_tgt_nr, tgt_oversub); +failed: + D_FREE(dss_numa); + return rc; +} + +static void +dss_topo_fini(void) +{ + D_FREE(dss_numa); } static ABT_mutex server_init_state_mutex; @@ -826,7 +898,7 @@ server_init(int argc, char *argv[]) DAOS_VERSION, getpid(), dss_self_rank(), dss_tgt_nr, dss_tgt_offload_xs_nr, dss_core_offset, dss_hostname); - if (numa_obj) + if (dss_numa && dss_numa_node != -1) D_PRINT("Using NUMA node: %d", dss_numa_node); return 0; @@ -857,6 +929,7 @@ server_init(int argc, char *argv[]) exit_metrics_init: dss_engine_metrics_fini(); d_tm_fini(); + /* dss_topo_fini cleans itself if it fails */ exit_debug_init: daos_debug_fini(); return rc; @@ -918,6 +991,8 @@ server_fini(bool force) D_INFO("dss_engine_metrics_fini() done\n"); d_tm_fini(); D_INFO("d_tm_fini() done\n"); + dss_topo_fini(); + D_INFO("dss_top_fini() done\n"); daos_debug_fini(); D_INFO("daos_debug_fini() done\n"); } diff --git a/src/engine/srv.c b/src/engine/srv.c index e0c985c38f6..6f470e2c0cb 100644 --- a/src/engine/srv.c +++ b/src/engine/srv.c @@ -75,6 +75,10 @@ #define DRPC_XS_NR (1) /** Number of offload XS */ unsigned int dss_tgt_offload_xs_nr; +/** Number of offload per socket */ +unsigned int dss_offload_per_numa_nr; +/** Number of target per socket */ +unsigned int dss_tgt_per_numa_nr; /** Number of target (XS set) per engine */ unsigned int dss_tgt_nr; /** Number of system XS */ @@ -964,37 +968,58 @@ static int dss_start_xs_id(int tag, int xs_id) { hwloc_obj_t obj; + int tgt; int rc; int xs_core_offset; - unsigned idx; + unsigned int idx; char *cpuset; + struct dss_numa_info *ninfo; + bool clear = false; - D_DEBUG(DB_TRACE, "start xs_id called for %d. 
", xs_id); + D_DEBUG(DB_TRACE, "start xs_id called for %d.\n", xs_id); /* if we are NUMA aware, use the NUMA information */ - if (numa_obj) { - idx = hwloc_bitmap_first(core_allocation_bitmap); + if (dss_numa) { + if (dss_numa_node == -1) { + tgt = dss_xs2tgt(xs_id); + if (xs_id == 1) { + /** Put swim on first core of numa 1, core 0 */ + ninfo = &dss_numa[1]; + } else if (tgt != -1) { + /** Split I/O targets evenly among numa nodes */ + ninfo = &dss_numa[tgt / dss_tgt_per_numa_nr]; + } else if (xs_id > 2) { + /** Split helper xstreams evenly among numa nodes */ + tgt = xs_id - dss_sys_xs_nr - dss_tgt_nr; + ninfo = &dss_numa[tgt / dss_offload_per_numa_nr]; + } else { + /** Put the system and DRPC on numa 0, core 0 */ + ninfo = &dss_numa[0]; + } + + D_DEBUG(DB_TRACE, "Using numa node %d for XS %d\n", ninfo->ni_idx, xs_id); + + if (xs_id != 0) + clear = true; + } else { + ninfo = &dss_numa[dss_numa_node]; + if (xs_id > 1 || (xs_id == 0 && dss_core_nr > dss_tgt_nr)) + clear = true; + } + + idx = hwloc_bitmap_first(ninfo->ni_coremap); if (idx == -1) { D_ERROR("No core available for XS: %d", xs_id); return -DER_INVAL; } - D_DEBUG(DB_TRACE, - "Choosing next available core index %d.", idx); + D_DEBUG(DB_TRACE, "Choosing next available core index %d on numa %d.\n", idx, + ninfo->ni_idx); /* * All system XS will reuse the first XS' core, but * the SWIM and DRPC XS will use separate core if enough cores */ - if (xs_id > 1 || (xs_id == 0 && dss_core_nr > dss_tgt_nr)) - hwloc_bitmap_clr(core_allocation_bitmap, idx); + if (clear) + hwloc_bitmap_clr(ninfo->ni_coremap, idx); - obj = hwloc_get_obj_by_depth(dss_topo, dss_core_depth, idx); - if (obj == NULL) { - D_PRINT("Null core returned by hwloc\n"); - return -DER_INVAL; - } - - hwloc_bitmap_asprintf(&cpuset, obj->cpuset); - D_DEBUG(DB_TRACE, "Using CPU set %s\n", cpuset); - free(cpuset); } else { D_DEBUG(DB_TRACE, "Using non-NUMA aware core allocation\n"); /* @@ -1007,14 +1032,19 @@ dss_start_xs_id(int tag, int xs_id) xs_core_offset = (dss_core_nr > dss_tgt_nr) ? 1 : 0; else xs_core_offset = 0; - obj = hwloc_get_obj_by_depth(dss_topo, dss_core_depth, - (xs_core_offset + dss_core_offset) - % dss_core_nr); - if (obj == NULL) { - D_ERROR("Null core returned by hwloc for XS %d\n", - xs_id); - return -DER_INVAL; - } + idx = (xs_core_offset + dss_core_offset) % dss_core_nr; + } + + obj = hwloc_get_obj_by_depth(dss_topo, dss_core_depth, idx); + if (obj == NULL) { + D_PRINT("Null core returned by hwloc\n"); + return -DER_INVAL; + } + + if (D_LOG_ENABLED(DB_TRACE)) { + hwloc_bitmap_asprintf(&cpuset, obj->cpuset); + D_DEBUG(DB_TRACE, "Using CPU set %s for XS %d\n", cpuset, xs_id); + free(cpuset); } rc = dss_start_one_xstream(obj->cpuset, tag, xs_id); @@ -1076,9 +1106,8 @@ dss_xstreams_init(void) dss_core_nr, dss_tgt_nr); if (dss_numa_node != -1) { - D_DEBUG(DB_TRACE, - "Detected %d cores on NUMA node %d\n", - dss_num_cores_numa_node, dss_numa_node); + D_DEBUG(DB_TRACE, "Detected %d cores on NUMA node %d\n", + dss_numa[dss_numa_node].ni_core_nr, dss_numa_node); } xstream_data.xd_xs_nr = DSS_XS_NR_TOTAL; diff --git a/src/engine/srv_internal.h b/src/engine/srv_internal.h index 1d4278a98cf..98a3018124e 100644 --- a/src/engine/srv_internal.h +++ b/src/engine/srv_internal.h @@ -1,5 +1,5 @@ /* - * (C) Copyright 2016-2023 Intel Corporation. + * (C) Copyright 2016-2024 Intel Corporation. 
* * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -116,34 +116,47 @@ struct engine_metrics { struct d_tm_node_t *meminfo; }; +struct dss_numa_info { + /** numa index for this node */ + int ni_idx; + /** Number of cores in this node */ + int ni_core_nr; + /** Allocation bitmap for this numa node */ + hwloc_bitmap_t ni_coremap; +}; + extern struct engine_metrics dss_engine_metrics; #define DSS_HOSTNAME_MAX_LEN 255 /** Server node hostname */ -extern char dss_hostname[]; +extern char dss_hostname[]; /** Server node topology */ -extern hwloc_topology_t dss_topo; +extern hwloc_topology_t dss_topo; /** core depth of the topology */ -extern int dss_core_depth; +extern int dss_core_depth; /** number of physical cores, w/o hyper-threading */ -extern int dss_core_nr; +extern int dss_core_nr; /** start offset index of the first core for service XS */ -extern unsigned int dss_core_offset; +extern unsigned int dss_core_offset; /** NUMA node to bind to */ -extern int dss_numa_node; -/** bitmap describing core allocation */ -extern hwloc_bitmap_t core_allocation_bitmap; -/** a copy of the NUMA node object in the topology */ -extern hwloc_obj_t numa_obj; -/** number of cores in the given NUMA node */ -extern int dss_num_cores_numa_node; -/** Number of offload XS */ -extern unsigned int dss_tgt_offload_xs_nr; +extern int dss_numa_node; +/** Number of active numa nodes (only > 1 if multi-socket mode is enabled) */ +extern int dss_numa_nr; /** number of system XS */ -extern unsigned int dss_sys_xs_nr; +extern unsigned int dss_sys_xs_nr; /** Flag of helper XS as a pool */ -extern bool dss_helper_pool; +extern bool dss_helper_pool; +/** Cached numa information */ +extern struct dss_numa_info *dss_numa; +/** Forward I/O work to neighbor */ +extern bool dss_forward_neighbor; +/** Number of offload XS */ +extern unsigned int dss_tgt_offload_xs_nr; +/** Number of offload per socket */ +extern unsigned int dss_offload_per_numa_nr; +/** Number of target per socket */ +extern unsigned int dss_tgt_per_numa_nr; /** Shadow dss_get_module_info */ struct dss_module_info *get_module_info(void); diff --git a/src/engine/ult.c b/src/engine/ult.c index 47c3b504f8d..18230cec225 100644 --- a/src/engine/ult.c +++ b/src/engine/ult.c @@ -1,5 +1,5 @@ /** - * (C) Copyright 2016-2023 Intel Corporation. + * (C) Copyright 2016-2024 Intel Corporation. 
* * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -322,6 +322,37 @@ dss_thread_collective(int (*func)(void *), void *arg, unsigned int flags) return dss_collective_internal(func, arg, true, flags); } +static inline uint32_t +sched_ult2xs_multisocket(int xs_type, int tgt_id) +{ + static __thread uint32_t offload; + uint32_t socket; + uint32_t base; + uint32_t target; + + if (dss_tgt_offload_xs_nr == 0) { + if (xs_type == DSS_XS_IOFW && dss_forward_neighbor) { + /* Keep the old forwarding behavior, but NUMA aware */ + socket = tgt_id / dss_numa_nr; + target = (socket * dss_tgt_per_numa_nr) + + (tgt_id + offload) % dss_tgt_per_numa_nr; + offload = target + 17; /* Seed next selection */ + target = DSS_MAIN_XS_ID(target); + goto check; + } + return DSS_XS_SELF; + } + + socket = tgt_id / dss_numa_nr; + base = dss_sys_xs_nr + dss_tgt_nr + (socket * dss_offload_per_numa_nr); + target = base + ((offload + tgt_id) % dss_offload_per_numa_nr); + offload = target + 17; /* Seed next selection */ + +check: + D_ASSERT(target < DSS_XS_NR_TOTAL && target >= dss_sys_xs_nr); + return target; +} + /* ============== ULT create functions =================================== */ static inline int @@ -341,6 +372,8 @@ sched_ult2xs(int xs_type, int tgt_id) case DSS_XS_DRPC: return 2; case DSS_XS_IOFW: + if (dss_numa_nr > 1) + return sched_ult2xs_multisocket(xs_type, tgt_id); if (!dss_helper_pool) { if (dss_tgt_offload_xs_nr > 0) xs_id = DSS_MAIN_XS_ID(tgt_id) + 1; @@ -379,6 +412,8 @@ sched_ult2xs(int xs_type, int tgt_id) xs_id = (DSS_MAIN_XS_ID(tgt_id) + 1) % dss_tgt_nr; break; case DSS_XS_OFFLOAD: + if (dss_numa_nr > 1) + xs_id = sched_ult2xs_multisocket(xs_type, tgt_id); if (!dss_helper_pool) { if (dss_tgt_offload_xs_nr > 0) xs_id = DSS_MAIN_XS_ID(tgt_id) + dss_tgt_offload_xs_nr / dss_tgt_nr; diff --git a/src/tests/ftest/control/daos_system_query.yaml b/src/tests/ftest/control/daos_system_query.yaml index c474336c215..a489e2a9760 100644 --- a/src/tests/ftest/control/daos_system_query.yaml +++ b/src/tests/ftest/control/daos_system_query.yaml @@ -7,7 +7,8 @@ server_config: engines: 0: pinned_numa_node: 0 - nr_xs_helpers: 1 + targets: 4 + nr_xs_helpers: 0 fabric_iface_port: 31416 log_file: daos_server0.log log_mask: DEBUG,MEM=ERR diff --git a/src/tests/ftest/control/dmg_pool_query_test.yaml b/src/tests/ftest/control/dmg_pool_query_test.yaml index 3cf5b7bd137..26616c46cbb 100644 --- a/src/tests/ftest/control/dmg_pool_query_test.yaml +++ b/src/tests/ftest/control/dmg_pool_query_test.yaml @@ -8,7 +8,7 @@ server_config: engines_per_host: 1 engines: 0: - targets: 8 + targets: 4 storage: auto system_ram_reserved: 64 pool: @@ -26,8 +26,8 @@ ior: exp_vals: pool_status: 0 - total_targets: 8 - active_targets: 8 + total_targets: 4 + active_targets: 4 total_engines: 1 disabled_targets: 0 version: 1 diff --git a/src/tests/ftest/control/dmg_server_set_logmasks.yaml b/src/tests/ftest/control/dmg_server_set_logmasks.yaml index 29b8e5326e1..2edfba61ec6 100644 --- a/src/tests/ftest/control/dmg_server_set_logmasks.yaml +++ b/src/tests/ftest/control/dmg_server_set_logmasks.yaml @@ -7,6 +7,9 @@ server_config: system_ram_reserved: 6 engines: 0: + targets: 4 + env_vars: + - DAOS_TARGET_OVERSUBSCRIBE=1 storage: 0: class: ram diff --git a/src/tests/ftest/harness/core_files.yaml b/src/tests/ftest/harness/core_files.yaml index 8133398a42f..7655e2b30f6 100644 --- a/src/tests/ftest/harness/core_files.yaml +++ b/src/tests/ftest/harness/core_files.yaml @@ -5,6 +5,9 @@ server_config: engines_per_host: 1 engines: 0: + targets: 4 + env_vars: + - 
DAOS_TARGET_OVERSUBSCRIBE=1 storage: 0: class: ram diff --git a/src/tests/ftest/pool/create_all_vm.yaml b/src/tests/ftest/pool/create_all_vm.yaml index d6687f96293..8d884f85605 100644 --- a/src/tests/ftest/pool/create_all_vm.yaml +++ b/src/tests/ftest/pool/create_all_vm.yaml @@ -33,7 +33,7 @@ server_config: system_ram_reserved: 6 engines: 0: - targets: 5 + targets: 4 nr_xs_helpers: 0 storage: 0: diff --git a/src/tests/ftest/pool/query_attribute.yaml b/src/tests/ftest/pool/query_attribute.yaml index d91f36ac52f..7222fedd84c 100644 --- a/src/tests/ftest/pool/query_attribute.yaml +++ b/src/tests/ftest/pool/query_attribute.yaml @@ -6,7 +6,7 @@ server_config: engines_per_host: 1 engines: 0: - targets: 8 + targets: 4 nr_xs_helpers: 0 storage: 0: diff --git a/src/tests/ftest/server/daos_server_dump.yaml b/src/tests/ftest/server/daos_server_dump.yaml index 827edcb3934..a9ffd810f69 100644 --- a/src/tests/ftest/server/daos_server_dump.yaml +++ b/src/tests/ftest/server/daos_server_dump.yaml @@ -11,6 +11,7 @@ server_config: engines: 0: targets: 2 + nr_xs_helpers: 1 storage: 0: class: ram diff --git a/src/tests/ftest/telemetry/dkey_akey_enum_punch.yaml b/src/tests/ftest/telemetry/dkey_akey_enum_punch.yaml index acd2fa472df..f28a32a1733 100644 --- a/src/tests/ftest/telemetry/dkey_akey_enum_punch.yaml +++ b/src/tests/ftest/telemetry/dkey_akey_enum_punch.yaml @@ -7,7 +7,7 @@ server_config: engines_per_host: 1 engines: 0: - targets: 8 + targets: 4 nr_xs_helpers: 0 storage: 0: diff --git a/utils/config/daos_server.yml b/utils/config/daos_server.yml index c58f11d5d74..2c7586df0dd 100644 --- a/utils/config/daos_server.yml +++ b/utils/config/daos_server.yml @@ -336,7 +336,7 @@ # # For best performance, it is necessary that the fabric_iface of this engine # # resides on the same NUMA node as the first_core. # # -# # Optional parameter; set either this option non-zero or pinned_numa_node but not both. +# # Optional parameter; set either this option or pinned_numa_node but not both. # # first_core: 0 # @@ -491,7 +491,7 @@ # # For best performance, it is necessary that the fabric_iface of this engine # # resides on the same NUMA node as the first_core. # # -# # Optional parameter; set either this option non-zero or pinned_numa_node but not both. +# # Optional parameter; set either this option or pinned_numa_node but not both. 
# # first_core: 22 # diff --git a/utils/nlt_server.yaml b/utils/nlt_server.yaml index 571370650a1..f4b934c4aa1 100644 --- a/utils/nlt_server.yaml +++ b/utils/nlt_server.yaml @@ -14,6 +14,7 @@ engines: - DAOS_MD_CAP=1024 - DAOS_STRICT_SHUTDOWN=1 - CRT_CTX_SHARE_ADDR=0 + - DAOS_TARGET_OVERSUBSCRIBE=1 - ABT_STACK_OVERFLOW_CHECK=mprotect storage: - From 369e72b214f0d75f8cc7b84c0038346e86535e70 Mon Sep 17 00:00:00 2001 From: Jeff Olivier Date: Tue, 7 May 2024 19:20:44 -0600 Subject: [PATCH 2/4] Fix an issue Required-githooks: true Change-Id: I92f65924f9b4b3dce6a756e01e5bfc9e584af0b6 Signed-off-by: Jeff Olivier --- src/engine/ult.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/src/engine/ult.c b/src/engine/ult.c index 18230cec225..7d10086e7e0 100644 --- a/src/engine/ult.c +++ b/src/engine/ult.c @@ -326,30 +326,27 @@ static inline uint32_t sched_ult2xs_multisocket(int xs_type, int tgt_id) { static __thread uint32_t offload; - uint32_t socket; + uint32_t socket = tgt_id / dss_tgt_per_numa_nr; uint32_t base; uint32_t target; if (dss_tgt_offload_xs_nr == 0) { if (xs_type == DSS_XS_IOFW && dss_forward_neighbor) { /* Keep the old forwarding behavior, but NUMA aware */ - socket = tgt_id / dss_numa_nr; target = (socket * dss_tgt_per_numa_nr) + (tgt_id + offload) % dss_tgt_per_numa_nr; - offload = target + 17; /* Seed next selection */ target = DSS_MAIN_XS_ID(target); goto check; } return DSS_XS_SELF; } - socket = tgt_id / dss_numa_nr; base = dss_sys_xs_nr + dss_tgt_nr + (socket * dss_offload_per_numa_nr); target = base + ((offload + tgt_id) % dss_offload_per_numa_nr); - offload = target + 17; /* Seed next selection */ check: D_ASSERT(target < DSS_XS_NR_TOTAL && target >= dss_sys_xs_nr); + offload = target + 17; /* Seed next selection */ return target; } From c3c4f776ec0e8268fa120a7c20ef2c5d08ff6ce8 Mon Sep 17 00:00:00 2001 From: Jeff Olivier Date: Tue, 7 May 2024 19:21:27 -0600 Subject: [PATCH 3/4] Skip-func-hw-test: true Required-githooks: true Change-Id: Iffb8046df03d0e6eb59475245786770ca5310f75 Signed-off-by: Jeff Olivier From eb2de1b226e25f8155c6231999d8870240d1b8e4 Mon Sep 17 00:00:00 2001 From: Jeff Olivier Date: Tue, 7 May 2024 19:42:38 -0600 Subject: [PATCH 4/4] Skip-func-hw: true Required-githooks: true Change-Id: I96bed7f1a8aa8ce546129064bbe562d7e34cd8b2 Signed-off-by: Jeff Olivier --- src/engine/ult.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/engine/ult.c b/src/engine/ult.c index 7d10086e7e0..8056edbf074 100644 --- a/src/engine/ult.c +++ b/src/engine/ult.c @@ -335,14 +335,14 @@ sched_ult2xs_multisocket(int xs_type, int tgt_id) /* Keep the old forwarding behavior, but NUMA aware */ target = (socket * dss_tgt_per_numa_nr) + (tgt_id + offload) % dss_tgt_per_numa_nr; - target = DSS_MAIN_XS_ID(target); + target = DSS_MAIN_XS_ID(target); goto check; } return DSS_XS_SELF; } - base = dss_sys_xs_nr + dss_tgt_nr + (socket * dss_offload_per_numa_nr); - target = base + ((offload + tgt_id) % dss_offload_per_numa_nr); + base = dss_sys_xs_nr + dss_tgt_nr + (socket * dss_offload_per_numa_nr); + target = base + ((offload + tgt_id) % dss_offload_per_numa_nr); check: D_ASSERT(target < DSS_XS_NR_TOTAL && target >= dss_sys_xs_nr);
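
The net effect of the fixes in patches 2-4 is that sched_ult2xs_multisocket() derives the socket from the target index and always picks a helper xstream from that socket's own slice of the offload pool, rotating through it with a per-thread counter. The standalone sketch below models that index arithmetic outside the engine; it is an illustration only, and the counts used (3 system xstreams, 16 targets, 2 helpers per socket) are assumed example values rather than anything taken from the patch.

/*
 * Standalone model of the multi-socket helper-XS selection implemented in
 * sched_ult2xs_multisocket() above (as it behaves after patches 2-4).
 * All counts below are made-up example values, not taken from the patch.
 */
#include <stdio.h>

static unsigned int sys_xs_nr           = 3;  /* assumed: system, SWIM and dRPC xstreams */
static unsigned int tgt_nr              = 16; /* I/O targets for the whole engine        */
static unsigned int tgt_per_numa_nr     = 8;  /* targets per socket                      */
static unsigned int offload_per_numa_nr = 2;  /* helper xstreams per socket              */

/* Pick a helper xstream for tgt_id, staying on the target's own socket. */
static unsigned int
pick_helper_xs(unsigned int tgt_id, unsigned int *rotor)
{
	unsigned int socket = tgt_id / tgt_per_numa_nr;
	unsigned int base   = sys_xs_nr + tgt_nr + socket * offload_per_numa_nr;
	unsigned int xs_id  = base + (*rotor + tgt_id) % offload_per_numa_nr;

	*rotor = xs_id + 17; /* seed the next selection, as the patch does */
	return xs_id;
}

int
main(void)
{
	unsigned int rotor = 0;
	unsigned int tgt_ids[] = { 0, 3, 7, 8, 12, 15 };
	unsigned int i;

	for (i = 0; i < sizeof(tgt_ids) / sizeof(tgt_ids[0]); i++)
		printf("tgt %2u (socket %u) -> helper XS %u\n", tgt_ids[i],
		       tgt_ids[i] / tgt_per_numa_nr, pick_helper_xs(tgt_ids[i], &rotor));
	return 0;
}

With the example counts above, targets 0-7 (socket 0) only ever map to helper xstreams 19-20 and targets 8-15 (socket 1) to helper xstreams 21-22, which is the per-socket confinement the multi-socket mode is meant to guarantee.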