daos-stack · jolivier23 · May 2, 2024 · Apr 22, 2024 · Apr 23, 2024 · Apr 24, 2024
@@ -52,6 +52,7 @@ Environment variables in this section only apply to the server side.
 |DAOS\_DTX\_AGG\_THD\_AGE|DTX aggregation age threshold in seconds. The valid range is [210, 1830]. The default value is 630.|
 |DAOS\_DTX\_RPC\_HELPER\_THD|DTX RPC helper threshold. The valid range is [18, unlimited). The default value is 513.|
 |DAOS\_DTX\_BATCHED\_ULT\_MAX|The max count of DTX batched commit ULTs. The valid range is [0, unlimited). 0 means to commit DTX synchronously. The default value is 32.|
+|DAOS\_FORWARD\_NEIGHBOR|Boolean. Set to enable I/O forwarding to a neighbor xstream in the absence of helper threads.|
 
 ## Server and Client environment variables
 

@@ -1,5 +1,5 @@
 //
-// (C) Copyright 2019-2023 Intel Corporation.
+// (C) Copyright 2019-2024 Intel Corporation.
 //
 // SPDX-License-Identifier: BSD-2-Clause-Patent
 //
@@ -26,7 +26,7 @@ type startCmd struct {
 	Modules             *string `short:"m" long:"modules" description:"List of server modules to load"`
 	Targets             uint16  `short:"t" long:"targets" description:"Number of targets to use (default use all cores)"`
 	NrXsHelpers         *uint16 `short:"x" long:"xshelpernr" description:"Number of helper XS per VOS target"`
-	FirstCore           uint16  `short:"f" long:"firstcore" default:"0" description:"Index of first core for service thread"`
+	FirstCore           *uint16 `short:"f" long:"firstcore" description:"Index of first core for service thread"`
 	Group               string  `short:"g" long:"group" description:"Server group name"`
 	SocketDir           string  `short:"d" long:"socket_dir" description:"Location for all daos_server & daos_engine sockets"`
 	Insecure            bool    `short:"i" long:"insecure" description:"Allow for insecure connections"`
@@ -73,8 +73,8 @@ func (cmd *startCmd) setCLIOverrides() error {
 		if cmd.NrXsHelpers != nil {
 			srv.WithHelperStreamCount(int(*cmd.NrXsHelpers))
 		}
-		if cmd.FirstCore > 0 {
-			srv.WithServiceThreadCore(int(cmd.FirstCore))
+		if cmd.FirstCore != nil {
+			srv.WithServiceThreadCore(int(*cmd.FirstCore))
 		}
 	}
 

@@ -580,7 +580,6 @@ transport_config:
 engines:
 - targets: 12
   nr_xs_helpers: 2
-  first_core: 0
   log_file: /tmp/daos_engine.0.log
   storage:
   - class: dcpm
@@ -599,7 +598,6 @@ engines:
   pinned_numa_node: 0
 - targets: 6
   nr_xs_helpers: 0
-  first_core: 0
   log_file: /tmp/daos_engine.1.log
   storage:
   - class: dcpm

@@ -1,5 +1,5 @@
 //
-// (C) Copyright 2020-2023 Intel Corporation.
+// (C) Copyright 2020-2024 Intel Corporation.
 //
 // SPDX-License-Identifier: BSD-2-Clause-Patent
 //
@@ -869,7 +869,11 @@ func (cfg *Server) SetEngineAffinities(log logging.Logger, affSources ...EngineA
 	// Detect legacy mode by checking if first_core is being used.
 	legacyMode := false
 	for _, engineCfg := range cfg.Engines {
-		if engineCfg.ServiceThreadCore != 0 {
+		if engineCfg.ServiceThreadCore != nil {
+			if *engineCfg.ServiceThreadCore == 0 && engineCfg.PinnedNumaNode != nil {
+				// first_core is 0 and pinned_numa_node is also set: ambiguous, so let the remaining engines decide the mode
+				continue
+			}
 			legacyMode = true
 			break
 		}
@@ -878,9 +882,15 @@ func (cfg *Server) SetEngineAffinities(log logging.Logger, affSources ...EngineA
 	// Fail if any engine has an explicit pin and non-zero first_core.
 	for idx, engineCfg := range cfg.Engines {
 		if legacyMode {
+			if engineCfg.PinnedNumaNode != nil {
+				log.Infof("pinned_numa_node setting ignored on engine %d", idx)
+				engineCfg.PinnedNumaNode = nil
+			}
 			log.Debugf("setting legacy core allocation algorithm on engine %d", idx)
-			engineCfg.PinnedNumaNode = nil
 			continue
+		} else if engineCfg.ServiceThreadCore != nil {
+			log.Infof("first_core setting ignored on engine %d", idx)
+			engineCfg.ServiceThreadCore = nil
 		}
 
 		numaAffinity, err := detectEngineAffinity(log, engineCfg, affSources...)

@@ -243,7 +243,7 @@ type Config struct {
 	Modules           string         `yaml:"modules,omitempty" cmdLongFlag:"--modules" cmdShortFlag:"-m"`
 	TargetCount       int            `yaml:"targets,omitempty" cmdLongFlag:"--targets,nonzero" cmdShortFlag:"-t,nonzero"`
 	HelperStreamCount int            `yaml:"nr_xs_helpers" cmdLongFlag:"--xshelpernr" cmdShortFlag:"-x"`
-	ServiceThreadCore int            `yaml:"first_core" cmdLongFlag:"--firstcore,nonzero" cmdShortFlag:"-f,nonzero"`
+	ServiceThreadCore *int           `yaml:"first_core,omitempty" cmdLongFlag:"--firstcore" cmdShortFlag:"-f"`
 	SystemName        string         `yaml:"-" cmdLongFlag:"--group" cmdShortFlag:"-g"`
 	SocketDir         string         `yaml:"-" cmdLongFlag:"--socket_dir" cmdShortFlag:"-d"`
 	LogMask           string         `yaml:"log_mask,omitempty" cmdEnv:"D_LOG_MASK"`
@@ -289,7 +289,7 @@ func (c *Config) ReadLogSubsystems() (string, error) {
 
 // Validate ensures that the configuration meets minimum standards.
 func (c *Config) Validate() error {
-	if c.PinnedNumaNode != nil && c.ServiceThreadCore != 0 {
+	if c.PinnedNumaNode != nil && c.ServiceThreadCore != nil && *c.ServiceThreadCore != 0 {
 		return errors.New("cannot specify both pinned_numa_node and first_core")
 	}
 
@@ -302,7 +302,7 @@ func (c *Config) Validate() error {
 	if c.HelperStreamCount < 0 {
 		return errNegative("helper stream count")
 	}
-	if c.ServiceThreadCore < 0 {
+	if c.ServiceThreadCore != nil && *c.ServiceThreadCore < 0 {
 		return errNegative("service thread core index")
 	}
 	if c.MemSize < 0 {
@@ -370,7 +370,7 @@ func IsNUMAMismatch(err error) bool {
 // SetNUMAAffinity sets the NUMA affinity for the engine,
 // if not already set in the configuration.
 func (c *Config) SetNUMAAffinity(node uint) error {
-	if c.PinnedNumaNode != nil && c.ServiceThreadCore != 0 {
+	if c.PinnedNumaNode != nil && c.ServiceThreadCore != nil && *c.ServiceThreadCore != 0 {
 		return errors.New("cannot set both NUMA node and service core")
 	}
 
@@ -612,7 +612,7 @@ func (c *Config) WithHelperStreamCount(count int) *Config {
 
 // WithServiceThreadCore sets the core index to be used for running DAOS service threads.
 func (c *Config) WithServiceThreadCore(idx int) *Config {
-	c.ServiceThreadCore = idx
+	c.ServiceThreadCore = &idx
 	return c
 }
 

diff --git a/src/engine/init.c b/src/engine/init.c
@@ -74,15 +74,18 @@ hwloc_topology_t	dss_topo;
 int			dss_core_depth;
 /** number of physical cores, w/o hyperthreading */
 int			dss_core_nr;
-/** start offset index of the first core for service XS */
-unsigned int		dss_core_offset;
+/** Start offset index of the first core for service XS.  Initialized to -1 (UINT_MAX once
+ * converted to unsigned) so that an explicit setting can be detected and disable multi-socket mode.
+ */
+unsigned int            dss_core_offset = -1;
 /** NUMA node to bind to */
 int			dss_numa_node = -1;
-hwloc_bitmap_t	core_allocation_bitmap;
-/** a copy of the NUMA node object in the topology */
-hwloc_obj_t		numa_obj;
-/** number of cores in the given NUMA node */
-int			dss_num_cores_numa_node;
+/** Forward I/O work to neighbor */
+bool                    dss_forward_neighbor;
+/** Cached numa information */
+struct dss_numa_info   *dss_numa;
+/** Number of active numa nodes, multi-socket mode only */
+int                     dss_numa_nr = 1;
 /** Module facility bitmask */
 static uint64_t		dss_mod_facs;
 /** Number of storage tiers: 2 for SCM and NVMe */
@@ -305,16 +308,65 @@ dss_tgt_nr_check(unsigned int ncores, unsigned int tgt_nr, bool oversubscribe)
 	return 0;
 }
 
+static bool
+dss_multi_socket_check(bool oversub, int numa_nr)
+{
+	/** Keep this simple and disallow some configurations */
+	if (oversub) {
+		D_INFO("Oversubscription requested, bypassing multi-socket mode\n");
+		return false;
+	}
+
+	if (dss_numa_node != -1) {
+		D_INFO("Numa node specified, running in single socket mode\n");
+		return false;
+	}
+
+	if (numa_nr < 2) {
+		D_INFO("No NUMA found, bypassing multi-socket mode\n");
+		return false;
+	}
+
+	if ((dss_tgt_offload_xs_nr % numa_nr) != 0) {
+		D_INFO("Uneven split of helpers on sockets, bypassing multi-socket mode\n");
+		return false;
+	}
+
+	if ((dss_tgt_nr % numa_nr) != 0) {
+		D_INFO("Uneven split of targets on sockets, bypassing multi-socket mode\n");
+		return false;
+	}
+
+	return true;
+}
+
 static int
-dss_topo_init()
+dss_legacy_mode(bool oversub)
+{
+	D_PRINT("Using legacy core allocation algorithm\n");
+	if (dss_core_offset >= dss_core_nr) {
+		D_ERROR("invalid dss_core_offset %u (set by \"-f\" option), should within "
+			"range [0, %u]\n",
+			dss_core_offset, dss_core_nr - 1);
+		return -DER_INVAL;
+	}
+
+	return dss_tgt_nr_check(dss_core_nr, dss_tgt_nr, oversub);
+}
+
+static int
+dss_topo_init(void)
 {
 	int		depth;
 	int		numa_node_nr;
-	int		num_cores_visited;
-	char		*cpuset;
+	int             num_cores_visited;
 	int		k;
+	int             numa_node;
+	int             rc = 0;
+	hwloc_obj_t     numa_obj;
 	hwloc_obj_t	corenode;
 	bool            tgt_oversub = false;
+	bool            multi_socket = false;
 
 	hwloc_topology_init(&dss_topo);
 	hwloc_topology_load(dss_topo);
@@ -324,70 +376,101 @@ dss_topo_init()
 	depth = hwloc_get_type_depth(dss_topo, HWLOC_OBJ_NUMANODE);
 	numa_node_nr = hwloc_get_nbobjs_by_depth(dss_topo, depth);
 	d_getenv_bool("DAOS_TARGET_OVERSUBSCRIBE", &tgt_oversub);
+	d_getenv_bool("DAOS_FORWARD_NEIGHBOR", &dss_forward_neighbor);
 	dss_tgt_nr = nr_threads;
 
-	/* if no NUMA node was specified, or NUMA data unavailable */
-	/* fall back to the legacy core allocation algorithm */
-	if (dss_numa_node == -1 || numa_node_nr <= 0) {
-		D_PRINT("Using legacy core allocation algorithm\n");
-		if (dss_core_offset >= dss_core_nr) {
-			D_ERROR("invalid dss_core_offset %u (set by \"-f\" option), should within "
-				"range [0, %u]\n",
-				dss_core_offset, dss_core_nr - 1);
-			return -DER_INVAL;
-		}
-
-		return dss_tgt_nr_check(dss_core_nr, dss_tgt_nr, tgt_oversub);
+	/** dss_core_offset starts out as -1 so that an explicit setting, including an explicit 0, can
+	 * be detected; this keeps the mode consistent between engines when one of them sets it to 0.
+	 */
+	if (dss_core_offset == -1) {
+		dss_core_offset = 0;
+		if (dss_multi_socket_check(tgt_oversub, numa_node_nr))
+			multi_socket = true;
+	} else {
+		D_INFO("Core offset specified, running in single socket mode\n");
 	}
 
+	/* Fall back to legacy mode if NUMA data is unavailable, or if no NUMA
+	 * node was specified and multi-socket mode is not possible.
+	 */
+	if ((!multi_socket && dss_numa_node == -1) || numa_node_nr <= 0)
+		return dss_legacy_mode(tgt_oversub);
+
 	if (dss_numa_node > numa_node_nr) {
 		D_ERROR("Invalid NUMA node selected. Must be no larger than %d\n", numa_node_nr);
 		return -DER_INVAL;
 	}
 
-	numa_obj = hwloc_get_obj_by_depth(dss_topo, depth, dss_numa_node);
-	if (numa_obj == NULL) {
-		D_ERROR("NUMA node %d was not found in the topology\n", dss_numa_node);
-		return -DER_INVAL;
-	}
+	D_ALLOC_ARRAY(dss_numa, numa_node_nr);
+	if (dss_numa == NULL)
+		return -DER_NOMEM;
 
-	/* create an empty bitmap, then set each bit as we */
-	/* find a core that matches */
-	core_allocation_bitmap = hwloc_bitmap_alloc();
-	if (core_allocation_bitmap == NULL) {
-		D_ERROR("Unable to allocate core allocation bitmap\n");
-		return -DER_INVAL;
-	}
+	for (numa_node = 0; numa_node < numa_node_nr; numa_node++) {
+		dss_numa[numa_node].ni_idx = numa_node;
+		numa_obj                   = hwloc_get_obj_by_depth(dss_topo, depth, numa_node);
+		if (numa_obj == NULL) {
+			D_ERROR("NUMA node %d was not found in the topology\n", numa_node);
+			D_GOTO(failed, rc = -DER_INVAL);
+		}
 
-	dss_num_cores_numa_node = 0;
-	num_cores_visited = 0;
+		/* create an empty bitmap, then set each bit as we */
+		/* find a core that matches */
+		dss_numa[numa_node].ni_coremap = hwloc_bitmap_alloc();
+		if (dss_numa[numa_node].ni_coremap == NULL) {
+			D_ERROR("Unable to allocate core allocation bitmap\n");
+			D_GOTO(failed, rc = -DER_INVAL);
+		}
 
-	for (k = 0; k < dss_core_nr; k++) {
-		corenode = hwloc_get_obj_by_depth(dss_topo, dss_core_depth, k);
-		if (corenode == NULL)
-			continue;
-		if (hwloc_bitmap_isincluded(corenode->cpuset,
-					    numa_obj->cpuset) != 0) {
-			if (num_cores_visited++ >= dss_core_offset) {
-				hwloc_bitmap_set(core_allocation_bitmap, k);
-				hwloc_bitmap_asprintf(&cpuset,
-						      corenode->cpuset);
+		dss_numa[numa_node].ni_core_nr = 0;
+		num_cores_visited              = 0;
+
+		for (k = 0; k < dss_core_nr; k++) {
+			corenode = hwloc_get_obj_by_depth(dss_topo, dss_core_depth, k);
+			if (corenode == NULL)
+				continue;
+			if (hwloc_bitmap_isincluded(corenode->cpuset, numa_obj->cpuset) != 0) {
+				if (num_cores_visited++ >= dss_core_offset)
+					hwloc_bitmap_set(dss_numa[numa_node].ni_coremap, k);
+				dss_numa[numa_node].ni_core_nr++;
 			}
-			dss_num_cores_numa_node++;
 		}
+		if (multi_socket && numa_node > 0 &&
+		    dss_numa[numa_node].ni_core_nr != dss_numa[numa_node - 1].ni_core_nr) {
+			D_INFO("Non-uniform numa nodes, bypassing multi-socket mode\n");
+			D_FREE(dss_numa);
+			return dss_legacy_mode(false);
+		}
+	}
+
+	if (multi_socket) {
+		/** In this mode, we simply save the topology for later use but
+		 * still use all of the cores.
+		 */
+		D_PRINT("Using Multi-socket NUMA core allocation algorithm\n");
+		dss_numa_nr             = numa_node_nr;
+		dss_offload_per_numa_nr = dss_tgt_offload_xs_nr / dss_numa_nr;
+		dss_tgt_per_numa_nr     = dss_tgt_nr / dss_numa_nr;
+		return dss_tgt_nr_check(dss_core_nr, dss_tgt_nr, tgt_oversub);
 	}
-	hwloc_bitmap_asprintf(&cpuset, core_allocation_bitmap);
-	free(cpuset);
 
-	if (dss_core_offset >= dss_num_cores_numa_node) {
+	if (dss_core_offset >= dss_numa[dss_numa_node].ni_core_nr) {
 		D_ERROR("invalid dss_core_offset %d (set by \"-f\" option), should within range "
 			"[0, %d]\n",
-			dss_core_offset, dss_num_cores_numa_node - 1);
+			dss_core_offset, dss_numa[dss_numa_node].ni_core_nr - 1);
 		return -DER_INVAL;
 	}
 	D_PRINT("Using NUMA core allocation algorithm\n");
 
-	return dss_tgt_nr_check(dss_num_cores_numa_node, dss_tgt_nr, tgt_oversub);
+	return dss_tgt_nr_check(dss_numa[dss_numa_node].ni_core_nr, dss_tgt_nr, tgt_oversub);
+failed:
+	D_FREE(dss_numa);
+	return rc;
+}
+
+static void
+dss_topo_fini(void)
+{
+	D_FREE(dss_numa);
 }
 
 static ABT_mutex		server_init_state_mutex;
@@ -814,7 +897,7 @@ server_init(int argc, char *argv[])
 		DAOS_VERSION, getpid(), dss_self_rank(), dss_tgt_nr,
 		dss_tgt_offload_xs_nr, dss_core_offset, dss_hostname);
 
-	if (numa_obj)
+	if (dss_numa && dss_numa_node != -1)
 		D_PRINT("Using NUMA node: %d", dss_numa_node);
 
 	return 0;
@@ -841,6 +924,7 @@ server_init(int argc, char *argv[])
 exit_metrics_init:
 	dss_engine_metrics_fini();
 	d_tm_fini();
+	/* dss_topo_init cleans up after itself if it fails */
 exit_debug_init:
 	daos_debug_fini();
 	return rc;
@@ -899,6 +983,8 @@ server_fini(bool force)
 	D_INFO("dss_engine_metrics_fini() done\n");
 	d_tm_fini();
 	D_INFO("d_tm_fini() done\n");
+	dss_topo_fini();
+	D_INFO("dss_top_fini() done\n");
 	daos_debug_fini();
 	D_INFO("daos_debug_fini() done\n");
 }