DAOS-15739 engine: Add single-engine, multi-socket support (#14311)

Backport for the following patches: DAOS-13380 engine: refine tgt_nr check (#12405), DAOS-15739 engine: Add multi-socket support (#14234), DAOS-623 engine: Fix a typo (#14329). * DAOS-13380 engine: refine tgt_nr check: 1. for the non-DAOS_TARGET_OVERSUBSCRIBE case, fail to start the engine if the number of cores is not enough; 2. for the DAOS_TARGET_OVERSUBSCRIBE case, allow forcing the engine to start. The #nr_xs_helpers may be reduced in either case. * DAOS-15739 engine: Add multi-socket support (#14234): Add a simple multi-socket mode for use cases where a single engine must be used. This avoids the issue of having all helper xstreams automatically assigned to a single NUMA node, thus increasing the efficiency of synchronization between I/O and helper xstreams. It is the default behavior if all of the following are true: neither pinned_numa_node nor first_core is used; no oversubscription is requested; NUMA nodes have a uniform number of cores; targets and helpers divide evenly among NUMA nodes; there is more than one NUMA node. Update server config logic to ensure first_core is passed on to the engine if it is set, while keeping existing behavior when both first_core: 0 and pinned_numa_node are set. Signed-off-by: Jeff Olivier <[email protected]> Signed-off-by: Xuezhao Liu <[email protected]> Signed-off-by: Tom Nabarro <[email protected]>
daos-stack · May 8, 2024 · f16a7dd · f16a7dd
1 parent d438ace
commit f16a7dd
Show file tree

Hide file tree

Showing 23 changed files with 363 additions and 183 deletions.
diff --git a/docs/admin/deployment.md b/docs/admin/deployment.md
@@ -1377,6 +1377,12 @@ per four target threads, for example `targets: 16` and `nr_xs_helpers: 4`.
 The server should have sufficiently many physical cores to support the
 number of targets plus the additional service threads.
 
+The 'targets:' and 'nr_xs_helpers:' requirements are mandatory: if the number
+of physical cores is not sufficient, the DAOS engine will fail to start (note
+that 2 cores are reserved for system service). Alternatively, set the
+environment variable "DAOS_TARGET_OVERSUBSCRIBE=1" to force the DAOS engine to
+start (this may hurt performance, as multiple XS compete for the same core).
+
 
 ## Storage Formatting
 

diff --git a/docs/admin/env_variables.md b/docs/admin/env_variables.md
@@ -52,6 +52,7 @@ Environment variables in this section only apply to the server side.
 |DAOS\_DTX\_AGG\_THD\_AGE|DTX aggregation age threshold in seconds. The valid range is [210, 1830]. The default value is 630.|
 |DAOS\_DTX\_RPC\_HELPER\_THD|DTX RPC helper threshold. The valid range is [18, unlimited). The default value is 513.|
 |DAOS\_DTX\_BATCHED\_ULT\_MAX|The max count of DTX batched commit ULTs. The valid range is [0, unlimited). 0 means to commit DTX synchronously. The default value is 32.|
+|DAOS\_FORWARD\_NEIGHBOR|Set to enable I/O forwarding on neighbor xstream in the absence of helper threads.|
 
 ## Server and Client environment variables
 

diff --git a/src/control/cmd/daos_server/start.go b/src/control/cmd/daos_server/start.go
@@ -1,5 +1,5 @@
 //
-// (C) Copyright 2019-2023 Intel Corporation.
+// (C) Copyright 2019-2024 Intel Corporation.
 //
 // SPDX-License-Identifier: BSD-2-Clause-Patent
 //
@@ -29,7 +29,7 @@ type startCmd struct {
 	Modules             *string `short:"m" long:"modules" description:"List of server modules to load"`
 	Targets             uint16  `short:"t" long:"targets" description:"Number of targets to use (default use all cores)"`
 	NrXsHelpers         *uint16 `short:"x" long:"xshelpernr" description:"Number of helper XS per VOS target"`
-	FirstCore           uint16  `short:"f" long:"firstcore" default:"0" description:"Index of first core for service thread"`
+	FirstCore           *uint16 `short:"f" long:"firstcore" description:"Index of first core for service thread"`
 	Group               string  `short:"g" long:"group" description:"Server group name"`
 	SocketDir           string  `short:"d" long:"socket_dir" description:"Location for all daos_server & daos_engine sockets"`
 	Insecure            bool    `short:"i" long:"insecure" description:"Allow for insecure connections"`
@@ -76,8 +76,8 @@ func (cmd *startCmd) setCLIOverrides() error {
 		if cmd.NrXsHelpers != nil {
 			srv.WithHelperStreamCount(int(*cmd.NrXsHelpers))
 		}
-		if cmd.FirstCore > 0 {
-			srv.WithServiceThreadCore(int(cmd.FirstCore))
+		if cmd.FirstCore != nil {
+			srv.WithServiceThreadCore(int(*cmd.FirstCore))
 		}
 	}
 

diff --git a/src/control/cmd/dmg/auto_test.go b/src/control/cmd/dmg/auto_test.go
@@ -580,7 +580,6 @@ transport_config:
 engines:
 - targets: 12
   nr_xs_helpers: 2
-  first_core: 0
   log_file: /tmp/daos_engine.0.log
   storage:
   - class: dcpm
@@ -599,7 +598,6 @@ engines:
   pinned_numa_node: 0
 - targets: 6
   nr_xs_helpers: 0
-  first_core: 0
   log_file: /tmp/daos_engine.1.log
   storage:
   - class: dcpm

diff --git a/src/control/server/config/server.go b/src/control/server/config/server.go
@@ -863,7 +863,11 @@ func (cfg *Server) SetEngineAffinities(log logging.Logger, affSources ...EngineA
 	// Detect legacy mode by checking if first_core is being used.
 	legacyMode := false
 	for _, engineCfg := range cfg.Engines {
-		if engineCfg.ServiceThreadCore != 0 {
+		if engineCfg.ServiceThreadCore != nil {
+			if *engineCfg.ServiceThreadCore == 0 && engineCfg.PinnedNumaNode != nil {
+				// Both are set but we don't know yet which to use
+				continue
+			}
 			legacyMode = true
 			break
 		}
@@ -872,9 +876,15 @@ func (cfg *Server) SetEngineAffinities(log logging.Logger, affSources ...EngineA
 	// Fail if any engine has an explicit pin and non-zero first_core.
 	for idx, engineCfg := range cfg.Engines {
 		if legacyMode {
+			if engineCfg.PinnedNumaNode != nil {
+				log.Infof("pinned_numa_node setting ignored on engine %d", idx)
+				engineCfg.PinnedNumaNode = nil
+			}
 			log.Debugf("setting legacy core allocation algorithm on engine %d", idx)
-			engineCfg.PinnedNumaNode = nil
 			continue
+		} else if engineCfg.ServiceThreadCore != nil {
+			log.Infof("first_core setting ignored on engine %d", idx)
+			engineCfg.ServiceThreadCore = nil
 		}
 
 		numaAffinity, err := detectEngineAffinity(log, engineCfg, affSources...)

diff --git a/src/control/server/ctl_storage_rpc_test.go b/src/control/server/ctl_storage_rpc_test.go
@@ -1569,7 +1569,10 @@ func TestServer_CtlSvc_StorageScan_PostEngineStart(t *testing.T) {
 			var engineCfgs []*engine.Config
 			for i, sc := range tc.storageCfgs {
 				log.Debugf("storage cfg contains bdevs %v for engine %d", sc.Bdevs(), i)
-				engineCfgs = append(engineCfgs, engine.MockConfig().WithStorage(sc...))
+				engineCfgs = append(engineCfgs,
+					engine.MockConfig().
+						WithStorage(sc...).
+						WithTargetCount(tc.engineTargetCount[i]))
 			}
 			sCfg := config.DefaultServer().WithEngines(engineCfgs...)
 			cs := mockControlService(t, log, sCfg, csbmbc, tc.smbc, tc.smsc)
@@ -1625,7 +1628,6 @@ func TestServer_CtlSvc_StorageScan_PostEngineStart(t *testing.T) {
 				}
 				te.setDrpcClient(newMockDrpcClient(dcc))
 				te._superblock.Rank = ranklist.NewRankPtr(uint32(idx + 1))
-				te.setTargetCount(tc.engineTargetCount[idx])
 				for _, tc := range te.storage.GetBdevConfigs() {
 					tc.Bdev.DeviceRoles.OptionBits = storage.OptionBits(storage.BdevRoleAll)
 				}

diff --git a/src/control/server/engine/config.go b/src/control/server/engine/config.go
@@ -115,7 +115,7 @@ type Config struct {
 	Modules           string         `yaml:"modules,omitempty" cmdLongFlag:"--modules" cmdShortFlag:"-m"`
 	TargetCount       int            `yaml:"targets,omitempty" cmdLongFlag:"--targets,nonzero" cmdShortFlag:"-t,nonzero"`
 	HelperStreamCount int            `yaml:"nr_xs_helpers" cmdLongFlag:"--xshelpernr" cmdShortFlag:"-x"`
-	ServiceThreadCore int            `yaml:"first_core" cmdLongFlag:"--firstcore,nonzero" cmdShortFlag:"-f,nonzero"`
+	ServiceThreadCore *int           `yaml:"first_core,omitempty" cmdLongFlag:"--firstcore" cmdShortFlag:"-f"`
 	SystemName        string         `yaml:"-" cmdLongFlag:"--group" cmdShortFlag:"-g"`
 	SocketDir         string         `yaml:"-" cmdLongFlag:"--socket_dir" cmdShortFlag:"-d"`
 	LogMask           string         `yaml:"log_mask,omitempty" cmdEnv:"D_LOG_MASK"`
@@ -160,10 +160,29 @@ func (c *Config) ReadLogSubsystems() (string, error) {
 
 // Validate ensures that the configuration meets minimum standards.
 func (c *Config) Validate() error {
-	if c.PinnedNumaNode != nil && c.ServiceThreadCore != 0 {
+	if c.PinnedNumaNode != nil && c.ServiceThreadCore != nil && *c.ServiceThreadCore != 0 {
 		return errors.New("cannot specify both pinned_numa_node and first_core")
 	}
 
+	errNegative := func(s string) error {
+		return errors.Errorf("%s must not be negative", s)
+	}
+	if c.TargetCount < 0 {
+		return errNegative("target count")
+	}
+	if c.HelperStreamCount < 0 {
+		return errNegative("helper stream count")
+	}
+	if c.ServiceThreadCore != nil && *c.ServiceThreadCore < 0 {
+		return errNegative("service thread core index")
+	}
+	if c.MemSize < 0 {
+		return errNegative("mem size")
+	}
+	if c.HugepageSz < 0 {
+		return errNegative("hugepage size")
+	}
+
 	if c.TargetCount == 0 {
 		return errors.New("target count must be nonzero")
 	}
@@ -222,7 +241,7 @@ func IsNUMAMismatch(err error) bool {
 // SetNUMAAffinity sets the NUMA affinity for the engine,
 // if not already set in the configuration.
 func (c *Config) SetNUMAAffinity(node uint) error {
-	if c.PinnedNumaNode != nil && c.ServiceThreadCore != 0 {
+	if c.PinnedNumaNode != nil && c.ServiceThreadCore != nil && *c.ServiceThreadCore != 0 {
 		return errors.New("cannot set both NUMA node and service core")
 	}
 
@@ -464,7 +483,7 @@ func (c *Config) WithHelperStreamCount(count int) *Config {
 
 // WithServiceThreadCore sets the core index to be used for running DAOS service threads.
 func (c *Config) WithServiceThreadCore(idx int) *Config {
-	c.ServiceThreadCore = idx
+	c.ServiceThreadCore = &idx
 	return c
 }
 

diff --git a/src/control/server/instance.go b/src/control/server/instance.go
@@ -338,14 +338,6 @@ func (ei *EngineInstance) setHugepageSz(hpSizeMb int) {
 	ei.runner.GetConfig().HugepageSz = hpSizeMb
 }
 
-// setTargetCount updates target count in engine config.
-func (ei *EngineInstance) setTargetCount(numTargets int) {
-	ei.Lock()
-	defer ei.Unlock()
-
-	ei.runner.GetConfig().TargetCount = numTargets
-}
-
 // GetTargetCount returns the target count set for this instance.
 func (ei *EngineInstance) GetTargetCount() int {
 	ei.RLock()

diff --git a/src/control/server/instance_exec.go b/src/control/server/instance_exec.go
@@ -90,12 +90,6 @@ func (ei *EngineInstance) finishStartup(ctx context.Context, ready *srvpb.Notify
 	if err := ei.handleReady(ctx, ready); err != nil {
 		return err
 	}
-	// update engine target count to reflect allocated number of targets, not number requested
-	// when starting
-	// NOTE: Engine mem_size passed on engine invocation is based on the number of targets
-	//       requested in config so if number of targets allocated doesn't match the number of
-	//       targets requested the mem_size value may be inappropriate.
-	ei.setTargetCount(int(ready.GetNtgts()))
 
 	ei.ready.SetTrue()