Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DAOS-15739 engine: Add multi-socket support #14234

Merged
merged 22 commits into from
May 2, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
c198e6d
DAOS-15739 engine: Add multi-socket support
jolivier23 Apr 22, 2024
b706c29
minor fix
jolivier23 Apr 23, 2024
610a9a8
Remove DAOS_MULTISOCKET environment variable
jolivier23 Apr 24, 2024
4b1730c
Add DAOS_FORWARD_SELF
jolivier23 Apr 24, 2024
e153977
Merge branch 'master' into jvolivie/add_multisocket
jolivier23 Apr 24, 2024
e86d14d
Skip-build-ubuntu20-rpm: true
jolivier23 Apr 24, 2024
34459d0
Merge branch 'master' into jvolivie/add_multisocket
jolivier23 Apr 25, 2024
89a7e24
Fix a bug with dss_core_offset
jolivier23 Apr 25, 2024
00fcc86
Merge branch 'master' into jvolivie/add_multisocket
jolivier23 Apr 30, 2024
e854c9f
Merge branch 'master' into jvolivie/add_multisocket
jolivier23 May 1, 2024
62d6591
Fix first_core handling in control plane so it
jolivier23 May 1, 2024
043c5dc
Avoid invalid assertion
jolivier23 May 1, 2024
e3e3a5a
autoconfig shouldn't be setting both pinned_numa_node and first_core
jolivier23 May 1, 2024
5e6a225
Fix up some configs to avoid setting first_core
jolivier23 May 1, 2024
a2cdd98
Revert "Fix up some configs to avoid setting first_core"
jolivier23 May 1, 2024
9dfe48f
Revert "autoconfig shouldn't be setting both pinned_numa_node and fir…
jolivier23 May 1, 2024
b61c48e
Allow first_core: 0 to be set with pinned_numa_node
jolivier23 May 1, 2024
ef9c4a5
Features: control
jolivier23 May 1, 2024
b8420f9
Print which setting is superfluous
jolivier23 May 1, 2024
a0a86df
Set first core to nil
jolivier23 May 1, 2024
e0ccb53
Add one comment
jolivier23 May 1, 2024
33c7d85
Features: control
jolivier23 May 2, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/admin/env_variables.md
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ Environment variables in this section only apply to the server side.
|DAOS\_DTX\_AGG\_THD\_AGE|DTX aggregation age threshold in seconds. The valid range is [210, 1830]. The default value is 630.|
|DAOS\_DTX\_RPC\_HELPER\_THD|DTX RPC helper threshold. The valid range is [18, unlimited). The default value is 513.|
|DAOS\_DTX\_BATCHED\_ULT\_MAX|The max count of DTX batched commit ULTs. The valid range is [0, unlimited). 0 means to commit DTX synchronously. The default value is 32.|
|DAOS\_FORWARD\_NEIGHBOR|Set to enable I/O forwarding on neighbor xstream in the absence of helper threads.|

## Server and Client environment variables

Expand Down
8 changes: 4 additions & 4 deletions src/control/cmd/daos_server/start.go
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
//
// (C) Copyright 2019-2023 Intel Corporation.
// (C) Copyright 2019-2024 Intel Corporation.
//
// SPDX-License-Identifier: BSD-2-Clause-Patent
//
Expand All @@ -26,7 +26,7 @@ type startCmd struct {
Modules *string `short:"m" long:"modules" description:"List of server modules to load"`
Targets uint16 `short:"t" long:"targets" description:"Number of targets to use (default use all cores)"`
NrXsHelpers *uint16 `short:"x" long:"xshelpernr" description:"Number of helper XS per VOS target"`
FirstCore uint16 `short:"f" long:"firstcore" default:"0" description:"Index of first core for service thread"`
FirstCore *uint16 `short:"f" long:"firstcore" description:"Index of first core for service thread"`
Group string `short:"g" long:"group" description:"Server group name"`
SocketDir string `short:"d" long:"socket_dir" description:"Location for all daos_server & daos_engine sockets"`
Insecure bool `short:"i" long:"insecure" description:"Allow for insecure connections"`
Expand Down Expand Up @@ -73,8 +73,8 @@ func (cmd *startCmd) setCLIOverrides() error {
if cmd.NrXsHelpers != nil {
srv.WithHelperStreamCount(int(*cmd.NrXsHelpers))
}
if cmd.FirstCore > 0 {
srv.WithServiceThreadCore(int(cmd.FirstCore))
if cmd.FirstCore != nil {
srv.WithServiceThreadCore(int(*cmd.FirstCore))
}
}

Expand Down
2 changes: 0 additions & 2 deletions src/control/cmd/dmg/auto_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -580,7 +580,6 @@ transport_config:
engines:
- targets: 12
nr_xs_helpers: 2
first_core: 0
log_file: /tmp/daos_engine.0.log
storage:
- class: dcpm
Expand All @@ -599,7 +598,6 @@ engines:
pinned_numa_node: 0
- targets: 6
nr_xs_helpers: 0
first_core: 0
log_file: /tmp/daos_engine.1.log
storage:
- class: dcpm
Expand Down
16 changes: 13 additions & 3 deletions src/control/server/config/server.go
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
//
// (C) Copyright 2020-2023 Intel Corporation.
// (C) Copyright 2020-2024 Intel Corporation.
//
// SPDX-License-Identifier: BSD-2-Clause-Patent
//
Expand Down Expand Up @@ -869,7 +869,11 @@ func (cfg *Server) SetEngineAffinities(log logging.Logger, affSources ...EngineA
// Detect legacy mode by checking if first_core is being used.
legacyMode := false
for _, engineCfg := range cfg.Engines {
if engineCfg.ServiceThreadCore != 0 {
if engineCfg.ServiceThreadCore != nil {
if *engineCfg.ServiceThreadCore == 0 && engineCfg.PinnedNumaNode != nil {
// Both are set but we don't know yet which to use
continue
}
legacyMode = true
break
}
Expand All @@ -878,9 +882,15 @@ func (cfg *Server) SetEngineAffinities(log logging.Logger, affSources ...EngineA
// Fail if any engine has an explicit pin and non-zero first_core.
for idx, engineCfg := range cfg.Engines {
if legacyMode {
if engineCfg.PinnedNumaNode != nil {
log.Infof("pinned_numa_node setting ignored on engine %d", idx)
engineCfg.PinnedNumaNode = nil
}
log.Debugf("setting legacy core allocation algorithm on engine %d", idx)
engineCfg.PinnedNumaNode = nil
continue
} else if engineCfg.ServiceThreadCore != nil {
log.Infof("first_core setting ignored on engine %d", idx)
engineCfg.ServiceThreadCore = nil
}

numaAffinity, err := detectEngineAffinity(log, engineCfg, affSources...)
Expand Down
10 changes: 5 additions & 5 deletions src/control/server/engine/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -243,7 +243,7 @@ type Config struct {
Modules string `yaml:"modules,omitempty" cmdLongFlag:"--modules" cmdShortFlag:"-m"`
TargetCount int `yaml:"targets,omitempty" cmdLongFlag:"--targets,nonzero" cmdShortFlag:"-t,nonzero"`
HelperStreamCount int `yaml:"nr_xs_helpers" cmdLongFlag:"--xshelpernr" cmdShortFlag:"-x"`
ServiceThreadCore int `yaml:"first_core" cmdLongFlag:"--firstcore,nonzero" cmdShortFlag:"-f,nonzero"`
ServiceThreadCore *int `yaml:"first_core,omitempty" cmdLongFlag:"--firstcore" cmdShortFlag:"-f"`
SystemName string `yaml:"-" cmdLongFlag:"--group" cmdShortFlag:"-g"`
SocketDir string `yaml:"-" cmdLongFlag:"--socket_dir" cmdShortFlag:"-d"`
LogMask string `yaml:"log_mask,omitempty" cmdEnv:"D_LOG_MASK"`
Expand Down Expand Up @@ -289,7 +289,7 @@ func (c *Config) ReadLogSubsystems() (string, error) {

// Validate ensures that the configuration meets minimum standards.
func (c *Config) Validate() error {
if c.PinnedNumaNode != nil && c.ServiceThreadCore != 0 {
if c.PinnedNumaNode != nil && c.ServiceThreadCore != nil && *c.ServiceThreadCore != 0 {
return errors.New("cannot specify both pinned_numa_node and first_core")
}

Expand All @@ -302,7 +302,7 @@ func (c *Config) Validate() error {
if c.HelperStreamCount < 0 {
return errNegative("helper stream count")
}
if c.ServiceThreadCore < 0 {
if c.ServiceThreadCore != nil && *c.ServiceThreadCore < 0 {
return errNegative("service thread core index")
}
if c.MemSize < 0 {
Expand Down Expand Up @@ -370,7 +370,7 @@ func IsNUMAMismatch(err error) bool {
// SetNUMAAffinity sets the NUMA affinity for the engine,
// if not already set in the configuration.
func (c *Config) SetNUMAAffinity(node uint) error {
if c.PinnedNumaNode != nil && c.ServiceThreadCore != 0 {
if c.PinnedNumaNode != nil && c.ServiceThreadCore != nil && *c.ServiceThreadCore != 0 {
return errors.New("cannot set both NUMA node and service core")
}

Expand Down Expand Up @@ -612,7 +612,7 @@ func (c *Config) WithHelperStreamCount(count int) *Config {

// WithServiceThreadCore sets the core index to be used for running DAOS service threads.
func (c *Config) WithServiceThreadCore(idx int) *Config {
c.ServiceThreadCore = idx
c.ServiceThreadCore = &idx
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

while we are changing things it might make sense to change ServiceThreadCore to *uint

return c
}

Expand Down
192 changes: 139 additions & 53 deletions src/engine/init.c
Original file line number Diff line number Diff line change
Expand Up @@ -74,15 +74,18 @@ hwloc_topology_t dss_topo;
int dss_core_depth;
/** number of physical cores, w/o hyperthreading */
int dss_core_nr;
/** start offset index of the first core for service XS */
unsigned int dss_core_offset;
/** start offset index of the first core for service XS. Init to -1 so we can
* detect when it is explicitly set and disable multi-socket mode.
*/
unsigned int dss_core_offset = -1;
/** NUMA node to bind to */
int dss_numa_node = -1;
hwloc_bitmap_t core_allocation_bitmap;
/** a copy of the NUMA node object in the topology */
hwloc_obj_t numa_obj;
/** number of cores in the given NUMA node */
int dss_num_cores_numa_node;
/** Forward I/O work to neighbor */
bool dss_forward_neighbor;
/** Cached numa information */
struct dss_numa_info *dss_numa;
/** Number of active numa nodes, multi-socket mode only */
int dss_numa_nr = 1;
/** Module facility bitmask */
static uint64_t dss_mod_facs;
/** Number of storage tiers: 2 for SCM and NVMe */
Expand Down Expand Up @@ -305,16 +308,65 @@ dss_tgt_nr_check(unsigned int ncores, unsigned int tgt_nr, bool oversubscribe)
return 0;
}

/**
 * Decide whether the engine is allowed to run in multi-socket mode.
 *
 * The conditions are evaluated in a fixed order; the first one that rules out
 * multi-socket mode logs its reason and the remaining ones are not evaluated.
 *
 * \param[in] oversub	true when target oversubscription was requested
 * \param[in] numa_nr	number of NUMA nodes to split targets and helpers across
 *
 * \return		true when multi-socket mode may be used, false otherwise
 */
static bool
dss_multi_socket_check(bool oversub, int numa_nr)
{
	bool	usable = false;

	/** Keep this simple and disallow some configurations */
	if (oversub) {
		D_INFO("Oversubscription requested, bypassing multi-socket mode\n");
	} else if (dss_numa_node != -1) {
		/* An explicit NUMA node pins the engine to a single socket */
		D_INFO("Numa node specified, running in single socket mode\n");
	} else if (numa_nr < 2) {
		/* Also guards the modulo operations below against numa_nr <= 0 */
		D_INFO("No NUMA found, bypassing multi-socket mode\n");
	} else if ((dss_tgt_offload_xs_nr % numa_nr) != 0) {
		/* Helpers must divide evenly across the NUMA nodes */
		D_INFO("Uneven split of helpers on sockets, bypassing multi-socket mode\n");
	} else if ((dss_tgt_nr % numa_nr) != 0) {
		/* Targets must divide evenly across the NUMA nodes */
		D_INFO("Uneven split of targets on sockets, bypassing multi-socket mode\n");
	} else {
		usable = true;
	}

	return usable;
}

static int
dss_topo_init()
dss_legacy_mode(bool oversub)
{
D_PRINT("Using legacy core allocation algorithm\n");
if (dss_core_offset >= dss_core_nr) {
D_ERROR("invalid dss_core_offset %u (set by \"-f\" option), should within "
"range [0, %u]\n",
dss_core_offset, dss_core_nr - 1);
return -DER_INVAL;
}

return dss_tgt_nr_check(dss_core_nr, dss_tgt_nr, oversub);
}

static int
dss_topo_init(void)
{
int depth;
int numa_node_nr;
int num_cores_visited;
char *cpuset;
int num_cores_visited;
int k;
int numa_node;
int rc = 0;
hwloc_obj_t numa_obj;
hwloc_obj_t corenode;
bool tgt_oversub = false;
bool multi_socket = false;

hwloc_topology_init(&dss_topo);
hwloc_topology_load(dss_topo);
Expand All @@ -324,70 +376,101 @@ dss_topo_init()
depth = hwloc_get_type_depth(dss_topo, HWLOC_OBJ_NUMANODE);
numa_node_nr = hwloc_get_nbobjs_by_depth(dss_topo, depth);
d_getenv_bool("DAOS_TARGET_OVERSUBSCRIBE", &tgt_oversub);
d_getenv_bool("DAOS_FORWARD_NEIGHBOR", &dss_forward_neighbor);
dss_tgt_nr = nr_threads;

/* if no NUMA node was specified, or NUMA data unavailable */
/* fall back to the legacy core allocation algorithm */
if (dss_numa_node == -1 || numa_node_nr <= 0) {
D_PRINT("Using legacy core allocation algorithm\n");
if (dss_core_offset >= dss_core_nr) {
D_ERROR("invalid dss_core_offset %u (set by \"-f\" option), should within "
"range [0, %u]\n",
dss_core_offset, dss_core_nr - 1);
return -DER_INVAL;
}

return dss_tgt_nr_check(dss_core_nr, dss_tgt_nr, tgt_oversub);
/** Set to -1 initially so we can detect when it's set explicitly to
* maintain mode consistency between engines where one sets it to 0.
*/
if (dss_core_offset == -1) {
dss_core_offset = 0;
if (dss_multi_socket_check(tgt_oversub, numa_node_nr))
multi_socket = true;
} else {
D_INFO("Core offset specified, running in single socket mode\n");
}

/* Fall back to legacy mode if no socket was specified and
* multi-socket mode is not possible or NUMA data is unavailable
*/
if ((!multi_socket && dss_numa_node == -1) || numa_node_nr <= 0)
return dss_legacy_mode(tgt_oversub);

if (dss_numa_node > numa_node_nr) {
D_ERROR("Invalid NUMA node selected. Must be no larger than %d\n", numa_node_nr);
return -DER_INVAL;
}

numa_obj = hwloc_get_obj_by_depth(dss_topo, depth, dss_numa_node);
if (numa_obj == NULL) {
D_ERROR("NUMA node %d was not found in the topology\n", dss_numa_node);
return -DER_INVAL;
}
D_ALLOC_ARRAY(dss_numa, numa_node_nr);
if (dss_numa == NULL)
return -DER_NOMEM;

/* create an empty bitmap, then set each bit as we */
/* find a core that matches */
core_allocation_bitmap = hwloc_bitmap_alloc();
if (core_allocation_bitmap == NULL) {
D_ERROR("Unable to allocate core allocation bitmap\n");
return -DER_INVAL;
}
for (numa_node = 0; numa_node < numa_node_nr; numa_node++) {
dss_numa[numa_node].ni_idx = numa_node;
numa_obj = hwloc_get_obj_by_depth(dss_topo, depth, numa_node);
if (numa_obj == NULL) {
D_ERROR("NUMA node %d was not found in the topology\n", numa_node);
D_GOTO(failed, rc = -DER_INVAL);
}

dss_num_cores_numa_node = 0;
num_cores_visited = 0;
/* create an empty bitmap, then set each bit as we */
/* find a core that matches */
dss_numa[numa_node].ni_coremap = hwloc_bitmap_alloc();
if (dss_numa[numa_node].ni_coremap == NULL) {
D_ERROR("Unable to allocate core allocation bitmap\n");
D_GOTO(failed, rc = -DER_INVAL);
}

for (k = 0; k < dss_core_nr; k++) {
corenode = hwloc_get_obj_by_depth(dss_topo, dss_core_depth, k);
if (corenode == NULL)
continue;
if (hwloc_bitmap_isincluded(corenode->cpuset,
numa_obj->cpuset) != 0) {
if (num_cores_visited++ >= dss_core_offset) {
hwloc_bitmap_set(core_allocation_bitmap, k);
hwloc_bitmap_asprintf(&cpuset,
corenode->cpuset);
dss_numa[numa_node].ni_core_nr = 0;
num_cores_visited = 0;

for (k = 0; k < dss_core_nr; k++) {
corenode = hwloc_get_obj_by_depth(dss_topo, dss_core_depth, k);
if (corenode == NULL)
continue;
if (hwloc_bitmap_isincluded(corenode->cpuset, numa_obj->cpuset) != 0) {
if (num_cores_visited++ >= dss_core_offset)
hwloc_bitmap_set(dss_numa[numa_node].ni_coremap, k);
dss_numa[numa_node].ni_core_nr++;
}
dss_num_cores_numa_node++;
}
if (multi_socket && numa_node > 0 &&
dss_numa[numa_node].ni_core_nr != dss_numa[numa_node - 1].ni_core_nr) {
D_INFO("Non-uniform numa nodes, bypassing multi-socket mode\n");
D_FREE(dss_numa);
return dss_legacy_mode(false);
}
}

if (multi_socket) {
/** In this mode, we simply save the topology for later use but
* still use all of the cores.
*/
D_PRINT("Using Multi-socket NUMA core allocation algorithm\n");
dss_numa_nr = numa_node_nr;
dss_offload_per_numa_nr = dss_tgt_offload_xs_nr / dss_numa_nr;
dss_tgt_per_numa_nr = dss_tgt_nr / dss_numa_nr;
return dss_tgt_nr_check(dss_core_nr, dss_tgt_nr, tgt_oversub);
}
hwloc_bitmap_asprintf(&cpuset, core_allocation_bitmap);
free(cpuset);

if (dss_core_offset >= dss_num_cores_numa_node) {
if (dss_core_offset >= dss_numa[dss_numa_node].ni_core_nr) {
D_ERROR("invalid dss_core_offset %d (set by \"-f\" option), should within range "
"[0, %d]\n",
dss_core_offset, dss_num_cores_numa_node - 1);
dss_core_offset, dss_numa[dss_numa_node].ni_core_nr - 1);
return -DER_INVAL;
}
D_PRINT("Using NUMA core allocation algorithm\n");

return dss_tgt_nr_check(dss_num_cores_numa_node, dss_tgt_nr, tgt_oversub);
return dss_tgt_nr_check(dss_numa[dss_numa_node].ni_core_nr, dss_tgt_nr, tgt_oversub);
failed:
D_FREE(dss_numa);
return rc;
}

/**
 * Release the cached NUMA information array (dss_numa) that dss_topo_init()
 * allocates.
 *
 * NOTE(review): only the array itself is released here. The per-node
 * ni_coremap bitmaps obtained from hwloc_bitmap_alloc() are not freed by this
 * function -- confirm whether they are released elsewhere.
 */
static void
dss_topo_fini(void)
{
D_FREE(dss_numa);
}

static ABT_mutex server_init_state_mutex;
Expand Down Expand Up @@ -814,7 +897,7 @@ server_init(int argc, char *argv[])
DAOS_VERSION, getpid(), dss_self_rank(), dss_tgt_nr,
dss_tgt_offload_xs_nr, dss_core_offset, dss_hostname);

if (numa_obj)
if (dss_numa && dss_numa_node != -1)
D_PRINT("Using NUMA node: %d", dss_numa_node);

return 0;
Expand All @@ -841,6 +924,7 @@ server_init(int argc, char *argv[])
exit_metrics_init:
dss_engine_metrics_fini();
d_tm_fini();
/* dss_topo_init cleans up after itself if it fails */
exit_debug_init:
daos_debug_fini();
return rc;
Expand Down Expand Up @@ -899,6 +983,8 @@ server_fini(bool force)
D_INFO("dss_engine_metrics_fini() done\n");
d_tm_fini();
D_INFO("d_tm_fini() done\n");
dss_topo_fini();
D_INFO("dss_top_fini() done\n");
daos_debug_fini();
D_INFO("daos_debug_fini() done\n");
}
Expand Down
Loading
Loading