Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DAOS-14408 common: ensure NDCTL not used for storage class ram #15203

Merged
merged 7 commits into from
Oct 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/workflows/landing-builds.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ on:
- ci/**
- requirements-build.txt
- requirements-utest.txt
- utils/build.config

permissions: {}

Expand Down
16 changes: 16 additions & 0 deletions debian/changelog
Original file line number Diff line number Diff line change
@@ -1,3 +1,19 @@
daos (2.6.1-4) unstable; urgency=medium
[ Tomasz Gromadzki ]
* Add support for the PMDK package 2.1.0 with NDCTL enabled.
* Increase the default ULT stack size to 20KiB if the engine uses
the DCPM storage class.
* Prevent using the RAM storage class (simulated PMem) when
the shutdown state (SDS) is active.
* Automatically disable SDS for the RAM storage class on engine startup.
* Require the PMEMOBJ_CONF='sds.at_create=0' environment variable
to be set explicitly to deactivate SDS for the DAOS tools
(ddb, daos_perf, vos_perf, etc.) when used WITHOUT DCPM.
Otherwise, the tool is expected to stop with an error
like: "Unsafe shutdown count is not supported for this source".

-- Tomasz Gromadzki <[email protected]> Wed, 02 Oct 2024 12:00:00 +0200

daos (2.6.1-3) unstable; urgency=medium
[ Phillip Henderson ]
* Third release candidate for 2.6.1
Expand Down
6 changes: 4 additions & 2 deletions debian/control
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ Build-Depends: debhelper (>= 10),
python3-distro,
libabt-dev,
libucx-dev,
libpmemobj-dev (>= 2.0.0),
libpmemobj-dev (>= 2.1.0),
libfuse3-dev,
libprotobuf-c-dev,
libjson-c-dev,
Expand Down Expand Up @@ -118,7 +118,9 @@ Depends: python (>=3.8), python3, python-yaml, python3-yaml,
daos-client (= ${binary:Version}),
daos-admin (= ${binary:Version}),
golang-go (>=1.18),
libcapstone-dev
libcapstone-dev,
libndctl-dev,
libdaxctl-dev
Description: The Distributed Asynchronous Object Storage (DAOS) is an open-source
software-defined object store designed from the ground up for
massively distributed Non Volatile Memory (NVM). DAOS takes advantage
Expand Down
1 change: 0 additions & 1 deletion site_scons/components/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -266,7 +266,6 @@ def define_components(reqs):
retriever=GitRepoRetriever(),
commands=[['make',
'all',
'NDCTL_ENABLE=n',
'BUILD_EXAMPLES=n',
'BUILD_BENCHMARKS=n',
'DOC=n',
Expand Down
86 changes: 86 additions & 0 deletions src/control/server/engine/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ package engine
import (
"fmt"
"os"
"strconv"
"strings"

"github.com/pkg/errors"
Expand All @@ -28,6 +29,8 @@ const (
envLogMasks = "D_LOG_MASK"
envLogDbgStreams = "DD_MASK"
envLogSubsystems = "DD_SUBSYS"

minABTThreadStackSizeDCPM = 20480
)

// FabricConfig encapsulates networking fabric configuration.
Expand Down Expand Up @@ -342,7 +345,80 @@ func (c *Config) Validate() error {
if err := ValidateLogSubsystems(subsystems); err != nil {
return errors.Wrap(err, "validate engine log subsystems")
}
return nil
}

// UpdatePMDKEnvarsStackSizeDCPM makes sure the engine environment carries an
// ABT_THREAD_STACKSIZE of at least minABTThreadStackSizeDCPM (20KiB), as
// needed for an engine with DCPM storage class.
//
// If the variable cannot be looked up in the engine's env vars it is appended
// with the minimum value. If it is present, it is left untouched and only
// validated: a non-integer value or a value below the minimum yields an error.
func (c *Config) UpdatePMDKEnvarsStackSizeDCPM() error {
	rawSize, lookupErr := c.GetEnvVar("ABT_THREAD_STACKSIZE")
	if lookupErr != nil {
		// Any lookup failure is treated as "not set": inject the minimum.
		defaultEntry := fmt.Sprintf("ABT_THREAD_STACKSIZE=%d",
			minABTThreadStackSizeDCPM)
		c.EnvVars = append(c.EnvVars, defaultEntry)
		return nil
	}

	size, convErr := strconv.Atoi(rawSize)
	switch {
	case convErr != nil:
		return errors.Errorf("env_var ABT_THREAD_STACKSIZE has invalid value: %s",
			rawSize)
	case size < minABTThreadStackSizeDCPM:
		return errors.Errorf("env_var ABT_THREAD_STACKSIZE should be >= %d "+
			"for DCPM storage class, found %d", minABTThreadStackSizeDCPM,
			size)
	default:
		return nil
	}
}

// UpdatePMDKEnvarsPMemobjConf validates and, for non-DCPM storage, adjusts the
// PMEMOBJ_CONF environment variable that controls PMDK shutdown state (SDS)
// handling.
//
// When isDCPM is true the variable must not mention sds.at_create at all, so
// that the default SDS handling stays in effect; an error is returned if it
// does.
//
// When isDCPM is false (RAM-based simulated SCM) SDS has to be disabled:
//   - if PMEMOBJ_CONF cannot be looked up, "PMEMOBJ_CONF=sds.at_create=0" is
//     appended to the engine's env vars;
//   - if PMEMOBJ_CONF exists without sds.at_create, "sds.at_create=0" is
//     appended to its current value, separated by ';';
//   - if PMEMOBJ_CONF contains "sds.at_create=1", an error is returned;
//   - any other value that already mentions sds.at_create is left untouched.
func (c *Config) UpdatePMDKEnvarsPMemobjConf(isDCPM bool) error {
	const sdsDisabled = "sds.at_create=0"

	pmemobjConfStr, pmemobjConfErr := c.GetEnvVar("PMEMOBJ_CONF")
	// Only a substring search is performed, so this also works when the
	// lookup failed and the value is the empty string.
	hasSdsAtCreate := strings.Contains(pmemobjConfStr, "sds.at_create")

	if isDCPM {
		if hasSdsAtCreate {
			// Confirm default handling of shutdown state (SDS) for DCPM
			// storage class.
			return errors.Errorf("env_var PMEMOBJ_CONF should NOT contain "+
				"'sds.at_create=?' for DCPM storage class, found '%s'",
				pmemobjConfStr)
		}
		return nil
	}

	// Disable shutdown state (SDS) (part of RAS) for RAM-based simulated SCM.
	switch {
	case pmemobjConfErr != nil:
		c.EnvVars = append(c.EnvVars, "PMEMOBJ_CONF="+sdsDisabled)
	case !hasSdsAtCreate:
		// Propagate a failure to drop the old entry instead of risking a
		// duplicated PMEMOBJ_CONF entry in the environment.
		envVars, err := common.DeleteKeyValue(c.EnvVars, "PMEMOBJ_CONF")
		if err != nil {
			return errors.Wrap(err, "update env_var PMEMOBJ_CONF")
		}
		newValue := sdsDisabled
		if pmemobjConfStr != "" {
			// Avoid a doubled separator when the existing value already
			// ends with ';'.
			newValue = strings.TrimSuffix(pmemobjConfStr, ";") + ";" + sdsDisabled
		}
		c.EnvVars = append(envVars, "PMEMOBJ_CONF="+newValue)
	case strings.Contains(pmemobjConfStr, "sds.at_create=1"):
		return errors.Errorf("env_var PMEMOBJ_CONF should contain "+
			"'sds.at_create=0' for non-DCPM storage class, found '%s'",
			pmemobjConfStr)
	}
	return nil
}

// UpdatePMDKEnvars adjusts the engine environment variables consumed by PMDK
// (built with NDCTL enabled) according to the storage class of tier 0.
//
// It fails when no storage tier is configured. Otherwise PMEMOBJ_CONF is
// checked/updated for every storage class, and for the DCPM class the ABT
// thread stack size is checked/updated as well.
func (c *Config) UpdatePMDKEnvars() error {
	tiers := c.Storage.Tiers
	if len(tiers) < 1 {
		return errors.New("Invalid config - no tier 0 defined")
	}

	usesDCPM := tiers[0].Class == storage.ClassDcpm

	err := c.UpdatePMDKEnvarsPMemobjConf(usesDCPM)
	if err != nil {
		return err
	}

	// The stack size requirement only applies to the DCPM storage class.
	if !usesDCPM {
		return nil
	}
	return c.UpdatePMDKEnvarsStackSizeDCPM()
}

Expand Down Expand Up @@ -690,3 +766,13 @@ func (c *Config) WithStorageIndex(i uint32) *Config {
c.Storage.EngineIdx = uint(i)
return c
}

// WithEnvVarAbtThreadStackSize adds the environment variable
// ABT_THREAD_STACKSIZE=<stackSize> to the engine config via WithEnvVars and
// returns the result of that call, so it can be chained with other With*
// helpers.
//
// The parameter is a uint16, so only values up to 65535 can be expressed
// through this helper.
func (c *Config) WithEnvVarAbtThreadStackSize(stackSize uint16) *Config {
	return c.WithEnvVars(fmt.Sprintf("ABT_THREAD_STACKSIZE=%d", stackSize))
}

// WithEnvVarPMemObjSdsAtCreate adds the environment variable
// PMEMOBJ_CONF=sds.at_create=<value> to the engine config via WithEnvVars and
// returns the result of that call. Callers in this file pass 0 or 1.
func (c *Config) WithEnvVarPMemObjSdsAtCreate(value uint8) *Config {
	entry := fmt.Sprintf("PMEMOBJ_CONF=sds.at_create=%d", value)
	return c.WithEnvVars(entry)
}
207 changes: 207 additions & 0 deletions src/control/server/engine/config_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1104,3 +1104,210 @@ func TestFabricConfig_Update(t *testing.T) {
})
}
}

// TestConfig_UpdatePMDKEnvarsStackSizeDCPM exercises
// Config.UpdatePMDKEnvarsStackSizeDCPM: a missing ABT_THREAD_STACKSIZE must be
// filled in with the DCPM minimum, a sufficient value must be kept as is, and
// a too-small or non-numeric value must be rejected.
func TestConfig_UpdatePMDKEnvarsStackSizeDCPM(t *testing.T) {
	validConfig := func() *Config {
		return MockConfig().WithStorage(
			storage.NewTierConfig().
				WithStorageClass("dcpm"))
	}

	for name, tc := range map[string]struct {
		cfg                   *Config
		expErr                error
		expABTthreadStackSize int
	}{
		"empty config should not fail": {
			cfg:                   MockConfig(),
			expABTthreadStackSize: minABTThreadStackSizeDCPM,
		},
		"valid config for DCPM should not fail": {
			cfg:                   validConfig().WithEnvVarAbtThreadStackSize(minABTThreadStackSizeDCPM),
			expABTthreadStackSize: minABTThreadStackSizeDCPM,
		},
		"config for DCPM without thread size should not fail": {
			cfg:                   validConfig(),
			expABTthreadStackSize: minABTThreadStackSizeDCPM,
		},
		"config for DCPM with stack size big enough should not fail": {
			cfg: validConfig().
				WithEnvVarAbtThreadStackSize(minABTThreadStackSizeDCPM + 1),
			expABTthreadStackSize: minABTThreadStackSizeDCPM + 1,
		},
		"config for DCPM with stack size too small should fail": {
			cfg: validConfig().
				WithEnvVarAbtThreadStackSize(minABTThreadStackSizeDCPM - 1),
			expErr: fmt.Errorf("env_var ABT_THREAD_STACKSIZE "+
				"should be >= %d for DCPM storage class, found %d",
				minABTThreadStackSizeDCPM, minABTThreadStackSizeDCPM-1),
		},
		"config for DCPM with invalid ABT_THREAD_STACKSIZE value should fail": {
			cfg:    validConfig().WithEnvVars("ABT_THREAD_STACKSIZE=foo_bar"),
			expErr: errors.New("env_var ABT_THREAD_STACKSIZE has invalid value: foo_bar"),
		},
	} {
		t.Run(name, func(t *testing.T) {
			updateErr := tc.cfg.UpdatePMDKEnvarsStackSizeDCPM()
			test.CmpErr(t, tc.expErr, updateErr)
			if updateErr != nil {
				// The resulting environment is only checked on success.
				return
			}

			stackSizeStr, getErr := tc.cfg.GetEnvVar("ABT_THREAD_STACKSIZE")
			test.AssertTrue(t, getErr == nil, "Missing env var ABT_THREAD_STACKSIZE")
			stackSizeVal, convErr := strconv.Atoi(stackSizeStr)
			test.AssertTrue(t, convErr == nil, "Invalid env var ABT_THREAD_STACKSIZE")
			test.AssertEqual(t, tc.expABTthreadStackSize, stackSizeVal,
				"Invalid ABT_THREAD_STACKSIZE value")
		})
	}
}

// TestConfig_UpdatePMDKEnvarsPMemobjConfDCPM checks
// Config.UpdatePMDKEnvarsPMemobjConf with isDCPM set to true: configs without
// sds.at_create in PMEMOBJ_CONF pass, while any explicit sds.at_create setting
// (0 or 1) is rejected.
func TestConfig_UpdatePMDKEnvarsPMemobjConfDCPM(t *testing.T) {
	dcpmConfig := func() *Config {
		tier := storage.NewTierConfig().WithStorageClass("dcpm")
		return MockConfig().WithStorage(tier)
	}

	type testCase struct {
		cfg    *Config
		expErr error
	}

	testCases := map[string]testCase{
		"empty config should not fail": {
			cfg: MockConfig(),
		},
		"valid config for DCPM should not fail": {
			cfg: dcpmConfig(),
		},
		"config for DCPM with forced sds.at_create (1) should fail": {
			cfg: dcpmConfig().WithEnvVarPMemObjSdsAtCreate(1),
			expErr: errors.New("env_var PMEMOBJ_CONF should NOT contain " +
				"'sds.at_create=?' for DCPM storage class, found 'sds.at_create=1'"),
		},
		"config for DCPM with forced sds.at_create (0) should fail": {
			cfg: dcpmConfig().WithEnvVarPMemObjSdsAtCreate(0),
			expErr: errors.New("env_var PMEMOBJ_CONF should NOT contain " +
				"'sds.at_create=?' for DCPM storage class, found 'sds.at_create=0'"),
		},
	}

	for name, tc := range testCases {
		t.Run(name, func(t *testing.T) {
			gotErr := tc.cfg.UpdatePMDKEnvarsPMemobjConf(true)
			test.CmpErr(t, tc.expErr, gotErr)
		})
	}
}

// TestConfig_UpdatePMDKEnvarsPMemobjConfNRam checks
// Config.UpdatePMDKEnvarsPMemobjConf with isDCPM set to false (RAM storage
// class): sds.at_create=0 must be added when missing, kept when already
// present, and an explicit sds.at_create=1 must be rejected.
func TestConfig_UpdatePMDKEnvarsPMemobjConfNRam(t *testing.T) {
	// The function under test is driven by its isDCPM argument only; the
	// fixture still uses the "ram" class so that it matches what is tested.
	validConfig := func() *Config {
		return MockConfig().WithStorage(
			storage.NewTierConfig().
				WithStorageClass("ram"))
	}

	for name, tc := range map[string]struct {
		cfg            *Config
		expErr         error
		expPMemObjConf string
	}{
		"empty config should not fail": {
			cfg:            MockConfig(),
			expPMemObjConf: "sds.at_create=0",
		},
		"config for ram without PMEMOBJ_CONF should not fail": {
			cfg:            validConfig(),
			expPMemObjConf: "sds.at_create=0",
		},
		"valid config for ram should not fail": {
			cfg:            validConfig().WithEnvVarPMemObjSdsAtCreate(0),
			expPMemObjConf: "sds.at_create=0",
		},
		"config for ram w/ PMEMOBJ_CONF w/o sds.at_create should be updated": {
			cfg:            validConfig().WithEnvVars("PMEMOBJ_CONF=foo_bar"),
			expPMemObjConf: "foo_bar;sds.at_create=0",
		},
		"config for ram with sds.at_create set to 1 should fail": {
			cfg: validConfig().WithEnvVarPMemObjSdsAtCreate(1),
			expErr: errors.New("env_var PMEMOBJ_CONF should contain " +
				"'sds.at_create=0' for non-DCPM storage class" +
				", found 'sds.at_create=1'"),
		},
		"config for ram w/ PMEMOBJ_CONF w/ sds.at_create=1 should fail": {
			cfg: validConfig().
				WithEnvVars("PMEMOBJ_CONF=sds.at_create=1;foo-bar"),
			expErr: errors.New("env_var PMEMOBJ_CONF should contain " +
				"'sds.at_create=0' for non-DCPM storage class" +
				", found 'sds.at_create=1;foo-bar'"),
		},
	} {
		t.Run(name, func(t *testing.T) {
			test.CmpErr(t, tc.expErr, tc.cfg.UpdatePMDKEnvarsPMemobjConf(false))
			if len(tc.expPMemObjConf) == 0 {
				// Failing cases do not pin the resulting environment.
				return
			}

			pmemObjConf, err := tc.cfg.GetEnvVar("PMEMOBJ_CONF")
			test.AssertTrue(t, err == nil, "Missing env var PMEMOBJ_CONF")
			test.AssertEqual(t, tc.expPMemObjConf, pmemObjConf,
				"Invalid PMEMOBJ_CONF")
		})
	}
}

// TestConfig_UpdatePMDKEnvars exercises Config.UpdatePMDKEnvars end to end for
// a config without storage tiers and for the "ram" and "dcpm" storage classes.
//
// A case checks PMEMOBJ_CONF only when expPMemObjConf is non-empty, and checks
// ABT_THREAD_STACKSIZE only when expABTthreadStackSize is >= 0; cases that must
// skip the stack size check therefore set it to -1 explicitly.
func TestConfig_UpdatePMDKEnvars(t *testing.T) {
	validConfig := func(storageClass string) *Config {
		return MockConfig().WithStorage(
			storage.NewTierConfig().
				WithStorageClass(storageClass))
	}

	for name, tc := range map[string]struct {
		cfg                   *Config
		expErr                error
		expPMemObjConf        string
		expABTthreadStackSize int
	}{
		"empty config should fail": {
			cfg:                   MockConfig(),
			expErr:                errors.New("Invalid config - no tier 0 defined"),
			expABTthreadStackSize: -1,
		},
		"valid config for RAM should not fail": {
			cfg: validConfig("ram").
				WithEnvVarAbtThreadStackSize(minABTThreadStackSizeDCPM - 1),
			expPMemObjConf:        "sds.at_create=0",
			expABTthreadStackSize: minABTThreadStackSizeDCPM - 1,
		},
		"invalid config for RAM should fail": {
			cfg: validConfig("ram").WithEnvVarPMemObjSdsAtCreate(1),
			expErr: errors.New("env_var PMEMOBJ_CONF should contain " +
				"'sds.at_create=0' for non-DCPM storage class, " +
				"found 'sds.at_create=1'"),
			expABTthreadStackSize: -1,
		},
		"valid config for DCPM should not fail": {
			cfg:                   validConfig("dcpm"),
			expABTthreadStackSize: minABTThreadStackSizeDCPM,
		},
		"invalid config for DCPM should fail": {
			cfg: validConfig("dcpm").
				WithEnvVarAbtThreadStackSize(minABTThreadStackSizeDCPM - 1),
			// Derive the message from the constant so that it tracks the
			// configured minimum.
			expErr: fmt.Errorf("env_var ABT_THREAD_STACKSIZE should be >= %d "+
				"for DCPM storage class, found %d",
				minABTThreadStackSizeDCPM, minABTThreadStackSizeDCPM-1),
			// The rejected value is expected to stay in the environment.
			expABTthreadStackSize: minABTThreadStackSizeDCPM - 1,
		},
	} {
		t.Run(name, func(t *testing.T) {
			test.CmpErr(t, tc.expErr, tc.cfg.UpdatePMDKEnvars())

			if len(tc.expPMemObjConf) > 0 {
				pmemObjConf, err := tc.cfg.GetEnvVar("PMEMOBJ_CONF")
				test.AssertTrue(t, err == nil, "Missing env var PMEMOBJ_CONF")
				test.AssertEqual(t, tc.expPMemObjConf, pmemObjConf,
					"Invalid PMEMOBJ_CONF")
			}

			if tc.expABTthreadStackSize >= 0 {
				stackSizeStr, err := tc.cfg.GetEnvVar("ABT_THREAD_STACKSIZE")
				test.AssertTrue(t, err == nil, "Missing env var ABT_THREAD_STACKSIZE")
				stackSizeVal, err := strconv.Atoi(stackSizeStr)
				test.AssertTrue(t, err == nil, "Invalid env var ABT_THREAD_STACKSIZE")
				test.AssertEqual(t, tc.expABTthreadStackSize, stackSizeVal,
					"Invalid ABT_THREAD_STACKSIZE value")
			}
		})
	}
}
6 changes: 6 additions & 0 deletions src/control/server/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,12 @@ func processConfig(log logging.Logger, cfg *config.Server, fis *hardware.FabricI
return err
}

for _, ec := range cfg.Engines {
if err := ec.UpdatePMDKEnvars(); err != nil {
return err
}
}

return nil
}

Expand Down
Loading
Loading