Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DAOS-16829 client: Fix daos pool query regression #15638

Merged
merged 2 commits into from
Dec 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docs/admin/troubleshooting.md
Original file line number Diff line number Diff line change
Expand Up @@ -556,7 +556,7 @@ Alternately, the administrator may erase and re-format the DAOS system to start

### Engines become unavailable

Engines may become unavailable due to server power losses and reboots, network switch failures, etc. After staying unavailable for a certain period of time, these engines may become "excluded" or "errored" in `dmg system query` output. Once the states of all engines stabilize (see [`CRT_EVENT_DELAY`](env_variables.md)), each pool will check whether there is enough redundancy (see [Pool RF](pool_operations.md#pool-redundancy-factor)) to tolerate the unavailability of the "excluded" or "errored" engines. If there is enough redundancy, these engines will be excluded from the pool ("disabled ranks" in `dmg pool query --health-only` output); otherwise, the pool will perform no exclusion ("suspect ranks" in `dmg pool query --health-only` output as described in [Querying a Pool](pool_operations.md#querying-a-pool)) and may become temporarily unavailable (as seen by timeouts of `dmg pool query`, `dmg pool list`, etc.). Similarly, when engines become available, whenever the states of all engines stabilize, each pool will perform the aforementioned check for any unavailable engines that remain.
Engines may become unavailable due to server power losses and reboots, network switch failures, etc. After staying unavailable for a certain period of time, these engines may become "excluded" or "errored" in `dmg system query` output. Once the states of all engines stabilize (see [`CRT_EVENT_DELAY`](env_variables.md)), each pool will check whether there is enough redundancy (see [Pool RF](pool_operations.md#pool-redundancy-factor)) to tolerate the unavailability of the "excluded" or "errored" engines. If there is enough redundancy, these engines will be excluded from the pool ("Disabled ranks" in `dmg pool query --health-only` output); otherwise, the pool will perform no exclusion ("Dead ranks" in `dmg pool query --health-only` output as described in [Querying a Pool](pool_operations.md#querying-a-pool)) and may become temporarily unavailable (as seen by timeouts of `dmg pool query`, `dmg pool list`, etc.). Similarly, when engines become available, whenever the states of all engines stabilize, each pool will perform the aforementioned check for any unavailable engines that remain.
mjmac marked this conversation as resolved.
Show resolved Hide resolved

To restore availability as well as capacity and performance, try to start all "excluded" or "errored" engines. Starting all of them at the same time minimizes the chance of triggering rebuild jobs. In many cases, the following command suffices:
```
Expand Down
109 changes: 32 additions & 77 deletions src/control/cmd/daos/pool.go
Original file line number Diff line number Diff line change
Expand Up @@ -295,21 +295,37 @@ func convertPoolInfo(pinfo *C.daos_pool_info_t) (*daos.PoolInfo, error) {
return poolInfo, nil
}

func queryPoolRankLists(poolHdl C.daos_handle_t, queryMask daos.PoolQueryMask) (*daos.PoolInfo, error) {
var rlPtr **C.d_rank_list_t = nil
var rl *C.d_rank_list_t = nil

if queryMask.HasOption(daos.PoolQueryOptionEnabledEngines) || queryMask.HasOption(daos.PoolQueryOptionDisabledEngines) ||
queryMask.HasOption(daos.PoolQueryOptionDeadEngines) {
rlPtr = &rl
}
func queryPool(poolHdl C.daos_handle_t, queryMask daos.PoolQueryMask) (*daos.PoolInfo, error) {
var enabledRanks *C.d_rank_list_t
var disabledRanks *C.d_rank_list_t
defer func() {
C.d_rank_list_free(enabledRanks)
C.d_rank_list_free(disabledRanks)
}()

var rc C.int
cPoolInfo := C.daos_pool_info_t{
pi_bits: C.uint64_t(queryMask),
}
if queryMask.HasOption(daos.PoolQueryOptionEnabledEngines) && queryMask.HasOption(daos.PoolQueryOptionDisabledEngines) {
enaQm := queryMask
enaQm.ClearOptions(daos.PoolQueryOptionDisabledEngines)
mjmac marked this conversation as resolved.
Show resolved Hide resolved
cPoolInfo.pi_bits = C.uint64_t(enaQm)
rc = C.daos_pool_query(poolHdl, &enabledRanks, &cPoolInfo, nil, nil)
if err := daosError(rc); err != nil {
return nil, err
}

/* second query to just get disabled ranks */
rc = C.daos_pool_query(poolHdl, &disabledRanks, nil, nil, nil)
} else if queryMask.HasOption(daos.PoolQueryOptionEnabledEngines) {
rc = C.daos_pool_query(poolHdl, &enabledRanks, &cPoolInfo, nil, nil)
} else if queryMask.HasOption(daos.PoolQueryOptionDisabledEngines) {
rc = C.daos_pool_query(poolHdl, &disabledRanks, &cPoolInfo, nil, nil)
} else {
rc = C.daos_pool_query(poolHdl, nil, &cPoolInfo, nil, nil)
}

rc := C.daos_pool_query(poolHdl, rlPtr, &cPoolInfo, nil, nil)
defer C.d_rank_list_free(rl)
if err := daosError(rc); err != nil {
return nil, err
}
Expand All @@ -318,79 +334,18 @@ func queryPoolRankLists(poolHdl C.daos_handle_t, queryMask daos.PoolQueryMask) (
if err != nil {
return nil, err
}
poolInfo.QueryMask = queryMask

if rlPtr != nil {
rs, err := rankSetFromC(rl)
if enabledRanks != nil {
poolInfo.EnabledRanks, err = rankSetFromC(enabledRanks)
if err != nil {
return nil, err
}
if queryMask.HasOption(daos.PoolQueryOptionEnabledEngines) {
poolInfo.EnabledRanks = rs
}
if queryMask.HasOption(daos.PoolQueryOptionDisabledEngines) {
poolInfo.DisabledRanks = rs
}
if queryMask.HasOption(daos.PoolQueryOptionDeadEngines) {
poolInfo.DeadRanks = rs
}
}

return poolInfo, nil
}
func queryPool(poolHdl C.daos_handle_t, queryMask daos.PoolQueryMask) (*daos.PoolInfo, error) {
poolInfo := &daos.PoolInfo{}
originalMask := queryMask // Save the original queryMask

// Function to handle the query and return a single RankList
queryAndUpdate := func(option string) error {
// Clear previous options and set new option
queryMask.ClearAll()
queryMask.SetOptions(option)

poolInfo1, err := queryPoolRankLists(poolHdl, queryMask)
if disabledRanks != nil {
poolInfo.DisabledRanks, err = rankSetFromC(disabledRanks)
if err != nil {
return err
}

switch option {
case daos.PoolQueryOptionEnabledEngines:
poolInfo.EnabledRanks = poolInfo1.EnabledRanks
case daos.PoolQueryOptionDisabledEngines:
poolInfo.DisabledRanks = poolInfo1.DisabledRanks
case daos.PoolQueryOptionDeadEngines:
poolInfo.DeadRanks = poolInfo1.DeadRanks
}
return nil
}

// Preprocess queryMask, select one option for the first query
var firstOption string
if originalMask.HasOption(daos.PoolQueryOptionEnabledEngines) {
firstOption = daos.PoolQueryOptionEnabledEngines
} else if originalMask.HasOption(daos.PoolQueryOptionDisabledEngines) {
firstOption = daos.PoolQueryOptionDisabledEngines
} else if originalMask.HasOption(daos.PoolQueryOptionDeadEngines) {
firstOption = daos.PoolQueryOptionDeadEngines
}

// Perform the first query to get basic information
if err := queryAndUpdate(firstOption); err != nil {
return nil, err
}

// Check the original query mask and update fields as needed
queryOptions := []string{
daos.PoolQueryOptionEnabledEngines,
daos.PoolQueryOptionDisabledEngines,
daos.PoolQueryOptionDeadEngines,
}

// Process each option sequentially
for _, opt := range queryOptions {
if originalMask.HasOption(opt) && opt != firstOption {
if err := queryAndUpdate(opt); err != nil {
return nil, err
}
return nil, err
}
}

Expand Down
16 changes: 16 additions & 0 deletions src/control/common/proto/logging.go
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,22 @@ func Debug(msg proto.Message) string {
}
fmt.Fprintf(&bld, "meta-file-size: %s (%d)", humanize.Bytes(m.MemFileBytes),
m.MemFileBytes)
case *mgmtpb.PoolQueryReq:
fmt.Fprintf(&bld, "%T id:%s qm:%s", m, m.Id, daos.PoolQueryMask(m.QueryMask))
case *mgmtpb.PoolQueryResp:
fmt.Fprintf(&bld, "%T status:%s uuid:%s qm:%s map:%d tot(eng/tgts):%d/%d ver(p/u):%d/%d svc_ldr:%d ",
m, daos.Status(m.Status), m.Uuid, daos.PoolQueryMask(m.QueryMask), m.Version,
m.TotalEngines, m.TotalTargets, m.PoolLayoutVer, m.UpgradeLayoutVer, m.SvcLdr)
ranks := &ranklist.RankSet{}
for _, r := range m.SvcReps {
ranks.Add(ranklist.Rank(r))
}
fmt.Fprintf(&bld, "svc_ranks:%s ", ranks.String())
fmt.Fprintf(&bld, "ena_ranks:%s ", m.EnabledRanks)
fmt.Fprintf(&bld, "dis_ranks:%s ", m.DisabledRanks)
fmt.Fprintf(&bld, "dead_ranks:%s ", m.DeadRanks)
fmt.Fprintf(&bld, "rebuild:%+v ", m.Rebuild)
fmt.Fprintf(&bld, "tier_stats:%+v ", m.TierStats)
case *mgmtpb.PoolEvictReq:
fmt.Fprintf(&bld, "%T pool:%s", m, m.Id)
if len(m.Handles) > 0 {
Expand Down
39 changes: 23 additions & 16 deletions src/control/lib/daos/pool.go
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,9 @@ type (
MediaType StorageMediaType `json:"media_type"`
}

// PoolQueryOption is used to supply pool query options.
PoolQueryOption string

// PoolQueryMask implements a bitmask for pool query options.
PoolQueryMask C.uint64_t
)
Expand All @@ -112,15 +115,15 @@ const (
HealthOnlyPoolQueryMask = PoolQueryMask(^uint64(0) &^ (C.DPI_ENGINES_ENABLED | C.DPI_SPACE))

// PoolQueryOptionSpace retrieves storage space usage as part of the pool query.
PoolQueryOptionSpace = "space"
PoolQueryOptionSpace PoolQueryOption = "space"
// PoolQueryOptionRebuild retrieves pool rebuild status as part of the pool query.
PoolQueryOptionRebuild = "rebuild"
PoolQueryOptionRebuild PoolQueryOption = "rebuild"
// PoolQueryOptionEnabledEngines retrieves enabled engines as part of the pool query.
PoolQueryOptionEnabledEngines = "enabled_engines"
PoolQueryOptionEnabledEngines PoolQueryOption = "enabled_engines"
// PoolQueryOptionDisabledEngines retrieves disabled engines as part of the pool query.
PoolQueryOptionDisabledEngines = "disabled_engines"
PoolQueryOptionDisabledEngines PoolQueryOption = "disabled_engines"
// PoolQueryOptionDeadEngines retrieves dead engines as part of the pool query.
PoolQueryOptionDeadEngines = "dead_engines"
PoolQueryOptionDeadEngines PoolQueryOption = "dead_engines"

// PoolConnectFlagReadOnly indicates that the connection is read-only.
PoolConnectFlagReadOnly = C.DAOS_PC_RO
Expand All @@ -130,15 +133,19 @@ const (
PoolConnectFlagExclusive = C.DAOS_PC_EX
)

var poolQueryOptMap = map[C.int]string{
func (pqo PoolQueryOption) String() string {
return string(pqo)
}

var poolQueryOptMap = map[C.int]PoolQueryOption{
C.DPI_SPACE: PoolQueryOptionSpace,
C.DPI_REBUILD_STATUS: PoolQueryOptionRebuild,
C.DPI_ENGINES_ENABLED: PoolQueryOptionEnabledEngines,
C.DPI_ENGINES_DISABLED: PoolQueryOptionDisabledEngines,
C.DPI_ENGINES_DEAD: PoolQueryOptionDeadEngines,
}

func resolvePoolQueryOpt(name string) (C.int, error) {
func resolvePoolQueryOpt(name PoolQueryOption) (C.int, error) {
for opt, optName := range poolQueryOptMap {
if name == optName {
return opt, nil
Expand All @@ -149,16 +156,16 @@ func resolvePoolQueryOpt(name string) (C.int, error) {

// MustNewPoolQueryMask returns a PoolQueryMask initialized with the specified options.
// NB: If an error occurs due to an invalid option, it panics.
func MustNewPoolQueryMask(options ...string) (mask PoolQueryMask) {
func MustNewPoolQueryMask(options ...PoolQueryOption) (mask PoolQueryMask) {
if err := mask.SetOptions(options...); err != nil {
panic(err)
}
return
}

// SetOptions sets the pool query mask to include the specified options.
func (pqm *PoolQueryMask) SetOptions(optNames ...string) error {
for _, optName := range optNames {
func (pqm *PoolQueryMask) SetOptions(options ...PoolQueryOption) error {
for _, optName := range options {
if opt, err := resolvePoolQueryOpt(optName); err != nil {
return err
} else {
Expand All @@ -169,8 +176,8 @@ func (pqm *PoolQueryMask) SetOptions(optNames ...string) error {
}

// ClearOptions clears the pool query mask of the specified options.
func (pqm *PoolQueryMask) ClearOptions(optNames ...string) error {
for _, optName := range optNames {
func (pqm *PoolQueryMask) ClearOptions(options ...PoolQueryOption) error {
for _, optName := range options {
if opt, err := resolvePoolQueryOpt(optName); err != nil {
return err
} else {
Expand All @@ -191,8 +198,8 @@ func (pqm *PoolQueryMask) ClearAll() {
}

// HasOption returns true if the pool query mask includes the specified option.
func (pqm PoolQueryMask) HasOption(optName string) bool {
return strings.Contains(pqm.String(), optName)
func (pqm PoolQueryMask) HasOption(option PoolQueryOption) bool {
return strings.Contains(pqm.String(), option.String())
}

func (pqm PoolQueryMask) String() string {
Expand All @@ -201,7 +208,7 @@ func (pqm PoolQueryMask) String() string {
opt := C.int(1 << i)
if flag, ok := poolQueryOptMap[opt]; ok {
if pqm&PoolQueryMask(opt) != 0 {
flags = append(flags, flag)
flags = append(flags, flag.String())
}
}
}
Expand All @@ -228,7 +235,7 @@ func (pqm *PoolQueryMask) UnmarshalJSON(data []byte) error {
var newVal PoolQueryMask
for _, opt := range strings.Split(strings.Trim(string(data), "\""), ",") {
for k, v := range poolQueryOptMap {
if v == opt {
if v.String() == opt {
newVal |= PoolQueryMask(k)
goto next
}
Expand Down
43 changes: 37 additions & 6 deletions src/control/lib/daos/pool_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -114,8 +114,12 @@ func genTestMask(modifyFn func(pqm *PoolQueryMask)) PoolQueryMask {
return testMask
}

func genOptsStr(queryOpts ...string) string {
return strings.Join(queryOpts, ",")
func genOptsStr(queryOpts ...PoolQueryOption) string {
optStrs := make([]string, 0, len(queryOpts))
for _, opt := range queryOpts {
optStrs = append(optStrs, opt.String())
}
return strings.Join(optStrs, ",")
}

func TestDaos_PoolQueryMask(t *testing.T) {
Expand Down Expand Up @@ -156,7 +160,7 @@ func TestDaos_PoolQueryMask(t *testing.T) {
testMask: genTestMask(func(pqm *PoolQueryMask) {
pqm.SetOptions(PoolQueryOptionSpace)
}),
expString: PoolQueryOptionSpace,
expString: PoolQueryOptionSpace.String(),
},
"set query space=false": {
testMask: genTestMask(func(pqm *PoolQueryMask) {
Expand All @@ -176,7 +180,7 @@ func TestDaos_PoolQueryMask(t *testing.T) {
testMask: genTestMask(func(pqm *PoolQueryMask) {
pqm.SetOptions(PoolQueryOptionRebuild)
}),
expString: PoolQueryOptionRebuild,
expString: PoolQueryOptionRebuild.String(),
},
"set query rebuild=false": {
testMask: genTestMask(func(pqm *PoolQueryMask) {
Expand All @@ -189,7 +193,7 @@ func TestDaos_PoolQueryMask(t *testing.T) {
testMask: genTestMask(func(pqm *PoolQueryMask) {
pqm.SetOptions(PoolQueryOptionEnabledEngines)
}),
expString: PoolQueryOptionEnabledEngines,
expString: PoolQueryOptionEnabledEngines.String(),
},
"set query enabled_engines=false": {
testMask: genTestMask(func(pqm *PoolQueryMask) {
Expand All @@ -202,7 +206,7 @@ func TestDaos_PoolQueryMask(t *testing.T) {
testMask: genTestMask(func(pqm *PoolQueryMask) {
pqm.SetOptions(PoolQueryOptionDisabledEngines)
}),
expString: PoolQueryOptionDisabledEngines,
expString: PoolQueryOptionDisabledEngines.String(),
},
"set query disabled_engines=false": {
testMask: genTestMask(func(pqm *PoolQueryMask) {
Expand All @@ -220,6 +224,33 @@ func TestDaos_PoolQueryMask(t *testing.T) {
}
}

func TestDaos_PoolQueryMaskHasOption(t *testing.T) {
for name, tc := range map[string]struct {
testMask PoolQueryMask
testOpt PoolQueryOption
expHasOpt bool
}{
"empty shouldn't match anything": {
testOpt: PoolQueryOptionSpace,
expHasOpt: false,
},
"health-only shouldn't match space": {
testMask: HealthOnlyPoolQueryMask,
testOpt: PoolQueryOptionSpace,
expHasOpt: false,
},
"default should match space": {
testMask: DefaultPoolQueryMask,
testOpt: PoolQueryOptionSpace,
expHasOpt: true,
},
} {
t.Run(name, func(t *testing.T) {
test.AssertEqual(t, tc.expHasOpt, tc.testMask.HasOption(tc.testOpt), "unexpected HasOption result")
})
}
}

func TestDaos_PoolQueryMaskMarshalJSON(t *testing.T) {
// NB: The MarshalJSON implementation uses the stringer, so
// there's no point in testing all of the options here.
Expand Down
Loading