Skip to content

Commit

Permalink
DAOS-16829 client: Fix daos pool query regression
Browse files Browse the repository at this point in the history
The patch landed for DAOS-16477 introduced a regression
in the pool query functionality of the daos tool. This
patch simplifies the pool query logic to retrieve
enabled and disabled ranks. Dead ranks are not available
to the client without pool query protocol changes.

Required-githooks: true
Signed-off-by: Michael MacDonald <[email protected]>
  • Loading branch information
mjmac committed Dec 19, 2024
1 parent 41b089d commit 602dbad
Show file tree
Hide file tree
Showing 5 changed files with 123 additions and 99 deletions.
15 changes: 15 additions & 0 deletions docs/admin/troubleshooting.md
Original file line number Diff line number Diff line change
Expand Up @@ -554,6 +554,21 @@ To resolve the issue:

Alternately, the administrator may erase and re-format the DAOS system to start over fresh using the new addresses.

### Engines become unavailable

Engines may become unavailable due to server power losses and reboots, network switch failures, etc. After staying unavailable for a certain period of time, these engines may become "excluded" or "errored" in `dmg system query` output. Once the states of all engines stabilize (see [`CRT_EVENT_DELAY`](env_variables.md)), each pool will check whether there is enough redundancy (see [Pool RF](pool_operations.md#pool-redundancy-factor)) to tolerate the unavailability of the "excluded" or "errored" engines. If there is enough redundancy, these engines will be excluded from the pool ("Disabled ranks" in `dmg pool query --health-only` output); otherwise, the pool will perform no exclusion ("Dead ranks" in `dmg pool query --health-only` output as described in [Querying a Pool](pool_operations.md#querying-a-pool)) and may become temporarily unavailable (as seen by timeouts of `dmg pool query`, `dmg pool list`, etc.). Similarly, when engines become available, whenever the states of all engines stabilize, each pool will perform the aforementioned check for any unavailable engines that remain.

To restore availability as well as capacity and performance, try to start all "excluded" or "errored" engines. Starting all of them at the same time minimizes the chance of triggering rebuild jobs. In many cases, the following command suffices:
```
$ dmg system start
```
If some pools remain unavailable (e.g., `dmg pool list` keeps timing out) after the previous step, restart the whole system:
```
$ dmg system stop --force
$ dmg system start
```
If some engines have been excluded from certain pools, and they are available again, reintegrate them to the pools.

## Diagnostic and Recovery Tools

!!! WARNING : Please be careful and use this tool under supervision of DAOS support team.
Expand Down
109 changes: 32 additions & 77 deletions src/control/cmd/daos/pool.go
Original file line number Diff line number Diff line change
Expand Up @@ -295,21 +295,37 @@ func convertPoolInfo(pinfo *C.daos_pool_info_t) (*daos.PoolInfo, error) {
return poolInfo, nil
}

func queryPoolRankLists(poolHdl C.daos_handle_t, queryMask daos.PoolQueryMask) (*daos.PoolInfo, error) {
var rlPtr **C.d_rank_list_t = nil
var rl *C.d_rank_list_t = nil

if queryMask.HasOption(daos.PoolQueryOptionEnabledEngines) || queryMask.HasOption(daos.PoolQueryOptionDisabledEngines) ||
queryMask.HasOption(daos.PoolQueryOptionDeadEngines) {
rlPtr = &rl
}
func queryPool(poolHdl C.daos_handle_t, queryMask daos.PoolQueryMask) (*daos.PoolInfo, error) {
var enabledRanks *C.d_rank_list_t
var disabledRanks *C.d_rank_list_t
defer func() {
C.d_rank_list_free(enabledRanks)
C.d_rank_list_free(disabledRanks)
}()

var rc C.int
cPoolInfo := C.daos_pool_info_t{
pi_bits: C.uint64_t(queryMask),
}
if queryMask.HasOption(daos.PoolQueryOptionEnabledEngines) && queryMask.HasOption(daos.PoolQueryOptionDisabledEngines) {
enaQm := queryMask
enaQm.ClearOptions(daos.PoolQueryOptionDisabledEngines)
cPoolInfo.pi_bits = C.uint64_t(enaQm)
rc = C.daos_pool_query(poolHdl, &enabledRanks, &cPoolInfo, nil, nil)
if err := daosError(rc); err != nil {
return nil, err
}

/* second query to just get disabled ranks */
rc = C.daos_pool_query(poolHdl, &disabledRanks, nil, nil, nil)
} else if queryMask.HasOption(daos.PoolQueryOptionEnabledEngines) {
rc = C.daos_pool_query(poolHdl, &enabledRanks, &cPoolInfo, nil, nil)
} else if queryMask.HasOption(daos.PoolQueryOptionDisabledEngines) {
rc = C.daos_pool_query(poolHdl, &disabledRanks, &cPoolInfo, nil, nil)
} else {
rc = C.daos_pool_query(poolHdl, nil, &cPoolInfo, nil, nil)
}

rc := C.daos_pool_query(poolHdl, rlPtr, &cPoolInfo, nil, nil)
defer C.d_rank_list_free(rl)
if err := daosError(rc); err != nil {
return nil, err
}
Expand All @@ -318,79 +334,18 @@ func queryPoolRankLists(poolHdl C.daos_handle_t, queryMask daos.PoolQueryMask) (
if err != nil {
return nil, err
}
poolInfo.QueryMask = queryMask

if rlPtr != nil {
rs, err := rankSetFromC(rl)
if enabledRanks != nil {
poolInfo.EnabledRanks, err = rankSetFromC(enabledRanks)
if err != nil {
return nil, err
}
if queryMask.HasOption(daos.PoolQueryOptionEnabledEngines) {
poolInfo.EnabledRanks = rs
}
if queryMask.HasOption(daos.PoolQueryOptionDisabledEngines) {
poolInfo.DisabledRanks = rs
}
if queryMask.HasOption(daos.PoolQueryOptionDeadEngines) {
poolInfo.DeadRanks = rs
}
}

return poolInfo, nil
}
func queryPool(poolHdl C.daos_handle_t, queryMask daos.PoolQueryMask) (*daos.PoolInfo, error) {
poolInfo := &daos.PoolInfo{}
originalMask := queryMask // Save the original queryMask

// Function to handle the query and return a single RankList
queryAndUpdate := func(option string) error {
// Clear previous options and set new option
queryMask.ClearAll()
queryMask.SetOptions(option)

poolInfo1, err := queryPoolRankLists(poolHdl, queryMask)
if disabledRanks != nil {
poolInfo.DisabledRanks, err = rankSetFromC(disabledRanks)
if err != nil {
return err
}

switch option {
case daos.PoolQueryOptionEnabledEngines:
poolInfo.EnabledRanks = poolInfo1.EnabledRanks
case daos.PoolQueryOptionDisabledEngines:
poolInfo.DisabledRanks = poolInfo1.DisabledRanks
case daos.PoolQueryOptionDeadEngines:
poolInfo.DeadRanks = poolInfo1.DeadRanks
}
return nil
}

// Preprocess queryMask, select one option for the first query
var firstOption string
if originalMask.HasOption(daos.PoolQueryOptionEnabledEngines) {
firstOption = daos.PoolQueryOptionEnabledEngines
} else if originalMask.HasOption(daos.PoolQueryOptionDisabledEngines) {
firstOption = daos.PoolQueryOptionDisabledEngines
} else if originalMask.HasOption(daos.PoolQueryOptionDeadEngines) {
firstOption = daos.PoolQueryOptionDeadEngines
}

// Perform the first query to get basic information
if err := queryAndUpdate(firstOption); err != nil {
return nil, err
}

// Check the original query mask and update fields as needed
queryOptions := []string{
daos.PoolQueryOptionEnabledEngines,
daos.PoolQueryOptionDisabledEngines,
daos.PoolQueryOptionDeadEngines,
}

// Process each option sequentially
for _, opt := range queryOptions {
if originalMask.HasOption(opt) && opt != firstOption {
if err := queryAndUpdate(opt); err != nil {
return nil, err
}
return nil, err
}
}

Expand Down
16 changes: 16 additions & 0 deletions src/control/common/proto/logging.go
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,22 @@ func Debug(msg proto.Message) string {
for i, b := range m.TierBytes {
fmt.Fprintf(&bld, "%d:%d ", i, b)
}
case *mgmtpb.PoolQueryReq:
fmt.Fprintf(&bld, "%T id:%s qm:%s", m, m.Id, daos.PoolQueryMask(m.QueryMask))
case *mgmtpb.PoolQueryResp:
fmt.Fprintf(&bld, "%T status:%s uuid:%s qm:%s map:%d tot(eng/tgts):%d/%d ver(p/u):%d/%d svc_ldr:%d ",
m, daos.Status(m.Status), m.Uuid, daos.PoolQueryMask(m.QueryMask), m.Version,
m.TotalEngines, m.TotalTargets, m.PoolLayoutVer, m.UpgradeLayoutVer, m.SvcLdr)
ranks := &ranklist.RankSet{}
for _, r := range m.SvcReps {
ranks.Add(ranklist.Rank(r))
}
fmt.Fprintf(&bld, "svc_ranks:%s ", ranks.String())
fmt.Fprintf(&bld, "ena_ranks:%s ", m.EnabledRanks)
fmt.Fprintf(&bld, "dis_ranks:%s ", m.DisabledRanks)
fmt.Fprintf(&bld, "dead_ranks:%s ", m.DeadRanks)
fmt.Fprintf(&bld, "rebuild:%+v ", m.Rebuild)
fmt.Fprintf(&bld, "tier_stats:%+v ", m.TierStats)
case *mgmtpb.PoolEvictReq:
fmt.Fprintf(&bld, "%T pool:%s", m, m.Id)
if len(m.Handles) > 0 {
Expand Down
39 changes: 23 additions & 16 deletions src/control/lib/daos/pool.go
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,9 @@ type (
MediaType StorageMediaType `json:"media_type"`
}

// PoolQueryOption is used to supply pool query options.
PoolQueryOption string

// PoolQueryMask implements a bitmask for pool query options.
PoolQueryMask C.uint64_t
)
Expand All @@ -110,15 +113,15 @@ const (
HealthOnlyPoolQueryMask = PoolQueryMask(^uint64(0) &^ (C.DPI_ENGINES_ENABLED | C.DPI_SPACE))

// PoolQueryOptionSpace retrieves storage space usage as part of the pool query.
PoolQueryOptionSpace = "space"
PoolQueryOptionSpace PoolQueryOption = "space"
// PoolQueryOptionRebuild retrieves pool rebuild status as part of the pool query.
PoolQueryOptionRebuild = "rebuild"
PoolQueryOptionRebuild PoolQueryOption = "rebuild"
// PoolQueryOptionEnabledEngines retrieves enabled engines as part of the pool query.
PoolQueryOptionEnabledEngines = "enabled_engines"
PoolQueryOptionEnabledEngines PoolQueryOption = "enabled_engines"
// PoolQueryOptionDisabledEngines retrieves disabled engines as part of the pool query.
PoolQueryOptionDisabledEngines = "disabled_engines"
PoolQueryOptionDisabledEngines PoolQueryOption = "disabled_engines"
// PoolQueryOptionDeadEngines retrieves dead engines as part of the pool query.
PoolQueryOptionDeadEngines = "dead_engines"
PoolQueryOptionDeadEngines PoolQueryOption = "dead_engines"

// PoolConnectFlagReadOnly indicates that the connection is read-only.
PoolConnectFlagReadOnly = C.DAOS_PC_RO
Expand All @@ -128,15 +131,19 @@ const (
PoolConnectFlagExclusive = C.DAOS_PC_EX
)

var poolQueryOptMap = map[C.int]string{
func (pqo PoolQueryOption) String() string {
return string(pqo)
}

var poolQueryOptMap = map[C.int]PoolQueryOption{
C.DPI_SPACE: PoolQueryOptionSpace,
C.DPI_REBUILD_STATUS: PoolQueryOptionRebuild,
C.DPI_ENGINES_ENABLED: PoolQueryOptionEnabledEngines,
C.DPI_ENGINES_DISABLED: PoolQueryOptionDisabledEngines,
C.DPI_ENGINES_DEAD: PoolQueryOptionDeadEngines,
}

func resolvePoolQueryOpt(name string) (C.int, error) {
func resolvePoolQueryOpt(name PoolQueryOption) (C.int, error) {
for opt, optName := range poolQueryOptMap {
if name == optName {
return opt, nil
Expand All @@ -147,16 +154,16 @@ func resolvePoolQueryOpt(name string) (C.int, error) {

// MustNewPoolQueryMask returns a PoolQueryMask initialized with the specified options.
// NB: If an error occurs due to an invalid option, it panics.
func MustNewPoolQueryMask(options ...string) (mask PoolQueryMask) {
func MustNewPoolQueryMask(options ...PoolQueryOption) (mask PoolQueryMask) {
if err := mask.SetOptions(options...); err != nil {
panic(err)
}
return
}

// SetOptions sets the pool query mask to include the specified options.
func (pqm *PoolQueryMask) SetOptions(optNames ...string) error {
for _, optName := range optNames {
func (pqm *PoolQueryMask) SetOptions(options ...PoolQueryOption) error {
for _, optName := range options {
if opt, err := resolvePoolQueryOpt(optName); err != nil {
return err
} else {
Expand All @@ -167,8 +174,8 @@ func (pqm *PoolQueryMask) SetOptions(optNames ...string) error {
}

// ClearOptions clears the pool query mask of the specified options.
func (pqm *PoolQueryMask) ClearOptions(optNames ...string) error {
for _, optName := range optNames {
func (pqm *PoolQueryMask) ClearOptions(options ...PoolQueryOption) error {
for _, optName := range options {
if opt, err := resolvePoolQueryOpt(optName); err != nil {
return err
} else {
Expand All @@ -189,8 +196,8 @@ func (pqm *PoolQueryMask) ClearAll() {
}

// HasOption returns true if the pool query mask includes the specified option.
func (pqm PoolQueryMask) HasOption(optName string) bool {
return strings.Contains(pqm.String(), optName)
func (pqm PoolQueryMask) HasOption(option PoolQueryOption) bool {
return strings.Contains(pqm.String(), option.String())
}

func (pqm PoolQueryMask) String() string {
Expand All @@ -199,7 +206,7 @@ func (pqm PoolQueryMask) String() string {
opt := C.int(1 << i)
if flag, ok := poolQueryOptMap[opt]; ok {
if pqm&PoolQueryMask(opt) != 0 {
flags = append(flags, flag)
flags = append(flags, flag.String())
}
}
}
Expand All @@ -226,7 +233,7 @@ func (pqm *PoolQueryMask) UnmarshalJSON(data []byte) error {
var newVal PoolQueryMask
for _, opt := range strings.Split(strings.Trim(string(data), "\""), ",") {
for k, v := range poolQueryOptMap {
if v == opt {
if v.String() == opt {
newVal |= PoolQueryMask(k)
goto next
}
Expand Down
43 changes: 37 additions & 6 deletions src/control/lib/daos/pool_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -114,8 +114,12 @@ func genTestMask(modifyFn func(pqm *PoolQueryMask)) PoolQueryMask {
return testMask
}

func genOptsStr(queryOpts ...string) string {
return strings.Join(queryOpts, ",")
func genOptsStr(queryOpts ...PoolQueryOption) string {
optStrs := make([]string, 0, len(queryOpts))
for _, opt := range queryOpts {
optStrs = append(optStrs, opt.String())
}
return strings.Join(optStrs, ",")
}

func TestDaos_PoolQueryMask(t *testing.T) {
Expand Down Expand Up @@ -156,7 +160,7 @@ func TestDaos_PoolQueryMask(t *testing.T) {
testMask: genTestMask(func(pqm *PoolQueryMask) {
pqm.SetOptions(PoolQueryOptionSpace)
}),
expString: PoolQueryOptionSpace,
expString: PoolQueryOptionSpace.String(),
},
"set query space=false": {
testMask: genTestMask(func(pqm *PoolQueryMask) {
Expand All @@ -176,7 +180,7 @@ func TestDaos_PoolQueryMask(t *testing.T) {
testMask: genTestMask(func(pqm *PoolQueryMask) {
pqm.SetOptions(PoolQueryOptionRebuild)
}),
expString: PoolQueryOptionRebuild,
expString: PoolQueryOptionRebuild.String(),
},
"set query rebuild=false": {
testMask: genTestMask(func(pqm *PoolQueryMask) {
Expand All @@ -189,7 +193,7 @@ func TestDaos_PoolQueryMask(t *testing.T) {
testMask: genTestMask(func(pqm *PoolQueryMask) {
pqm.SetOptions(PoolQueryOptionEnabledEngines)
}),
expString: PoolQueryOptionEnabledEngines,
expString: PoolQueryOptionEnabledEngines.String(),
},
"set query enabled_engines=false": {
testMask: genTestMask(func(pqm *PoolQueryMask) {
Expand All @@ -202,7 +206,7 @@ func TestDaos_PoolQueryMask(t *testing.T) {
testMask: genTestMask(func(pqm *PoolQueryMask) {
pqm.SetOptions(PoolQueryOptionDisabledEngines)
}),
expString: PoolQueryOptionDisabledEngines,
expString: PoolQueryOptionDisabledEngines.String(),
},
"set query disabled_engines=false": {
testMask: genTestMask(func(pqm *PoolQueryMask) {
Expand All @@ -220,6 +224,33 @@ func TestDaos_PoolQueryMask(t *testing.T) {
}
}

func TestDaos_PoolQueryMaskHasOption(t *testing.T) {
for name, tc := range map[string]struct {
testMask PoolQueryMask
testOpt PoolQueryOption
expHasOpt bool
}{
"empty shouldn't match anything": {
testOpt: PoolQueryOptionSpace,
expHasOpt: false,
},
"health-only shouldn't match space": {
testMask: HealthOnlyPoolQueryMask,
testOpt: PoolQueryOptionSpace,
expHasOpt: false,
},
"default should match space": {
testMask: DefaultPoolQueryMask,
testOpt: PoolQueryOptionSpace,
expHasOpt: true,
},
} {
t.Run(name, func(t *testing.T) {
test.AssertEqual(t, tc.expHasOpt, tc.testMask.HasOption(tc.testOpt), "unexpected HasOption result")
})
}
}

func TestDaos_PoolQueryMaskMarshalJSON(t *testing.T) {
// NB: The MarshalJSON implementation uses the stringer, so
// there's no point in testing all of the options here.
Expand Down

0 comments on commit 602dbad

Please sign in to comment.