From 1891962b6dc09ab022455e0af0acc0f26371afb4 Mon Sep 17 00:00:00 2001 From: Michael MacDonald Date: Thu, 19 Dec 2024 10:10:15 -0500 Subject: [PATCH] DAOS-16829 client: Fix daos pool query regression (#15638) The patch landed for DAOS-16477 introduced a regression in the pool query functionality of the daos tool. This patch simplifies the pool query logic to retrieve enabled and disabled ranks. Dead ranks are not available to the client without pool query protocol changes. Signed-off-by: Michael MacDonald --- docs/admin/troubleshooting.md | 2 +- src/control/cmd/daos/pool.go | 109 ++++++++-------------------- src/control/common/proto/logging.go | 16 ++++ src/control/lib/daos/pool.go | 39 ++++++---- src/control/lib/daos/pool_test.go | 43 +++++++++-- 5 files changed, 109 insertions(+), 100 deletions(-) diff --git a/docs/admin/troubleshooting.md b/docs/admin/troubleshooting.md index 5de7b95412d..92e48e6e83e 100644 --- a/docs/admin/troubleshooting.md +++ b/docs/admin/troubleshooting.md @@ -556,7 +556,7 @@ Alternately, the administrator may erase and re-format the DAOS system to start ### Engines become unavailable -Engines may become unavailable due to server power losses and reboots, network switch failures, etc. After staying unavailable for a certain period of time, these engines may become "excluded" or "errored" in `dmg system query` output. Once the states of all engines stabilize (see [`CRT_EVENT_DELAY`](env_variables.md)), each pool will check whether there is enough redundancy (see [Pool RF](pool_operations.md#pool-redundancy-factor)) to tolerate the unavailability of the "excluded" or "errored" engines. If there is enough redundancy, these engines will be excluded from the pool ("disabled ranks" in `dmg pool query --health-only` output); otherwise, the pool will perform no exclusion ("suspect ranks" in `dmg pool query --health-only` output as described in [Querying a Pool](pool_operations.md#querying-a-pool)) and may become temporarily unavailable (as seen by timeouts of `dmg pool query`, `dmg pool list`, etc.). Similarly, when engines become available, whenever the states of all engines stabilize, each pool will perform the aforementioned check for any unavailable engines that remain. +Engines may become unavailable due to server power losses and reboots, network switch failures, etc. After staying unavailable for a certain period of time, these engines may become "excluded" or "errored" in `dmg system query` output. Once the states of all engines stabilize (see [`CRT_EVENT_DELAY`](env_variables.md)), each pool will check whether there is enough redundancy (see [Pool RF](pool_operations.md#pool-redundancy-factor)) to tolerate the unavailability of the "excluded" or "errored" engines. If there is enough redundancy, these engines will be excluded from the pool ("Disabled ranks" in `dmg pool query --health-only` output); otherwise, the pool will perform no exclusion ("Dead ranks" in `dmg pool query --health-only` output as described in [Querying a Pool](pool_operations.md#querying-a-pool)) and may become temporarily unavailable (as seen by timeouts of `dmg pool query`, `dmg pool list`, etc.). Similarly, when engines become available, whenever the states of all engines stabilize, each pool will perform the aforementioned check for any unavailable engines that remain. To restore availability as well as capacity and performance, try to start all "excluded" or "errored" engines. Starting all of them at the same time minimizes the chance of triggering rebuild jobs. In many cases, the following command suffices: ``` diff --git a/src/control/cmd/daos/pool.go b/src/control/cmd/daos/pool.go index 0066f3959b7..831a775db1a 100644 --- a/src/control/cmd/daos/pool.go +++ b/src/control/cmd/daos/pool.go @@ -295,21 +295,37 @@ func convertPoolInfo(pinfo *C.daos_pool_info_t) (*daos.PoolInfo, error) { return poolInfo, nil } -func queryPoolRankLists(poolHdl C.daos_handle_t, queryMask daos.PoolQueryMask) (*daos.PoolInfo, error) { - var rlPtr **C.d_rank_list_t = nil - var rl *C.d_rank_list_t = nil - - if queryMask.HasOption(daos.PoolQueryOptionEnabledEngines) || queryMask.HasOption(daos.PoolQueryOptionDisabledEngines) || - queryMask.HasOption(daos.PoolQueryOptionDeadEngines) { - rlPtr = &rl - } +func queryPool(poolHdl C.daos_handle_t, queryMask daos.PoolQueryMask) (*daos.PoolInfo, error) { + var enabledRanks *C.d_rank_list_t + var disabledRanks *C.d_rank_list_t + defer func() { + C.d_rank_list_free(enabledRanks) + C.d_rank_list_free(disabledRanks) + }() + var rc C.int cPoolInfo := C.daos_pool_info_t{ pi_bits: C.uint64_t(queryMask), } + if queryMask.HasOption(daos.PoolQueryOptionEnabledEngines) && queryMask.HasOption(daos.PoolQueryOptionDisabledEngines) { + enaQm := queryMask + enaQm.ClearOptions(daos.PoolQueryOptionDisabledEngines) + cPoolInfo.pi_bits = C.uint64_t(enaQm) + rc = C.daos_pool_query(poolHdl, &enabledRanks, &cPoolInfo, nil, nil) + if err := daosError(rc); err != nil { + return nil, err + } + + /* second query to just get disabled ranks */ + rc = C.daos_pool_query(poolHdl, &disabledRanks, nil, nil, nil) + } else if queryMask.HasOption(daos.PoolQueryOptionEnabledEngines) { + rc = C.daos_pool_query(poolHdl, &enabledRanks, &cPoolInfo, nil, nil) + } else if queryMask.HasOption(daos.PoolQueryOptionDisabledEngines) { + rc = C.daos_pool_query(poolHdl, &disabledRanks, &cPoolInfo, nil, nil) + } else { + rc = C.daos_pool_query(poolHdl, nil, &cPoolInfo, nil, nil) + } - rc := C.daos_pool_query(poolHdl, rlPtr, &cPoolInfo, nil, nil) - defer C.d_rank_list_free(rl) if err := daosError(rc); err != nil { return nil, err } @@ -318,79 +334,18 @@ func queryPoolRankLists(poolHdl C.daos_handle_t, queryMask daos.PoolQueryMask) ( if err != nil { return nil, err } + poolInfo.QueryMask = queryMask - if rlPtr != nil { - rs, err := rankSetFromC(rl) + if enabledRanks != nil { + poolInfo.EnabledRanks, err = rankSetFromC(enabledRanks) if err != nil { return nil, err } - if queryMask.HasOption(daos.PoolQueryOptionEnabledEngines) { - poolInfo.EnabledRanks = rs - } - if queryMask.HasOption(daos.PoolQueryOptionDisabledEngines) { - poolInfo.DisabledRanks = rs - } - if queryMask.HasOption(daos.PoolQueryOptionDeadEngines) { - poolInfo.DeadRanks = rs - } } - - return poolInfo, nil -} -func queryPool(poolHdl C.daos_handle_t, queryMask daos.PoolQueryMask) (*daos.PoolInfo, error) { - poolInfo := &daos.PoolInfo{} - originalMask := queryMask // Save the original queryMask - - // Function to handle the query and return a single RankList - queryAndUpdate := func(option string) error { - // Clear previous options and set new option - queryMask.ClearAll() - queryMask.SetOptions(option) - - poolInfo1, err := queryPoolRankLists(poolHdl, queryMask) + if disabledRanks != nil { + poolInfo.DisabledRanks, err = rankSetFromC(disabledRanks) if err != nil { - return err - } - - switch option { - case daos.PoolQueryOptionEnabledEngines: - poolInfo.EnabledRanks = poolInfo1.EnabledRanks - case daos.PoolQueryOptionDisabledEngines: - poolInfo.DisabledRanks = poolInfo1.DisabledRanks - case daos.PoolQueryOptionDeadEngines: - poolInfo.DeadRanks = poolInfo1.DeadRanks - } - return nil - } - - // Preprocess queryMask, select one option for the first query - var firstOption string - if originalMask.HasOption(daos.PoolQueryOptionEnabledEngines) { - firstOption = daos.PoolQueryOptionEnabledEngines - } else if originalMask.HasOption(daos.PoolQueryOptionDisabledEngines) { - firstOption = daos.PoolQueryOptionDisabledEngines - } else if originalMask.HasOption(daos.PoolQueryOptionDeadEngines) { - firstOption = daos.PoolQueryOptionDeadEngines - } - - // Perform the first query to get basic information - if err := queryAndUpdate(firstOption); err != nil { - return nil, err - } - - // Check the original query mask and update fields as needed - queryOptions := []string{ - daos.PoolQueryOptionEnabledEngines, - daos.PoolQueryOptionDisabledEngines, - daos.PoolQueryOptionDeadEngines, - } - - // Process each option sequentially - for _, opt := range queryOptions { - if originalMask.HasOption(opt) && opt != firstOption { - if err := queryAndUpdate(opt); err != nil { - return nil, err - } + return nil, err } } diff --git a/src/control/common/proto/logging.go b/src/control/common/proto/logging.go index 7a9677ed2d8..627f92cfc64 100644 --- a/src/control/common/proto/logging.go +++ b/src/control/common/proto/logging.go @@ -116,6 +116,22 @@ func Debug(msg proto.Message) string { } fmt.Fprintf(&bld, "meta-file-size: %s (%d)", humanize.Bytes(m.MemFileBytes), m.MemFileBytes) + case *mgmtpb.PoolQueryReq: + fmt.Fprintf(&bld, "%T id:%s qm:%s", m, m.Id, daos.PoolQueryMask(m.QueryMask)) + case *mgmtpb.PoolQueryResp: + fmt.Fprintf(&bld, "%T status:%s uuid:%s qm:%s map:%d tot(eng/tgts):%d/%d ver(p/u):%d/%d svc_ldr:%d ", + m, daos.Status(m.Status), m.Uuid, daos.PoolQueryMask(m.QueryMask), m.Version, + m.TotalEngines, m.TotalTargets, m.PoolLayoutVer, m.UpgradeLayoutVer, m.SvcLdr) + ranks := &ranklist.RankSet{} + for _, r := range m.SvcReps { + ranks.Add(ranklist.Rank(r)) + } + fmt.Fprintf(&bld, "svc_ranks:%s ", ranks.String()) + fmt.Fprintf(&bld, "ena_ranks:%s ", m.EnabledRanks) + fmt.Fprintf(&bld, "dis_ranks:%s ", m.DisabledRanks) + fmt.Fprintf(&bld, "dead_ranks:%s ", m.DeadRanks) + fmt.Fprintf(&bld, "rebuild:%+v ", m.Rebuild) + fmt.Fprintf(&bld, "tier_stats:%+v ", m.TierStats) case *mgmtpb.PoolEvictReq: fmt.Fprintf(&bld, "%T pool:%s", m, m.Id) if len(m.Handles) > 0 { diff --git a/src/control/lib/daos/pool.go b/src/control/lib/daos/pool.go index 3d26b7a2d4e..6555a8cf721 100644 --- a/src/control/lib/daos/pool.go +++ b/src/control/lib/daos/pool.go @@ -101,6 +101,9 @@ type ( MediaType StorageMediaType `json:"media_type"` } + // PoolQueryOption is used to supply pool query options. + PoolQueryOption string + // PoolQueryMask implements a bitmask for pool query options. PoolQueryMask C.uint64_t ) @@ -112,15 +115,15 @@ const ( HealthOnlyPoolQueryMask = PoolQueryMask(^uint64(0) &^ (C.DPI_ENGINES_ENABLED | C.DPI_SPACE)) // PoolQueryOptionSpace retrieves storage space usage as part of the pool query. - PoolQueryOptionSpace = "space" + PoolQueryOptionSpace PoolQueryOption = "space" // PoolQueryOptionRebuild retrieves pool rebuild status as part of the pool query. - PoolQueryOptionRebuild = "rebuild" + PoolQueryOptionRebuild PoolQueryOption = "rebuild" // PoolQueryOptionEnabledEngines retrieves enabled engines as part of the pool query. - PoolQueryOptionEnabledEngines = "enabled_engines" + PoolQueryOptionEnabledEngines PoolQueryOption = "enabled_engines" // PoolQueryOptionDisabledEngines retrieves disabled engines as part of the pool query. - PoolQueryOptionDisabledEngines = "disabled_engines" + PoolQueryOptionDisabledEngines PoolQueryOption = "disabled_engines" // PoolQueryOptionDeadEngines retrieves dead engines as part of the pool query. - PoolQueryOptionDeadEngines = "dead_engines" + PoolQueryOptionDeadEngines PoolQueryOption = "dead_engines" // PoolConnectFlagReadOnly indicates that the connection is read-only. PoolConnectFlagReadOnly = C.DAOS_PC_RO @@ -130,7 +133,11 @@ const ( PoolConnectFlagExclusive = C.DAOS_PC_EX ) -var poolQueryOptMap = map[C.int]string{ +func (pqo PoolQueryOption) String() string { + return string(pqo) +} + +var poolQueryOptMap = map[C.int]PoolQueryOption{ C.DPI_SPACE: PoolQueryOptionSpace, C.DPI_REBUILD_STATUS: PoolQueryOptionRebuild, C.DPI_ENGINES_ENABLED: PoolQueryOptionEnabledEngines, @@ -138,7 +145,7 @@ var poolQueryOptMap = map[C.int]string{ C.DPI_ENGINES_DEAD: PoolQueryOptionDeadEngines, } -func resolvePoolQueryOpt(name string) (C.int, error) { +func resolvePoolQueryOpt(name PoolQueryOption) (C.int, error) { for opt, optName := range poolQueryOptMap { if name == optName { return opt, nil @@ -149,7 +156,7 @@ func resolvePoolQueryOpt(name string) (C.int, error) { // MustNewPoolQueryMask returns a PoolQueryMask initialized with the specified options. // NB: If an error occurs due to an invalid option, it panics. -func MustNewPoolQueryMask(options ...string) (mask PoolQueryMask) { +func MustNewPoolQueryMask(options ...PoolQueryOption) (mask PoolQueryMask) { if err := mask.SetOptions(options...); err != nil { panic(err) } @@ -157,8 +164,8 @@ func MustNewPoolQueryMask(options ...string) (mask PoolQueryMask) { } // SetOptions sets the pool query mask to include the specified options. -func (pqm *PoolQueryMask) SetOptions(optNames ...string) error { - for _, optName := range optNames { +func (pqm *PoolQueryMask) SetOptions(options ...PoolQueryOption) error { + for _, optName := range options { if opt, err := resolvePoolQueryOpt(optName); err != nil { return err } else { @@ -169,8 +176,8 @@ func (pqm *PoolQueryMask) SetOptions(optNames ...string) error { } // ClearOptions clears the pool query mask of the specified options. -func (pqm *PoolQueryMask) ClearOptions(optNames ...string) error { - for _, optName := range optNames { +func (pqm *PoolQueryMask) ClearOptions(options ...PoolQueryOption) error { + for _, optName := range options { if opt, err := resolvePoolQueryOpt(optName); err != nil { return err } else { @@ -191,8 +198,8 @@ func (pqm *PoolQueryMask) ClearAll() { } // HasOption returns true if the pool query mask includes the specified option. -func (pqm PoolQueryMask) HasOption(optName string) bool { - return strings.Contains(pqm.String(), optName) +func (pqm PoolQueryMask) HasOption(option PoolQueryOption) bool { + return strings.Contains(pqm.String(), option.String()) } func (pqm PoolQueryMask) String() string { @@ -201,7 +208,7 @@ func (pqm PoolQueryMask) String() string { opt := C.int(1 << i) if flag, ok := poolQueryOptMap[opt]; ok { if pqm&PoolQueryMask(opt) != 0 { - flags = append(flags, flag) + flags = append(flags, flag.String()) } } } @@ -228,7 +235,7 @@ func (pqm *PoolQueryMask) UnmarshalJSON(data []byte) error { var newVal PoolQueryMask for _, opt := range strings.Split(strings.Trim(string(data), "\""), ",") { for k, v := range poolQueryOptMap { - if v == opt { + if v.String() == opt { newVal |= PoolQueryMask(k) goto next } diff --git a/src/control/lib/daos/pool_test.go b/src/control/lib/daos/pool_test.go index 8cf5f71312f..4cec1466844 100644 --- a/src/control/lib/daos/pool_test.go +++ b/src/control/lib/daos/pool_test.go @@ -114,8 +114,12 @@ func genTestMask(modifyFn func(pqm *PoolQueryMask)) PoolQueryMask { return testMask } -func genOptsStr(queryOpts ...string) string { - return strings.Join(queryOpts, ",") +func genOptsStr(queryOpts ...PoolQueryOption) string { + optStrs := make([]string, 0, len(queryOpts)) + for _, opt := range queryOpts { + optStrs = append(optStrs, opt.String()) + } + return strings.Join(optStrs, ",") } func TestDaos_PoolQueryMask(t *testing.T) { @@ -156,7 +160,7 @@ func TestDaos_PoolQueryMask(t *testing.T) { testMask: genTestMask(func(pqm *PoolQueryMask) { pqm.SetOptions(PoolQueryOptionSpace) }), - expString: PoolQueryOptionSpace, + expString: PoolQueryOptionSpace.String(), }, "set query space=false": { testMask: genTestMask(func(pqm *PoolQueryMask) { @@ -176,7 +180,7 @@ func TestDaos_PoolQueryMask(t *testing.T) { testMask: genTestMask(func(pqm *PoolQueryMask) { pqm.SetOptions(PoolQueryOptionRebuild) }), - expString: PoolQueryOptionRebuild, + expString: PoolQueryOptionRebuild.String(), }, "set query rebuild=false": { testMask: genTestMask(func(pqm *PoolQueryMask) { @@ -189,7 +193,7 @@ func TestDaos_PoolQueryMask(t *testing.T) { testMask: genTestMask(func(pqm *PoolQueryMask) { pqm.SetOptions(PoolQueryOptionEnabledEngines) }), - expString: PoolQueryOptionEnabledEngines, + expString: PoolQueryOptionEnabledEngines.String(), }, "set query enabled_engines=false": { testMask: genTestMask(func(pqm *PoolQueryMask) { @@ -202,7 +206,7 @@ func TestDaos_PoolQueryMask(t *testing.T) { testMask: genTestMask(func(pqm *PoolQueryMask) { pqm.SetOptions(PoolQueryOptionDisabledEngines) }), - expString: PoolQueryOptionDisabledEngines, + expString: PoolQueryOptionDisabledEngines.String(), }, "set query disabled_engines=false": { testMask: genTestMask(func(pqm *PoolQueryMask) { @@ -220,6 +224,33 @@ func TestDaos_PoolQueryMask(t *testing.T) { } } +func TestDaos_PoolQueryMaskHasOption(t *testing.T) { + for name, tc := range map[string]struct { + testMask PoolQueryMask + testOpt PoolQueryOption + expHasOpt bool + }{ + "empty shouldn't match anything": { + testOpt: PoolQueryOptionSpace, + expHasOpt: false, + }, + "health-only shouldn't match space": { + testMask: HealthOnlyPoolQueryMask, + testOpt: PoolQueryOptionSpace, + expHasOpt: false, + }, + "default should match space": { + testMask: DefaultPoolQueryMask, + testOpt: PoolQueryOptionSpace, + expHasOpt: true, + }, + } { + t.Run(name, func(t *testing.T) { + test.AssertEqual(t, tc.expHasOpt, tc.testMask.HasOption(tc.testOpt), "unexpected HasOption result") + }) + } +} + func TestDaos_PoolQueryMaskMarshalJSON(t *testing.T) { // NB: The MarshalJSON implementation uses the stringer, so // there's no point in testing all of the options here.