From 7c5e4433708572773f187515525342d313379bf0 Mon Sep 17 00:00:00 2001 From: Michael MacDonald Date: Tue, 21 Mar 2023 07:56:59 -0400 Subject: [PATCH] DAOS-12189 chk: Include removed pools in query count (#11746) When a pool has been removed, it will no longer appear in the pools list for the response. We can still detect that a pool was removed by checking the reports list, however. If we find a removed pool, include it in the count of checked pools when displaying the query results. Includes misc. improvements to JSON output as well. Signed-off-by: Michael MacDonald --- src/control/cmd/dmg/pretty/check.go | 34 ++-- src/control/cmd/dmg/pretty/check_test.go | 191 +++++++++++++++++++++++ src/control/lib/control/check.go | 50 +++++- 3 files changed, 261 insertions(+), 14 deletions(-) create mode 100644 src/control/cmd/dmg/pretty/check_test.go diff --git a/src/control/cmd/dmg/pretty/check.go b/src/control/cmd/dmg/pretty/check.go index f7ac2aa0815..e056bf817cb 100644 --- a/src/control/cmd/dmg/pretty/check.go +++ b/src/control/cmd/dmg/pretty/check.go @@ -9,9 +9,11 @@ package pretty import ( "fmt" "io" + "sort" "github.com/dustin/go-humanize/english" + "github.com/daos-stack/daos/src/control/common" "github.com/daos-stack/daos/src/control/lib/control" "github.com/daos-stack/daos/src/control/lib/txtfmt" ) @@ -51,16 +53,25 @@ func countResultPools(resp *control.SystemCheckQueryResp) int { } poolMap[pool.UUID] = struct{}{} } + for _, report := range resp.Reports { + if report.IsRemovedPool() && report.PoolUuid != "" { + poolMap[report.PoolUuid] = struct{}{} + } + } return len(poolMap) } func PrintCheckQueryResp(out io.Writer, resp *control.SystemCheckQueryResp, verbose bool) { fmt.Fprintln(out, "DAOS System Checker Info") + if resp == nil { + fmt.Fprintln(out, " No results found.") + return + } statusMsg := fmt.Sprintf("Current status: %s", resp.Status) if resp.Status > control.SystemCheckStatusInit && resp.Status < control.SystemCheckStatusCompleted { - statusMsg += fmt.Sprintf(" (started at: %s)", resp.StartTime) + statusMsg += fmt.Sprintf(" (started at: %s)", common.FormatTime(resp.StartTime)) } fmt.Fprintf(out, " %s\n", statusMsg) fmt.Fprintf(out, " Current phase: %s (%s)\n", resp.ScanPhase, resp.ScanPhase.Description()) @@ -77,18 +88,23 @@ func PrintCheckQueryResp(out io.Writer, resp *control.SystemCheckQueryResp, verb fmt.Fprintf(out, " %s %s\n", action, english.Plural(poolCount, "pool", "")) } - if len(resp.Pools) > 0 { - if verbose { - fmt.Fprintln(out, "\nPer-Pool Checker Info:") - for _, pool := range resp.Pools { - fmt.Fprintf(out, " %+v\n", pool) - } + if len(resp.Pools) > 0 && verbose { + pools := make([]*control.SystemCheckPoolInfo, 0, len(resp.Pools)) + for _, pool := range resp.Pools { + pools = append(pools, pool) + } + sort.Slice(pools, func(i, j int) bool { + return pools[i].UUID < pools[j].UUID + }) + fmt.Fprintln(out, "\nPer-Pool Checker Info:") + for _, pool := range pools { + fmt.Fprintf(out, " %+v\n", pool) } - fmt.Fprintln(out) } + fmt.Fprintln(out) if len(resp.Reports) == 0 { - fmt.Fprintln(out, "No reports to display") + fmt.Fprintln(out, "No reports to display.") return } diff --git a/src/control/cmd/dmg/pretty/check_test.go b/src/control/cmd/dmg/pretty/check_test.go new file mode 100644 index 00000000000..5f21021cd19 --- /dev/null +++ b/src/control/cmd/dmg/pretty/check_test.go @@ -0,0 +1,191 @@ +// +// (C) Copyright 2023 Intel Corporation. +// +// SPDX-License-Identifier: BSD-2-Clause-Patent +// + +package pretty_test + +import ( + "bytes" + "strings" + "testing" + "time" + + "github.com/google/go-cmp/cmp" + + "github.com/daos-stack/daos/src/control/cmd/dmg/pretty" + chkpb "github.com/daos-stack/daos/src/control/common/proto/chk" + "github.com/daos-stack/daos/src/control/lib/control" +) + +func TestPretty_PrintCheckQueryResp(t *testing.T) { + checkTime, err := time.Parse(time.RFC822Z, "20 Mar 23 10:07 -0500") + if err != nil { + t.Fatal(err) + } + + for name, tc := range map[string]struct { + resp *control.SystemCheckQueryResp + verbose bool + expOut string + }{ + "empty": { + expOut: ` +DAOS System Checker Info + No results found. +`, + }, + "(verbose) 2 pools being checked": { + resp: &control.SystemCheckQueryResp{ + Status: control.SystemCheckStatusRunning, + ScanPhase: control.SystemCheckScanPhaseContainerList, + StartTime: checkTime, + Pools: map[string]*control.SystemCheckPoolInfo{ + "pool-1": { + UUID: "pool-1", + Status: chkpb.CheckPoolStatus_CPS_CHECKING.String(), + Phase: chkpb.CheckScanPhase_CSP_PREPARE.String(), + StartTime: checkTime, + }, + "pool-2": { + UUID: "pool-2", + Status: chkpb.CheckPoolStatus_CPS_CHECKING.String(), + Phase: chkpb.CheckScanPhase_CSP_PREPARE.String(), + StartTime: checkTime, + }, + }, + }, + verbose: true, + expOut: ` +DAOS System Checker Info + Current status: RUNNING (started at: 2023-03-20T10:07:00.000-05:00) + Current phase: CONT_LIST (Comparing container list on PS and storage nodes) + Checking 2 pools + +Per-Pool Checker Info: + Pool pool-1: 0 ranks, status: CPS_CHECKING, phase: CSP_PREPARE, started: 2023-03-20T10:07:00.000-05:00 + Pool pool-2: 0 ranks, status: CPS_CHECKING, phase: CSP_PREPARE, started: 2023-03-20T10:07:00.000-05:00 + +No reports to display. +`, + }, + "(verbose) 3 pools repaired; 1 unchecked; 1 removed": { + resp: &control.SystemCheckQueryResp{ + Status: control.SystemCheckStatusCompleted, + ScanPhase: control.SystemCheckScanPhaseDone, + Pools: map[string]*control.SystemCheckPoolInfo{ + "pool-1": { + UUID: "pool-1", + Status: chkpb.CheckPoolStatus_CPS_CHECKED.String(), + Phase: chkpb.CheckScanPhase_CSP_DONE.String(), + StartTime: checkTime, + }, + "pool-2": { + UUID: "pool-2", + Status: chkpb.CheckPoolStatus_CPS_CHECKED.String(), + Phase: chkpb.CheckScanPhase_CSP_DONE.String(), + StartTime: checkTime, + }, + "pool-3": { + UUID: "pool-3", + Status: chkpb.CheckPoolStatus_CPS_CHECKED.String(), + Phase: chkpb.CheckScanPhase_CSP_DONE.String(), + StartTime: checkTime, + }, + "pool-5": { + UUID: "pool-5", + Status: chkpb.CheckPoolStatus_CPS_UNCHECKED.String(), + Phase: chkpb.CheckScanPhase_CSP_PREPARE.String(), + }, + }, + Reports: []*control.SystemCheckReport{ + { + CheckReport: chkpb.CheckReport{ + Seq: 1, + Class: chkpb.CheckInconsistClass_CIC_POOL_BAD_SVCL, + Action: chkpb.CheckInconsistAction_CIA_IGNORE, + Msg: "message 1", + PoolUuid: "pool-1", + }, + }, + { + CheckReport: chkpb.CheckReport{ + Seq: 2, + Class: chkpb.CheckInconsistClass_CIC_POOL_BAD_LABEL, + Action: chkpb.CheckInconsistAction_CIA_TRUST_MS, + Msg: "message 2", + PoolUuid: "pool-2", + }, + }, + { + CheckReport: chkpb.CheckReport{ + Seq: 3, + Class: chkpb.CheckInconsistClass_CIC_POOL_LESS_SVC_WITHOUT_QUORUM, + Action: chkpb.CheckInconsistAction_CIA_TRUST_PS, + Msg: "message 3", + PoolUuid: "pool-3", + }, + }, + { + CheckReport: chkpb.CheckReport{ + Seq: 4, + Class: chkpb.CheckInconsistClass_CIC_POOL_NONEXIST_ON_ENGINE, + Action: chkpb.CheckInconsistAction_CIA_DISCARD, + Msg: "message 4", + PoolUuid: "pool-4", + }, + }, + }, + }, + verbose: true, + expOut: ` +DAOS System Checker Info + Current status: COMPLETED + Current phase: DSP_DONE (Check completed) + Checked 4 pools + +Per-Pool Checker Info: + Pool pool-1: 0 ranks, status: CPS_CHECKED, phase: DSP_DONE, started: 2023-03-20T10:07:00.000-05:00 + Pool pool-2: 0 ranks, status: CPS_CHECKED, phase: DSP_DONE, started: 2023-03-20T10:07:00.000-05:00 + Pool pool-3: 0 ranks, status: CPS_CHECKED, phase: DSP_DONE, started: 2023-03-20T10:07:00.000-05:00 + Pool pool-5: 0 ranks, status: CPS_UNCHECKED, phase: CSP_PREPARE + +Inconsistency Reports: + ID: 0x1 + Class: POOL_BAD_SVCL + Message: message 1 + Pool: pool-1 + Resolution: IGNORE + + ID: 0x2 + Class: POOL_BAD_LABEL + Message: message 2 + Pool: pool-2 + Resolution: TRUST_MS + + ID: 0x3 + Class: POOL_LESS_SVC_WITHOUT_QUORUM + Message: message 3 + Pool: pool-3 + Resolution: TRUST_PS + + ID: 0x4 + Class: POOL_NONEXIST_ON_ENGINE + Message: message 4 + Pool: pool-4 + Resolution: DISCARD + +`, + }, + } { + t.Run(name, func(t *testing.T) { + var buf bytes.Buffer + pretty.PrintCheckQueryResp(&buf, tc.resp, tc.verbose) + got := buf.String() + if diff := cmp.Diff(strings.TrimLeft(tc.expOut, "\n"), got); diff != "" { + t.Fatalf("unexpected output (-want, +got):\n%s", diff) + } + }) + } +} diff --git a/src/control/lib/control/check.go b/src/control/lib/control/check.go index c337c693ad0..c30b3c3b9c4 100644 --- a/src/control/lib/control/check.go +++ b/src/control/lib/control/check.go @@ -8,6 +8,7 @@ package control import ( "context" + "encoding/json" "fmt" "sort" "strings" @@ -17,6 +18,7 @@ import ( "google.golang.org/grpc" "google.golang.org/protobuf/proto" + "github.com/daos-stack/daos/src/control/common" pbutil "github.com/daos-stack/daos/src/control/common/proto" chkpb "github.com/daos-stack/daos/src/control/common/proto/chk" mgmtpb "github.com/daos-stack/daos/src/control/common/proto/mgmt" @@ -422,6 +424,12 @@ func (r *SystemCheckReport) IsInteractive() bool { return r.Action == chkpb.CheckInconsistAction_CIA_INTERACT } +func (r *SystemCheckReport) IsRemovedPool() bool { + return r.Action == chkpb.CheckInconsistAction_CIA_DISCARD && + (r.Class == chkpb.CheckInconsistClass_CIC_POOL_NONEXIST_ON_ENGINE || + r.Class == chkpb.CheckInconsistClass_CIC_POOL_NONEXIST_ON_MS) +} + func (r *SystemCheckReport) Resolution() string { msg := SystemCheckRepairAction(r.Action).String() if len(r.ActMsgs) == 1 { @@ -438,9 +446,26 @@ type SystemCheckPoolInfo struct { Label string `json:"label"` Status string `json:"status"` Phase string `json:"phase"` - StartTime time.Time `json:"start_time"` - Remaining time.Duration `json:"remaining"` - Elapsed time.Duration `json:"elapsed"` + StartTime time.Time `json:"-"` + Remaining time.Duration `json:"-"` + Elapsed time.Duration `json:"-"` +} + +func (p *SystemCheckPoolInfo) MarshalJSON() ([]byte, error) { + type toJSON SystemCheckPoolInfo + return json.Marshal(&struct { + *toJSON + RankCount int `json:"rank_count"` + StartTime string `json:"start_time"` + Remaining float64 `json:"remaining"` + Elapsed float64 `json:"elapsed"` + }{ + toJSON: (*toJSON)(p), + RankCount: len(p.RawRankInfo), + StartTime: common.FormatTime(p.StartTime), + Remaining: p.Remaining.Seconds(), + Elapsed: p.Elapsed.Seconds(), + }) } func (p *SystemCheckPoolInfo) String() string { @@ -450,8 +475,12 @@ func (p *SystemCheckPoolInfo) String() string { } else if p.Remaining > 0 { remOrElapsed = fmt.Sprintf(" remaining: %s", p.Remaining) } - return fmt.Sprintf("Pool %s: %d ranks, status: %s, phase: %s, started: %s%s", - p.UUID, len(p.RawRankInfo), p.Status, p.Phase, p.StartTime, remOrElapsed) + timeStr := "" + if !p.StartTime.IsZero() { + timeStr = fmt.Sprintf(", started: %s%s", common.FormatTime(p.StartTime), remOrElapsed) + } + return fmt.Sprintf("Pool %s: %d ranks, status: %s, phase: %s%s", + p.UUID, len(p.RawRankInfo), p.Status, p.Phase, timeStr) } func (p *SystemCheckPoolInfo) Unchecked() bool { @@ -507,6 +536,17 @@ type SystemCheckQueryResp struct { Reports []*SystemCheckReport `json:"reports"` } +func (r *SystemCheckQueryResp) MarshalJSON() ([]byte, error) { + type toJSON SystemCheckQueryResp + return json.Marshal(struct { + StartTime string `json:"start_time"` + *toJSON + }{ + StartTime: common.FormatTime(r.StartTime), + toJSON: (*toJSON)(r), + }) +} + // SystemCheckQuery queries the system checker status. func SystemCheckQuery(ctx context.Context, rpcClient UnaryInvoker, req *SystemCheckQueryReq) (*SystemCheckQueryResp, error) { if req == nil {