From 205b0e7cae0dc21dfe284d2f1beb8f09f410ee78 Mon Sep 17 00:00:00 2001 From: Dave May Date: Thu, 12 Nov 2020 11:25:28 -0500 Subject: [PATCH] nomad operator debug - add client node filtering arguments (#9331) * operator debug - add client node filtering arguments * add WaitForClient helper function * use RPC in WaitForClient to avoid unnecessary imports * guard against nil values * move initialization up and shorten test duration * cleanup nodeLookupFailCount logic * only display max node notice if we actually tried to capture nodes --- command/operator_debug.go | 173 +++++++++++++++++++++++++-------- command/operator_debug_test.go | 118 ++++++++++++++++++++-- testutil/wait.go | 23 +++++ 3 files changed, 265 insertions(+), 49 deletions(-) diff --git a/command/operator_debug.go b/command/operator_debug.go index 96f7f87033b..34966fb5ca9 100644 --- a/command/operator_debug.go +++ b/command/operator_debug.go @@ -33,6 +33,8 @@ type OperatorDebugCommand struct { interval time.Duration logLevel string stale bool + maxNodes int + nodeClass string nodeIDs []string serverIDs []string consul *external @@ -69,9 +71,15 @@ Debug Options: -log-level= The log level to monitor. Defaults to DEBUG. + -max-nodes= + Cap the maximum number of client nodes included in the capture. Defaults to 10, set to 0 for unlimited. + -node-id=, Comma separated list of Nomad client node ids, to monitor for logs and include pprof - profiles. Accepts id prefixes. + profiles. Accepts id prefixes, and "all" to select all nodes (up to count = max-nodes). + + -node-class= + Filter client nodes based on node class. 
-server-id=, Comma separated list of Nomad server names, "leader", or "all" to monitor for logs and include pprof @@ -150,6 +158,8 @@ func (c *OperatorDebugCommand) AutocompleteFlags() complete.Flags { "-duration": complete.PredictAnything, "-interval": complete.PredictAnything, "-log-level": complete.PredictAnything, + "-max-nodes": complete.PredictAnything, + "-node-class": complete.PredictAnything, "-node-id": complete.PredictAnything, "-server-id": complete.PredictAnything, "-output": complete.PredictAnything, @@ -174,6 +184,8 @@ func (c *OperatorDebugCommand) Run(args []string) int { flags.StringVar(&duration, "duration", "2m", "") flags.StringVar(&interval, "interval", "2m", "") flags.StringVar(&c.logLevel, "log-level", "DEBUG", "") + flags.IntVar(&c.maxNodes, "max-nodes", 10, "") + flags.StringVar(&c.nodeClass, "node-class", "", "") flags.StringVar(&nodeIDs, "node-id", "", "") flags.StringVar(&serverIDs, "server-id", "", "") flags.BoolVar(&c.stale, "stale", false, "") @@ -204,7 +216,7 @@ func (c *OperatorDebugCommand) Run(args []string) int { return 1 } - // Parse the time durations + // Parse the capture duration d, err := time.ParseDuration(duration) if err != nil { c.Ui.Error(fmt.Sprintf("Error parsing duration: %s: %s", duration, err.Error())) @@ -212,6 +224,7 @@ func (c *OperatorDebugCommand) Run(args []string) int { } c.duration = d + // Parse the capture interval i, err := time.ParseDuration(interval) if err != nil { c.Ui.Error(fmt.Sprintf("Error parsing interval: %s: %s", interval, err.Error())) @@ -219,6 +232,7 @@ func (c *OperatorDebugCommand) Run(args []string) int { } c.interval = i + // Verify there are no extra arguments args = flags.Args() if l := len(args); l != 0 { c.Ui.Error("This command takes no arguments") @@ -226,33 +240,109 @@ func (c *OperatorDebugCommand) Run(args []string) int { return 1 } + // Initialize capture variables and structs + c.manifest = make([]string, 0) + ctx, cancel := context.WithCancel(context.Background()) + c.ctx 
= ctx + c.cancel = cancel + c.trap() + + // Generate timestamped file name + format := "2006-01-02-150405Z" + c.timestamp = time.Now().UTC().Format(format) + stamped := "nomad-debug-" + c.timestamp + + // Create the output directory + var tmp string + if output != "" { + // User specified output directory + tmp = filepath.Join(output, stamped) + _, err := os.Stat(tmp) + if !os.IsNotExist(err) { + c.Ui.Error("Output directory already exists") + return 2 + } + } else { + // Generate temp directory + tmp, err = ioutil.TempDir(os.TempDir(), stamped) + if err != nil { + c.Ui.Error(fmt.Sprintf("Error creating tmp directory: %s", err.Error())) + return 2 + } + defer os.RemoveAll(tmp) + } + + c.collectDir = tmp + + // Create an instance of the API client client, err := c.Meta.Client() if err != nil { c.Ui.Error(fmt.Sprintf("Error initializing client: %s", err.Error())) return 1 } - // Resolve node prefixes + // Search all nodes If a node class is specified without a list of node id prefixes + if c.nodeClass != "" && nodeIDs == "" { + nodeIDs = "all" + } + + // Resolve client node id prefixes + nodesFound := 0 + nodeLookupFailCount := 0 + nodeCaptureCount := 0 + for _, id := range argNodes(nodeIDs) { - id = sanitizeUUIDPrefix(id) + if id == "all" { + // Capture from all nodes using empty prefix filter + id = "" + } else { + // Capture from nodes starting with prefix id + id = sanitizeUUIDPrefix(id) + } nodes, _, err := client.Nodes().PrefixList(id) if err != nil { c.Ui.Error(fmt.Sprintf("Error querying node info: %s", err)) return 1 } - // Return error if no nodes are found - if len(nodes) == 0 { + + // Increment fail count if no nodes are found + nodesFound = len(nodes) + if nodesFound == 0 { c.Ui.Error(fmt.Sprintf("No node(s) with prefix %q found", id)) - return 1 + nodeLookupFailCount++ + continue } + // Apply constraints to nodes found for _, n := range nodes { + // Ignore nodes that do not match specified class + if c.nodeClass != "" && n.NodeClass != c.nodeClass { + 
continue + } + + // Add node to capture list c.nodeIDs = append(c.nodeIDs, n.ID) + nodeCaptureCount++ + + // Stop looping when we reach the max + if c.maxNodes != 0 && nodeCaptureCount >= c.maxNodes { + break + } } } + // Return error if nodes were specified but none were found + if len(nodeIDs) > 0 && nodeCaptureCount == 0 { + c.Ui.Error(fmt.Sprintf("Failed to retrieve clients, 0 nodes found in list: %s", nodeIDs)) + return 1 + } + // Resolve servers members, err := client.Agent().Members() + if err != nil { + c.Ui.Error(fmt.Sprintf("Failed to retrieve server list; err: %v", err)) + return 1 + } c.writeJSON("version", "members.json", members, err) // We always write the error to the file, but don't range if no members found if serverIDs == "all" && members != nil { @@ -266,62 +356,58 @@ func (c *OperatorDebugCommand) Run(args []string) int { } } + serversFound := 0 + serverCaptureCount := 0 + + if members != nil { + serversFound = len(members.Members) + } + if c.serverIDs != nil { + serverCaptureCount = len(c.serverIDs) + } + // Return error if servers were specified but not found - if len(serverIDs) > 0 && len(c.serverIDs) == 0 { + if len(serverIDs) > 0 && serverCaptureCount == 0 { c.Ui.Error(fmt.Sprintf("Failed to retrieve servers, 0 members found in list: %s", serverIDs)) return 1 } - c.manifest = make([]string, 0) - ctx, cancel := context.WithCancel(context.Background()) - c.ctx = ctx - c.cancel = cancel - c.trap() - - format := "2006-01-02-150405Z" - c.timestamp = time.Now().UTC().Format(format) - stamped := "nomad-debug-" + c.timestamp - - c.Ui.Output("Starting debugger and capturing cluster data...") - c.Ui.Output(fmt.Sprintf("Capturing from servers: %v", c.serverIDs)) - c.Ui.Output(fmt.Sprintf("Capturing from client nodes: %v", c.nodeIDs)) - - c.Ui.Output(fmt.Sprintf(" Interval: '%s'", interval)) - c.Ui.Output(fmt.Sprintf(" Duration: '%s'", duration)) - - // Create the output path - var tmp string - if output != "" { - tmp = filepath.Join(output, stamped) 
- _, err := os.Stat(tmp) - if !os.IsNotExist(err) { - c.Ui.Error("Output directory already exists") - return 2 - } - } else { - tmp, err = ioutil.TempDir(os.TempDir(), stamped) - if err != nil { - c.Ui.Error(fmt.Sprintf("Error creating tmp directory: %s", err.Error())) - return 2 - } - defer os.RemoveAll(tmp) + // Display general info about the capture + c.Ui.Output("Starting debugger...") + c.Ui.Output("") + c.Ui.Output(fmt.Sprintf(" Servers: (%d/%d) %v", serverCaptureCount, serversFound, c.serverIDs)) + c.Ui.Output(fmt.Sprintf(" Clients: (%d/%d) %v", nodeCaptureCount, nodesFound, c.nodeIDs)) + if nodeCaptureCount > 0 && nodeCaptureCount == c.maxNodes { + c.Ui.Output(fmt.Sprintf(" Max node count reached (%d)", c.maxNodes)) } + if nodeLookupFailCount > 0 { + c.Ui.Output(fmt.Sprintf("Client fail count: %v", nodeLookupFailCount)) + } + if c.nodeClass != "" { + c.Ui.Output(fmt.Sprintf(" Node Class: %s", c.nodeClass)) + } + c.Ui.Output(fmt.Sprintf(" Interval: %s", interval)) + c.Ui.Output(fmt.Sprintf(" Duration: %s", duration)) + c.Ui.Output("") + c.Ui.Output("Capturing cluster data...") - c.collectDir = tmp - + // Start collecting data err = c.collect(client) if err != nil { c.Ui.Error(fmt.Sprintf("Error collecting data: %s", err.Error())) return 2 } + // Write index json/html manifest files c.writeManifest() + // Exit before archive if output directory was specified if output != "" { c.Ui.Output(fmt.Sprintf("Created debug directory: %s", c.collectDir)) return 0 } + // Create archive tarball archiveFile := stamped + ".tar.gz" err = TarCZF(archiveFile, tmp, stamped) if err != nil { @@ -329,6 +415,7 @@ func (c *OperatorDebugCommand) Run(args []string) int { return 2 } + // Final output with name of tarball c.Ui.Output(fmt.Sprintf("Created debug archive: %s", archiveFile)) return 0 } diff --git a/command/operator_debug_test.go b/command/operator_debug_test.go index f0806a6b892..3e335fa7250 100644 --- a/command/operator_debug_test.go +++ b/command/operator_debug_test.go 
@@ -6,6 +6,7 @@ import ( "testing" "time" + "github.com/hashicorp/nomad/command/agent" "github.com/hashicorp/nomad/testutil" "github.com/mitchellh/cli" "github.com/stretchr/testify/assert" @@ -34,10 +35,93 @@ func TestDebugUtils(t *testing.T) { require.Equal(t, "https://127.0.0.1:8500", e.addr("foo")) } +func TestDebug_NodeClass(t *testing.T) { + // Start test server and API client + srv, _, url := testServer(t, false, nil) + defer srv.Shutdown() + + // Wait for leadership to establish + testutil.WaitForLeader(t, srv.Agent.RPC) + + // Retrieve server RPC address to join clients + srvRPCAddr := srv.GetConfig().AdvertiseAddrs.RPC + t.Logf("[TEST] Leader started, srv.GetConfig().AdvertiseAddrs.RPC: %s", srvRPCAddr) + + // Setup Client 1 (nodeclass = clienta) + agentConfFunc1 := func(c *agent.Config) { + c.Region = "global" + c.EnableDebug = true + c.Server.Enabled = false + c.Client.NodeClass = "clienta" + c.Client.Enabled = true + c.Client.Servers = []string{srvRPCAddr} + } + + // Start Client 1 + client1 := agent.NewTestAgent(t, "client1", agentConfFunc1) + defer client1.Shutdown() + + // Wait for the client to connect + client1NodeID := client1.Agent.Client().NodeID() + testutil.WaitForClient(t, srv.Agent.RPC, client1NodeID) + t.Logf("[TEST] Client1 ready, id: %s", client1NodeID) + + // Setup Client 2 (nodeclass = clientb) + agentConfFunc2 := func(c *agent.Config) { + c.Region = "global" + c.EnableDebug = true + c.Server.Enabled = false + c.Client.NodeClass = "clientb" + c.Client.Enabled = true + c.Client.Servers = []string{srvRPCAddr} + } + + // Start Client 2 + client2 := agent.NewTestAgent(t, "client2", agentConfFunc2) + defer client2.Shutdown() + + // Wait for the client to connect + client2NodeID := client2.Agent.Client().NodeID() + testutil.WaitForClient(t, srv.Agent.RPC, client2NodeID) + t.Logf("[TEST] Client2 ready, id: %s", client2NodeID) + + // Setup Client 3 (nodeclass = clienta) + agentConfFunc3 := func(c *agent.Config) { 
+ c.Server.Enabled = false + c.EnableDebug = false + c.Client.NodeClass = "clienta" + c.Client.Servers = []string{srvRPCAddr} + } + + // Start Client 3 + client3 := agent.NewTestAgent(t, "client3", agentConfFunc3) + defer client3.Shutdown() + + // Wait for the client to connect + client3NodeID := client3.Agent.Client().NodeID() + testutil.WaitForClient(t, srv.Agent.RPC, client3NodeID) + t.Logf("[TEST] Client3 ready, id: %s", client3NodeID) + + // Setup mock UI + ui := cli.NewMockUi() + cmd := &OperatorDebugCommand{Meta: Meta{Ui: ui}} + + // Debug on client - node class = "clienta" + code := cmd.Run([]string{"-address", url, "-duration", "250ms", "-server-id", "all", "-node-id", "all", "-node-class", "clienta", "-max-nodes", "2"}) + + assert.Equal(t, 0, code) // take note of failed return code, but continue to allow buffer content checks + require.Empty(t, ui.ErrorWriter.String(), "errorwriter should be empty") + require.Contains(t, ui.OutputWriter.String(), "Starting debugger") + require.Contains(t, ui.OutputWriter.String(), "Node Class: clienta") + + ui.OutputWriter.Reset() + ui.ErrorWriter.Reset() +} + func TestDebugSuccesses(t *testing.T) { - t.Parallel() srv, _, url := testServer(t, false, nil) defer srv.Shutdown() + testutil.WaitForLeader(t, srv.Agent.RPC) ui := cli.NewMockUi() cmd := &OperatorDebugCommand{Meta: Meta{Ui: ui}} @@ -47,20 +131,24 @@ func TestDebugSuccesses(t *testing.T) { // Debug on the leader code := cmd.Run([]string{"-address", url, "-duration", "250ms", "-server-id", "leader"}) assert.Equal(t, 0, code) // take note of failed return code, but continue to see why + assert.Empty(t, ui.ErrorWriter.String(), "errorwriter should be empty") require.Contains(t, ui.OutputWriter.String(), "Starting debugger") ui.OutputWriter.Reset() + ui.ErrorWriter.Reset() // Debug on all servers code = cmd.Run([]string{"-address", url, "-duration", "250ms", "-server-id", "all"}) assert.Equal(t, 0, code) + require.Empty(t, ui.ErrorWriter.String(), "errorwriter should 
be empty") require.Contains(t, ui.OutputWriter.String(), "Starting debugger") ui.OutputWriter.Reset() + ui.ErrorWriter.Reset() } func TestDebugFails(t *testing.T) { - t.Parallel() srv, _, url := testServer(t, false, nil) defer srv.Shutdown() + testutil.WaitForLeader(t, srv.Agent.RPC) ui := cli.NewMockUi() cmd := &OperatorDebugCommand{Meta: Meta{Ui: ui}} @@ -68,22 +156,32 @@ func TestDebugFails(t *testing.T) { // Fails incorrect args code := cmd.Run([]string{"some", "bad", "args"}) require.Equal(t, 1, code) + ui.OutputWriter.Reset() + ui.ErrorWriter.Reset() // Fails illegal node ids code = cmd.Run([]string{"-node-id", "foo:bar"}) require.Equal(t, 1, code) + ui.OutputWriter.Reset() + ui.ErrorWriter.Reset() // Fails missing node ids - code = cmd.Run([]string{"-node-id", "abc,def"}) + code = cmd.Run([]string{"-node-id", "abc,def", "-duration", "250ms"}) require.Equal(t, 1, code) + ui.OutputWriter.Reset() + ui.ErrorWriter.Reset() // Fails bad durations code = cmd.Run([]string{"-duration", "foo"}) require.Equal(t, 1, code) + ui.OutputWriter.Reset() + ui.ErrorWriter.Reset() // Fails bad durations code = cmd.Run([]string{"-interval", "bar"}) require.Equal(t, 1, code) + ui.OutputWriter.Reset() + ui.ErrorWriter.Reset() // Fails existing output format := "2006-01-02-150405Z" @@ -91,19 +189,27 @@ func TestDebugFails(t *testing.T) { path := filepath.Join(os.TempDir(), stamped) os.MkdirAll(path, 0755) defer os.Remove(path) - code = cmd.Run([]string{"-output", os.TempDir()}) + // short duration to prevent timeout + code = cmd.Run([]string{"-output", os.TempDir(), "-duration", "50ms"}) require.Equal(t, 2, code) + ui.OutputWriter.Reset() + ui.ErrorWriter.Reset() // Fails bad address code = cmd.Run([]string{"-address", url + "bogus"}) - assert.Equal(t, 1, code) + assert.Equal(t, 1, code) // take note of failed return code, but continue to see why in the OutputWriter + require.NotContains(t, ui.OutputWriter.String(), "Starting debugger") + require.Contains(t, ui.ErrorWriter.String(), 
"invalid address") ui.OutputWriter.Reset() + ui.ErrorWriter.Reset() } func TestDebugCapturedFiles(t *testing.T) { - t.Parallel() + // NOTE: pprof tracing/profiling cannot be run in parallel + srv, _, url := testServer(t, false, nil) defer srv.Shutdown() + testutil.WaitForLeader(t, srv.Agent.RPC) ui := cli.NewMockUi() cmd := &OperatorDebugCommand{Meta: Meta{Ui: ui}} diff --git a/testutil/wait.go b/testutil/wait.go index ddc53139e68..2314e82c40a 100644 --- a/testutil/wait.go +++ b/testutil/wait.go @@ -95,6 +95,29 @@ func WaitForLeader(t testing.T, rpc rpcFn) { }) } +// WaitForClient blocks until the client can be found +func WaitForClient(t testing.T, rpc rpcFn, nodeID string) { + t.Helper() + WaitForResult(func() (bool, error) { + req := structs.NodeSpecificRequest{ + NodeID: nodeID, + QueryOptions: structs.QueryOptions{Region: "global"}, + } + var out structs.SingleNodeResponse + + err := rpc("Node.GetNode", &req, &out) + if err != nil { + return false, err + } + if out.Node == nil { + return false, fmt.Errorf("node not found") + } + return out.Node.Status == structs.NodeStatusReady, nil + }, func(err error) { + t.Fatalf("failed to find node: %v", err) + }) +} + // WaitForVotingMembers blocks until autopilot promotes all server peers // to be voting members. //