Skip to content

Commit

Permalink
cli: rename paths in debug bundle for clarity (#11307)
Browse files Browse the repository at this point in the history
* Rename folders to reflect purpose
* Improve captured files test coverage
* Rename CSI plugins output file
* Add changelog entry
* fix test and make changelog message more explicit

Co-authored-by: Luiz Aoqui <[email protected]>
  • Loading branch information
davemay99 and lgfa29 authored Oct 13, 2021
1 parent ff1b2f7 commit 1d30caa
Show file tree
Hide file tree
Showing 4 changed files with 124 additions and 60 deletions.
3 changes: 3 additions & 0 deletions .changelog/11307.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
```release-note:breaking-change
cli: Renamed folders in `nomad operator debug` bundle for clarity
```
41 changes: 22 additions & 19 deletions command/operator_debug.go
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,11 @@ type OperatorDebugCommand struct {
}

const (
userAgent = "nomad operator debug"
userAgent = "nomad operator debug"
clusterDir = "cluster"
clientDir = "client"
serverDir = "server"
intervalDir = "interval"
)

func (c *OperatorDebugCommand) Help() string {
Expand Down Expand Up @@ -458,7 +462,7 @@ func (c *OperatorDebugCommand) Run(args []string) int {
}

// Write complete list of server members to file
c.writeJSON("version", "members.json", members, err)
c.writeJSON(clusterDir, "members.json", members, err)

// Filter for servers matching criteria
c.serverIDs, err = filterServerMembers(members, serverIDs, c.region)
Expand Down Expand Up @@ -538,18 +542,17 @@ func (c *OperatorDebugCommand) Run(args []string) int {

// collect collects data from our endpoints and writes the archive bundle
func (c *OperatorDebugCommand) collect(client *api.Client) error {
// Version contains cluster meta information
dir := "version"
// Collect cluster data

self, err := client.Agent().Self()
c.writeJSON(dir, "agent-self.json", self, err)
c.writeJSON(clusterDir, "agent-self.json", self, err)

var qo *api.QueryOptions
namespaces, _, err := client.Namespaces().List(qo)
c.writeJSON(dir, "namespaces.json", namespaces, err)
c.writeJSON(clusterDir, "namespaces.json", namespaces, err)

regions, err := client.Regions().List()
c.writeJSON(dir, "regions.json", regions, err)
c.writeJSON(clusterDir, "regions.json", regions, err)

// Fetch data directly from consul and vault. Ignore errors
var consul, vault string
Expand Down Expand Up @@ -582,8 +585,8 @@ func (c *OperatorDebugCommand) collect(client *api.Client) error {
}
}

c.collectConsul(dir, consul)
c.collectVault(dir, vault)
c.collectConsul(clusterDir, consul)
c.collectVault(clusterDir, vault)
c.collectAgentHosts(client)
c.collectPprofs(client)

Expand Down Expand Up @@ -616,11 +619,11 @@ func (c *OperatorDebugCommand) mkdir(paths ...string) error {
// startMonitors starts go routines for each node and client
func (c *OperatorDebugCommand) startMonitors(client *api.Client) {
for _, id := range c.nodeIDs {
go c.startMonitor("client", "node_id", id, client)
go c.startMonitor(clientDir, "node_id", id, client)
}

for _, id := range c.serverIDs {
go c.startMonitor("server", "server_id", id, client)
go c.startMonitor(serverDir, "server_id", id, client)
}
}

Expand Down Expand Up @@ -664,19 +667,19 @@ func (c *OperatorDebugCommand) startMonitor(path, idKey, nodeID string, client *
// collectAgentHosts calls collectAgentHost for each selected node
func (c *OperatorDebugCommand) collectAgentHosts(client *api.Client) {
for _, n := range c.nodeIDs {
c.collectAgentHost("client", n, client)
c.collectAgentHost(clientDir, n, client)
}

for _, n := range c.serverIDs {
c.collectAgentHost("server", n, client)
c.collectAgentHost(serverDir, n, client)
}
}

// collectAgentHost gets the agent host data
func (c *OperatorDebugCommand) collectAgentHost(path, id string, client *api.Client) {
var host *api.HostDataResponse
var err error
if path == "server" {
if path == serverDir {
host, err = client.Agent().Host(id, "", nil)
} else {
host, err = client.Agent().Host("", id, nil)
Expand All @@ -699,19 +702,19 @@ func (c *OperatorDebugCommand) collectAgentHost(path, id string, client *api.Cli
// collectPprofs captures the /agent/pprof for each listed node
func (c *OperatorDebugCommand) collectPprofs(client *api.Client) {
for _, n := range c.nodeIDs {
c.collectPprof("client", n, client)
c.collectPprof(clientDir, n, client)
}

for _, n := range c.serverIDs {
c.collectPprof("server", n, client)
c.collectPprof(serverDir, n, client)
}
}

// collectPprof captures pprof data for the node
func (c *OperatorDebugCommand) collectPprof(path, id string, client *api.Client) {
pprofDurationSeconds := int(c.pprofDuration.Seconds())
opts := api.PprofOptions{Seconds: pprofDurationSeconds}
if path == "server" {
if path == serverDir {
opts.ServerID = id
} else {
opts.NodeID = id
Expand Down Expand Up @@ -810,7 +813,7 @@ func (c *OperatorDebugCommand) collectPeriodic(client *api.Client) {

case <-interval:
name = fmt.Sprintf("%04d", intervalCount)
dir = filepath.Join("nomad", name)
dir = filepath.Join(intervalDir, name)
c.Ui.Output(fmt.Sprintf(" Capture interval %s", name))
c.collectNomad(dir, client)
c.collectOperator(dir, client)
Expand Down Expand Up @@ -859,7 +862,7 @@ func (c *OperatorDebugCommand) collectNomad(dir string, client *api.Client) erro

// CSI Plugins - /v1/plugins?type=csi
ps, _, err := client.CSIPlugins().List(qo)
c.writeJSON(dir, "plugins.json", ps, err)
c.writeJSON(dir, "csi-plugins.json", ps, err)

// CSI Plugin details - /v1/plugin/csi/:plugin_id
for _, p := range ps {
Expand Down
130 changes: 94 additions & 36 deletions command/operator_debug_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -346,68 +346,126 @@ func TestDebug_Bad_CSIPlugin_Names(t *testing.T) {
var pluginFiles []string
for _, pluginName := range cases {
pluginFile := fmt.Sprintf("csi-plugin-id-%s.json", helper.CleanFilename(pluginName, "_"))
pluginFile = filepath.Join(path, "nomad", "0000", pluginFile)
pluginFile = filepath.Join(path, intervalDir, "0000", pluginFile)
pluginFiles = append(pluginFiles, pluginFile)
}

testutil.WaitForFiles(t, pluginFiles)
}

// buildPathSlice joins each entry of files onto path and returns the
// resulting paths in the same order as files. The returned slice is always
// non-nil, even when files is empty.
func buildPathSlice(path string, files []string) []string {
	// The final length is known up front, so allocate once instead of
	// growing the slice on every append.
	paths := make([]string, 0, len(files))
	for _, file := range files {
		paths = append(paths, filepath.Join(path, file))
	}
	return paths
}

func TestDebug_CapturedFiles(t *testing.T) {
srv, _, url := testServer(t, false, nil)
srv, _, url := testServer(t, true, nil)
testutil.WaitForLeader(t, srv.Agent.RPC)

serverNodeName := srv.Config.NodeName
region := srv.Config.Region
serverName := fmt.Sprintf("%s.%s", serverNodeName, region)
clientID := srv.Agent.Client().NodeID()

t.Logf("serverName: %s, clientID, %s", serverName, clientID)

// Setup file slices
clusterFiles := []string{
"agent-self.json",
"consul-agent-members.json",
"consul-agent-self.json",
"members.json",
"namespaces.json",
"regions.json",
"vault-sys-health.json",
}

pprofFiles := []string{
"allocs.prof",
"goroutine-debug1.txt",
"goroutine-debug2.txt",
"goroutine.prof",
"heap.prof",
"profile.prof",
"threadcreate.prof",
"trace.prof",
}

clientFiles := []string{
"agent-host.json",
"monitor.log",
}
clientFiles = append(clientFiles, pprofFiles...)

serverFiles := []string{
"agent-host.json",
"monitor.log",
}
serverFiles = append(serverFiles, pprofFiles...)

intervalFiles := []string{
"allocations.json",
"csi-plugins.json",
"csi-volumes.json",
"deployments.json",
"evaluations.json",
"jobs.json",
"license.json",
"metrics.json",
"nodes.json",
"operator-autopilot-health.json",
"operator-raft.json",
"operator-scheduler.json",
}

ui := cli.NewMockUi()
cmd := &OperatorDebugCommand{Meta: Meta{Ui: ui}}

code := cmd.Run([]string{
"-address", url,
"-output", os.TempDir(),
"-server-id", "leader",
"-server-id", serverName,
"-node-id", clientID,
"-duration", "1300ms",
"-interval", "600ms",
})

// Get capture directory
path := cmd.collectDir
defer os.Remove(path)

// There should be no errors
require.Empty(t, ui.ErrorWriter.String())
require.Equal(t, 0, code)
ui.ErrorWriter.Reset()

serverFiles := []string{
// Version is always captured
filepath.Join(path, "version", "agent-self.json"),

// Consul and Vault contain results or errors
filepath.Join(path, "version", "consul-agent-self.json"),
filepath.Join(path, "version", "vault-sys-health.json"),

// Monitor files are only created when selected
filepath.Join(path, "server", "leader", "monitor.log"),

// Pprof profiles
filepath.Join(path, "server", "leader", "profile.prof"),
filepath.Join(path, "server", "leader", "trace.prof"),
filepath.Join(path, "server", "leader", "goroutine.prof"),
filepath.Join(path, "server", "leader", "goroutine-debug1.txt"),
filepath.Join(path, "server", "leader", "goroutine-debug2.txt"),
filepath.Join(path, "server", "leader", "heap.prof"),
filepath.Join(path, "server", "leader", "allocs.prof"),
filepath.Join(path, "server", "leader", "threadcreate.prof"),

// Multiple snapshots are collected, 00 is always created
filepath.Join(path, "nomad", "0000", "jobs.json"),
filepath.Join(path, "nomad", "0000", "nodes.json"),
filepath.Join(path, "nomad", "0000", "metrics.json"),

// Multiple snapshots are collected, 01 requires two intervals
filepath.Join(path, "nomad", "0001", "jobs.json"),
filepath.Join(path, "nomad", "0001", "nodes.json"),
filepath.Join(path, "nomad", "0001", "metrics.json"),
}

testutil.WaitForFilesUntil(t, serverFiles, 2*time.Minute)
// Verify cluster files
clusterPaths := buildPathSlice(cmd.path(clusterDir), clusterFiles)
t.Logf("Waiting for cluster files in path: %s", clusterDir)
testutil.WaitForFilesUntil(t, clusterPaths, 2*time.Minute)

// Verify client files
clientPaths := buildPathSlice(cmd.path(clientDir, clientID), clientFiles)
t.Logf("Waiting for client files in path: %s", clientDir)
testutil.WaitForFilesUntil(t, clientPaths, 2*time.Minute)

// Verify server files
serverPaths := buildPathSlice(cmd.path(serverDir, serverName), serverFiles)
t.Logf("Waiting for server files in path: %s", serverDir)
testutil.WaitForFilesUntil(t, serverPaths, 2*time.Minute)

// Verify interval 0000 files
intervalPaths0 := buildPathSlice(cmd.path(intervalDir, "0000"), intervalFiles)
t.Logf("Waiting for interval 0000 files in path: %s", intervalDir)
testutil.WaitForFilesUntil(t, intervalPaths0, 2*time.Minute)

// Verify interval 0001 files
intervalPaths1 := buildPathSlice(cmd.path(intervalDir, "0001"), intervalFiles)
t.Logf("Waiting for interval 0001 files in path: %s", intervalDir)
testutil.WaitForFilesUntil(t, intervalPaths1, 2*time.Minute)
}

func TestDebug_ExistingOutput(t *testing.T) {
Expand Down
10 changes: 5 additions & 5 deletions testutil/wait.go
Original file line number Diff line number Diff line change
Expand Up @@ -241,7 +241,7 @@ func WaitForRunning(t testing.TB, rpc rpcFn, job *structs.Job) []*structs.AllocL
// WaitForFiles blocks until all the files in the slice are present
func WaitForFiles(t testing.TB, files []string) {
WaitForResult(func() (bool, error) {
return FilesExist(files), nil
return FilesExist(files)
}, func(err error) {
t.Fatalf("missing expected files: %v", err)
})
Expand All @@ -250,18 +250,18 @@ func WaitForFiles(t testing.TB, files []string) {
// WaitForFilesUntil blocks until duration or all the files in the slice are present
func WaitForFilesUntil(t testing.TB, files []string, until time.Duration) {
WaitForResultUntil(until, func() (bool, error) {
return FilesExist(files), nil
return FilesExist(files)
}, func(err error) {
t.Fatalf("missing expected files: %v", err)
})
}

// FilesExist verifies all files in the slice are present
func FilesExist(files []string) bool {
func FilesExist(files []string) (bool, error) {
for _, f := range files {
if _, err := os.Stat(f); os.IsNotExist(err) {
return false
return false, fmt.Errorf("expected file not found: %v", f)
}
}
return true
return true, nil
}

0 comments on commit 1d30caa

Please sign in to comment.