diff --git a/command/commands.go b/command/commands.go
index 9a24e4f192e..12a18abe86c 100644
--- a/command/commands.go
+++ b/command/commands.go
@@ -201,6 +201,11 @@ func Commands(metaPtr *Meta, agentUi cli.Ui) map[string]cli.CommandFactory {
 				Meta: meta,
 			}, nil
 		},
+		"debug": func() (cli.Command, error) {
+			return &DebugCommand{
+				Meta: meta,
+			}, nil
+		},
 		"deployment": func() (cli.Command, error) {
 			return &DeploymentCommand{
 				Meta: meta,
diff --git a/command/debug.go b/command/debug.go
new file mode 100644
index 00000000000..46ae6cd9da3
--- /dev/null
+++ b/command/debug.go
@@ -0,0 +1,698 @@
+package command
+
+import (
+	"archive/tar"
+	"compress/gzip"
+	"context"
+	"encoding/json"
+	"fmt"
+	"html/template"
+	"io"
+	"io/ioutil"
+	"net/http"
+	"os"
+	"os/signal"
+	"path/filepath"
+	"strings"
+	"syscall"
+	"time"
+
+	"github.com/hashicorp/nomad/api"
+	"github.com/posener/complete"
+)
+
+type DebugCommand struct {
+	Meta
+
+	timestamp   string
+	collectDir  string
+	duration    time.Duration
+	interval    time.Duration
+	logLevel    string
+	nodeIDs     []string
+	serverIDs   []string
+	consulToken string
+	vaultToken  string
+	manifest    []string
+	ctx         context.Context
+	cancel      context.CancelFunc
+}
+
+const (
+	userAgent = "nomad debug"
+)
+
+func (c *DebugCommand) Help() string {
+	helpText := `
+Usage: nomad debug [options]
+
+  Build an archive containing Nomad cluster configuration and state, and
+  Consul and Vault status. Includes logs and pprof profiles for the selected
+  servers and client nodes.
+
+General Options:
+
+  ` + generalOptionsUsage() + `
+
+Debug Options:
+
+  -duration=2m
+    The duration of the log monitor command. Defaults to 2m.
+
+  -interval=2m
+    The interval between snapshots of the Nomad state. Defaults to 2m; when
+    the interval equals the duration, only one snapshot is captured.
+
+  -log-level=DEBUG
+    The log level to monitor. Defaults to DEBUG.
+
+  -node-id=n1,n2
+    Comma-separated list of Nomad client node IDs to monitor for logs and
+    include in pprof profiles. Accepts ID prefixes.
+
+  -server-id=s1,s2
+    Comma-separated list of Nomad server names, or "leader", to monitor for
+    logs and include in pprof profiles.
+
+  -output=path
+    Path to the parent directory of the output directory. If not specified,
+    an archive is built in the current directory.
+
+  -consul-token
+    Token used to query Consul. Defaults to the CONSUL_TOKEN environment
+    variable.
+
+  -vault-token
+    Token used to query Vault. Defaults to the VAULT_TOKEN environment
+    variable.
+`
+	return strings.TrimSpace(helpText)
+}
+
+func (c *DebugCommand) Synopsis() string {
+	return "Build a debug archive"
+}
+
+func (c *DebugCommand) AutocompleteFlags() complete.Flags {
+	return mergeAutocompleteFlags(c.Meta.AutocompleteFlags(FlagSetClient),
+		complete.Flags{
+			"-duration":     complete.PredictAnything,
+			"-interval":     complete.PredictAnything,
+			"-log-level":    complete.PredictAnything,
+			"-node-id":      complete.PredictAnything,
+			"-server-id":    complete.PredictAnything,
+			"-output":       complete.PredictAnything,
+			"-consul-token": complete.PredictAnything,
+			"-vault-token":  complete.PredictAnything,
+		})
+}
+
+func (c *DebugCommand) AutocompleteArgs() complete.Predictor {
+	return complete.PredictNothing
+}
+
+func (c *DebugCommand) Name() string { return "debug" }
+
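+// Run parses the debug flags and drives collection. A typical invocation,
+// using a hypothetical client node ID prefix, looks like:
+//
+//	nomad debug -duration=5m -interval=1m -server-id=leader -node-id=4fcb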
+func (c *DebugCommand) Run(args []string) int {
+	flags := c.Meta.FlagSet(c.Name(), FlagSetClient)
+	flags.Usage = func() { c.Ui.Output(c.Help()) }
+
+	var duration, interval, output string
+	var nodeIDs, serverIDs string
+
+	flags.StringVar(&duration, "duration", "2m", "")
+	flags.StringVar(&interval, "interval", "2m", "")
+	flags.StringVar(&c.logLevel, "log-level", "DEBUG", "")
+	flags.StringVar(&nodeIDs, "node-id", "", "")
+	flags.StringVar(&serverIDs, "server-id", "", "")
+	flags.StringVar(&output, "output", "", "")
+	flags.StringVar(&c.consulToken, "consul-token", "", "")
+	flags.StringVar(&c.vaultToken, "vault-token", "", "")
+
+	if err := flags.Parse(args); err != nil {
+		return 1
+	}
+
+	// Parse the time durations
+	d, err := time.ParseDuration(duration)
+	if err != nil {
+		c.Ui.Error(fmt.Sprintf("Error parsing duration: %s: %s", duration, err.Error()))
+		return 1
+	}
+	c.duration = d
+
+	i, err := time.ParseDuration(interval)
+	if err != nil {
+		c.Ui.Error(fmt.Sprintf("Error parsing interval: %s: %s", interval, err.Error()))
+		return 1
+	}
+	c.interval = i
+
+	args = flags.Args()
+	if len(args) != 0 {
+		c.Ui.Error("This command takes no arguments")
+		c.Ui.Error(commandErrorText(c))
+		return 1
+	}
+
+	client, err := c.Meta.Client()
+	if err != nil {
+		c.Ui.Error(fmt.Sprintf("Error initializing client: %s", err.Error()))
+		return 1
+	}
+
+	// Resolve node prefixes
+	for _, id := range argNodes(nodeIDs) {
+		id = sanitizeUUIDPrefix(id)
+		nodes, _, err := client.Nodes().PrefixList(id)
+		if err != nil {
+			c.Ui.Error(fmt.Sprintf("Error querying node info: %s", err))
+			return 1
+		}
+		// Return an error if no nodes are found
+		if len(nodes) == 0 {
+			c.Ui.Error(fmt.Sprintf("No node(s) with prefix %q found", id))
+			return 1
+		}
+
+		for _, n := range nodes {
+			c.nodeIDs = append(c.nodeIDs, n.ID)
+		}
+	}
+
+	// Server names and "leader" are passed through as-is; no resolution
+	c.serverIDs = append(c.serverIDs, argNodes(serverIDs)...)
+
+	c.manifest = make([]string, 0)
+	ctx, cancel := context.WithCancel(context.Background())
+	c.ctx = ctx
+	c.cancel = cancel
+
+	// Set up the output path
+	format := "2006-01-02-150405Z"
+	c.timestamp = time.Now().UTC().Format(format)
+	stamped := "nomad-debug-" + c.timestamp
+
+	var tmp string
+	if output != "" {
+		tmp = filepath.Join(output, stamped)
+		_, err := os.Stat(tmp)
+		if !os.IsNotExist(err) {
+			c.Ui.Error("Output directory already exists")
+			return 2
+		}
+	} else {
+		tmp, err = ioutil.TempDir(os.TempDir(), stamped)
+		if err != nil {
+			c.Ui.Error(fmt.Sprintf("Error creating tmp directory: %s", err.Error()))
+			return 2
+		}
+		defer os.RemoveAll(tmp)
+	}
+
+	c.collectDir = tmp
+
+	// Capture signals so we can shut down the monitor API calls on interrupt
+	c.trap()
+
+	err = c.collect(client)
+	if err != nil {
+		c.Ui.Error(fmt.Sprintf("Error collecting data: %s", err.Error()))
+		return 2
+	}
+
+	c.writeManifest()
+
+	if output != "" {
+		c.Ui.Output(fmt.Sprintf("Created debug directory: %s", c.collectDir))
+		return 0
+	}
+
+	archiveFile := stamped + ".tar.gz"
+	err = TarCZF(archiveFile, tmp)
+	if err != nil {
+		c.Ui.Error(fmt.Sprintf("Error creating archive: %s", err.Error()))
+		return 2
+	}
+
+	c.Ui.Output(fmt.Sprintf("Created debug archive: %s", archiveFile))
+	return 0
+}
+
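+// The collected files are written beneath the timestamped directory, roughly:
+//
+//	version/agent-self.json   agent and cluster metadata
+//	nomad/<nn>/*.json         cluster state snapshot for interval <nn>
+//	server/<id>/monitor.log   per-server logs and pprof profiles
+//	client/<id>/monitor.log   per-client logs and pprof profiles
+//	index.json, index.html    manifest of everything collected
+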
+// collect collects data from our endpoints and writes the archive bundle
+func (c *DebugCommand) collect(client *api.Client) error {
+	// Version contains cluster meta information
+	dir := "version"
+	err := c.mkdir(dir)
+	if err != nil {
+		return err
+	}
+
+	self, err := client.Agent().Self()
+	if err != nil {
+		return fmt.Errorf("agent self: %s", err.Error())
+	}
+	c.writeJSON(dir, "agent-self.json", self)
+
+	// Fetch data directly from consul and vault. Ignore errors
+	var consul, vault string
+
+	m, ok := self.Config["Consul"].(map[string]interface{})
+	if ok {
+		raw := m["Addr"]
+		consul, _ = raw.(string)
+		raw = m["EnableSSL"]
+		ssl, _ := raw.(bool)
+		if ssl {
+			consul = "https://" + consul
+		} else {
+			consul = "http://" + consul
+		}
+	}
+
+	m, ok = self.Config["Vault"].(map[string]interface{})
+	if ok {
+		raw := m["Addr"]
+		vault, _ = raw.(string)
+	}
+
+	c.collectConsul(dir, consul)
+	c.collectVault(dir, vault)
+
+	c.startMonitors(client)
+	c.collectPeriodic(client)
+	c.collectPprofs(client)
+
+	return nil
+}
+
+// path returns platform specific paths in the tmp root directory
+func (c *DebugCommand) path(paths ...string) string {
+	ps := []string{c.collectDir}
+	ps = append(ps, paths...)
+	return filepath.Join(ps...)
+}
+
+// mkdir creates directories in the tmp root directory
+func (c *DebugCommand) mkdir(paths ...string) error {
+	return os.MkdirAll(c.path(paths...), 0755)
+}
+
+// startMonitors starts goroutines for each client node and server
+func (c *DebugCommand) startMonitors(client *api.Client) {
+	for _, id := range c.nodeIDs {
+		go c.startMonitor("client", "node_id", id, client)
+	}
+
+	for _, id := range c.serverIDs {
+		go c.startMonitor("server", "server_id", id, client)
+	}
+}
+
+// startMonitor starts one monitor API request, writing to a file. It blocks
+// and should be called in a goroutine. Errors are ignored because we want to
+// build the archive even if a node is unavailable.
+func (c *DebugCommand) startMonitor(path, idKey, nodeID string, client *api.Client) {
+	c.mkdir(path, nodeID)
+	fh, err := os.Create(c.path(path, nodeID, "monitor.log"))
+	if err != nil {
+		return
+	}
+	defer fh.Close()
+
+	qo := api.QueryOptions{
+		Params: map[string]string{
+			idKey:       nodeID,
+			"log_level": c.logLevel,
+		},
+	}
+
+	outCh, errCh := client.Agent().Monitor(c.ctx.Done(), &qo)
+	for {
+		select {
+		case out := <-outCh:
+			if out == nil {
+				continue
+			}
+			fh.Write(out.Data)
+			fh.WriteString("\n")
+
+		case err := <-errCh:
+			fh.WriteString(fmt.Sprintf("monitor: %s\n", err.Error()))
+			return
+
+		case <-c.ctx.Done():
+			return
+		}
+	}
+}
+
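+// Like the log monitors, the pprof collectors ignore per-node errors so a
+// partial archive can still be built when an agent is unreachable.
+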
+// collectPprofs captures the /agent/pprof data for each listed node
+func (c *DebugCommand) collectPprofs(client *api.Client) {
+	for _, n := range c.nodeIDs {
+		c.collectPprof("client", n, client)
+	}
+
+	for _, n := range c.serverIDs {
+		c.collectPprof("server", n, client)
+	}
+}
+
+// collectPprof captures pprof data for the node
+func (c *DebugCommand) collectPprof(path, id string, client *api.Client) {
+	opts := api.PprofOptions{Seconds: 1}
+	if path == "server" {
+		opts.ServerID = id
+	} else {
+		opts.NodeID = id
+	}
+
+	path = filepath.Join(path, id)
+
+	bs, err := client.Agent().CPUProfile(opts, nil)
+	if err == nil {
+		c.writeBytes(path, "profile.prof", bs)
+	}
+
+	bs, err = client.Agent().Trace(opts, nil)
+	if err == nil {
+		c.writeBytes(path, "trace.prof", bs)
+	}
+
+	bs, err = client.Agent().Lookup("goroutine", opts, nil)
+	if err == nil {
+		c.writeBytes(path, "goroutine.prof", bs)
+	}
+}
+
+// collectPeriodic runs for the configured duration, capturing the cluster
+// state every interval. When the duration elapses it cancels the context,
+// which also stops the monitor requests.
+func (c *DebugCommand) collectPeriodic(client *api.Client) {
+	// Not monitoring any logs, just capture the nomad context before exit
+	if len(c.nodeIDs) == 0 && len(c.serverIDs) == 0 {
+		dir := filepath.Join("nomad", "00")
+		c.collectNomad(dir, client)
+		return
+	}
+
+	duration := time.After(c.duration)
+	// Set the first interval to 0 so that we execute immediately, then wait
+	// the full interval on subsequent iterations
+	interval := time.After(0 * time.Second)
+	var intervalCount int
+	var dir string
+
+	for {
+		select {
+		case <-duration:
+			c.cancel()
+			return
+
+		case <-interval:
+			dir = filepath.Join("nomad", fmt.Sprintf("%02d", intervalCount))
+			c.collectNomad(dir, client)
+			interval = time.After(c.interval)
+			intervalCount++
+
+		case <-c.ctx.Done():
+			return
+		}
+	}
+}
+
+// collectNomad captures the nomad cluster state
+func (c *DebugCommand) collectNomad(dir string, client *api.Client) error {
+	err := c.mkdir(dir)
+	if err != nil {
+		return err
+	}
+
+	var qo *api.QueryOptions
+
+	js, _, err := client.Jobs().List(qo)
+	if err != nil {
+		return fmt.Errorf("listing jobs: %s", err.Error())
+	}
+	c.writeJSON(dir, "jobs.json", js)
+
+	ds, _, err := client.Deployments().List(qo)
+	if err != nil {
+		return fmt.Errorf("listing deployments: %s", err.Error())
+	}
+	c.writeJSON(dir, "deployments.json", ds)
+
+	es, _, err := client.Evaluations().List(qo)
+	if err != nil {
+		return fmt.Errorf("listing evaluations: %s", err.Error())
+	}
+	c.writeJSON(dir, "evaluations.json", es)
+
+	as, _, err := client.Allocations().List(qo)
+	if err != nil {
+		return fmt.Errorf("listing allocations: %s", err.Error())
+	}
+	c.writeJSON(dir, "allocations.json", as)
+
+	ns, _, err := client.Nodes().List(qo)
+	if err != nil {
+		return fmt.Errorf("listing nodes: %s", err.Error())
+	}
+	c.writeJSON(dir, "nodes.json", ns)
+
+	ps, _, err := client.CSIPlugins().List(qo)
+	if err != nil {
+		return fmt.Errorf("listing plugins: %s", err.Error())
+	}
+	c.writeJSON(dir, "plugins.json", ps)
+
+	vs, _, err := client.CSIVolumes().List(qo)
+	if err != nil {
+		return fmt.Errorf("listing volumes: %s", err.Error())
+	}
+	c.writeJSON(dir, "volumes.json", vs)
+
+	return nil
+}
+
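+// Consul and Vault requests authenticate with the -consul-token and
+// -vault-token flags, falling back to the CONSUL_TOKEN and VAULT_TOKEN
+// environment variables when the flags are unset.
+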
+// collectConsul calls the Consul API directly to collect data
+func (c *DebugCommand) collectConsul(dir, consul string) error {
+	if consul == "" {
+		return nil
+	}
+
+	token := c.consulToken
+	if token == "" {
+		token = os.Getenv("CONSUL_TOKEN")
+	}
+
+	client := http.Client{
+		Timeout: 2 * time.Second,
+	}
+
+	req, _ := http.NewRequest("GET", consul+"/v1/agent/self", nil)
+	req.Header.Add("X-Consul-Token", token)
+	req.Header.Add("User-Agent", userAgent)
+	resp, err := client.Do(req)
+	c.writeBody(dir, "consul-agent-self.json", resp, err)
+
+	req, _ = http.NewRequest("GET", consul+"/v1/agent/members", nil)
+	req.Header.Add("X-Consul-Token", token)
+	req.Header.Add("User-Agent", userAgent)
+	resp, err = client.Do(req)
+	c.writeBody(dir, "consul-agent-members.json", resp, err)
+
+	return nil
+}
+
+// collectVault calls the Vault API directly to collect data
+func (c *DebugCommand) collectVault(dir, vault string) error {
+	if vault == "" {
+		return nil
+	}
+
+	token := c.vaultToken
+	if token == "" {
+		token = os.Getenv("VAULT_TOKEN")
+	}
+
+	client := http.Client{
+		Timeout: 2 * time.Second,
+	}
+
+	req, _ := http.NewRequest("GET", vault+"/v1/sys/health", nil)
+	req.Header.Add("X-Vault-Token", token)
+	req.Header.Add("User-Agent", userAgent)
+	resp, err := client.Do(req)
+	c.writeBody(dir, "vault-sys-health.json", resp, err)
+
+	return nil
+}
+
+// writeBytes writes a file to the archive, recording it in the manifest
+func (c *DebugCommand) writeBytes(dir, file string, data []byte) error {
+	path := filepath.Join(dir, file)
+	c.manifest = append(c.manifest, path)
+	path = filepath.Join(c.collectDir, path)
+
+	fh, err := os.Create(path)
+	if err != nil {
+		return err
+	}
+	defer fh.Close()
+
+	_, err = fh.Write(data)
+	return err
+}
+
+// writeJSON writes JSON responses from the Nomad API calls to the archive
+func (c *DebugCommand) writeJSON(dir, file string, data interface{}) error {
+	bytes, err := json.Marshal(data)
+	if err != nil {
+		return err
+	}
+	return c.writeBytes(dir, file, bytes)
+}
+
+// writeBody is a helper that writes the body of an http.Response to the archive
+func (c *DebugCommand) writeBody(dir, file string, resp *http.Response, err error) {
+	if err != nil {
+		return
+	}
+	defer resp.Body.Close()
+
+	if resp.ContentLength == 0 {
+		return
+	}
+
+	body, err := ioutil.ReadAll(resp.Body)
+	if err != nil {
+		return
+	}
+
+	c.writeBytes(dir, file, body)
+}
+
+// writeManifest creates the index files
+func (c *DebugCommand) writeManifest() error {
+	// Write the JSON
+	path := filepath.Join(c.collectDir, "index.json")
+	jsonFh, err := os.Create(path)
+	if err != nil {
+		return err
+	}
+	defer jsonFh.Close()
+
+	json.NewEncoder(jsonFh).Encode(c.manifest)
+
+	// Write the HTML
+	path = filepath.Join(c.collectDir, "index.html")
+	htmlFh, err := os.Create(path)
+	if err != nil {
+		return err
+	}
+	defer htmlFh.Close()
+
+	head, _ := template.New("head").Parse("<html><head><title>{{.}}</title></head>\n<body><h1>{{.}}</h1>\n<ul>\n")
+	line, _ := template.New("line").Parse("<li><a href=\"{{.}}\">{{.}}</a></li>\n")
+	tail := "</ul></body></html>\n"
+
+	head.Execute(htmlFh, c.timestamp)
+	for _, f := range c.manifest {
+		line.Execute(htmlFh, f)
+	}
+	htmlFh.WriteString(tail)
+
+	return nil
+}
+
+// trap captures signals and cancels the command's context
+func (c *DebugCommand) trap() {
+	sigCh := make(chan os.Signal, 1)
+	signal.Notify(sigCh,
+		syscall.SIGHUP,
+		syscall.SIGINT,
+		syscall.SIGTERM,
+		syscall.SIGQUIT)
+
+	go func() {
+		<-sigCh
+		c.cancel()
+	}()
+}
+
+// TarCZF, like the tar command, recursively builds a gzip compressed tar
+// archive from a directory
+func TarCZF(archive string, src string) error {
+	// ensure the src actually exists before trying to tar it
+	if _, err := os.Stat(src); err != nil {
+		return fmt.Errorf("unable to tar files: %v", err)
+	}
+
+	// create the archive
+	fh, err := os.Create(archive)
+	if err != nil {
+		return err
+	}
+	defer fh.Close()
+
+	zz := gzip.NewWriter(fh)
+	defer zz.Close()
+
+	tw := tar.NewWriter(zz)
+	defer tw.Close()
+
+	// tar
+	return filepath.Walk(src, func(file string, fi os.FileInfo, err error) error {
+		// return on any error
+		if err != nil {
+			return err
+		}
+
+		if !fi.Mode().IsRegular() {
+			return nil
+		}
+
+		header, err := tar.FileInfoHeader(fi, fi.Name())
+		if err != nil {
+			return err
+		}
+
+		// remove leading path to the src, so files are relative to the archive
+		header.Name = strings.TrimPrefix(strings.Replace(file, src, "", -1), string(filepath.Separator))
+
+		if err := tw.WriteHeader(header); err != nil {
+			return err
+		}
+
+		// copy the file contents, closing the file on the error path as well
+		f, err := os.Open(file)
+		if err != nil {
+			return err
+		}
+
+		if _, err := io.Copy(tw, f); err != nil {
+			f.Close()
+			return err
+		}
+		f.Close()
+
+		return nil
+	})
+}
+
+// argNodes splits node ids from the command line by ","
+func argNodes(input string) []string {
+	ns := strings.Split(input, ",")
+	var out []string
+	for _, n := range ns {
+		s := strings.TrimSpace(n)
+		if s == "" {
+			continue
+		}
+		out = append(out, s)
+	}
+	return out
+}
diff --git a/command/debug_test.go b/command/debug_test.go
new file mode 100644
index 00000000000..d88f375e1ec
--- /dev/null
+++ b/command/debug_test.go
@@ -0,0 +1,108 @@
+package command
+
+import (
+	"os"
+	"path/filepath"
+	"testing"
+	"time"
+
+	"github.com/mitchellh/cli"
+	"github.com/stretchr/testify/require"
+)
+
+func TestDebugUtils(t *testing.T) {
+	xs := argNodes("foo, bar")
+	require.Equal(t, []string{"foo", "bar"}, xs)
+
+	xs = argNodes("")
+	require.Len(t, xs, 0)
+	require.Empty(t, xs)
+}
+
+func TestDebugFails(t *testing.T) {
+	t.Parallel()
+	srv, _, _ := testServer(t, false, nil)
+	defer srv.Shutdown()
+
+	ui := new(cli.MockUi)
+	cmd := &DebugCommand{Meta: Meta{Ui: ui}}
+
+	// Fails on incorrect args
+	code := cmd.Run([]string{"some", "bad", "args"})
+	require.Equal(t, 1, code)
+
+	// Fails on illegal node ids
+	code = cmd.Run([]string{"-node-id", "foo:bar"})
+	require.Equal(t, 1, code)
+
+	// Fails on missing node ids
+	code = cmd.Run([]string{"-node-id", "abc,def"})
+	require.Equal(t, 1, code)
+
+	// Fails on bad durations
+	code = cmd.Run([]string{"-duration", "foo"})
+	require.Equal(t, 1, code)
+
+	// Fails on bad intervals
+	code = cmd.Run([]string{"-interval", "bar"})
+	require.Equal(t, 1, code)
+
+	// Fails on an existing output directory
+	format := "2006-01-02-150405Z"
+	stamped := "nomad-debug-" + time.Now().UTC().Format(format)
+	path := filepath.Join(os.TempDir(), stamped)
+	os.MkdirAll(path, 0755)
+	defer os.Remove(path)
+	code = cmd.Run([]string{"-output", os.TempDir()})
+	require.Equal(t, 2, code)
+}
+
+func TestDebugCapturedFiles(t *testing.T) {
+	t.Parallel()
+	srv, _, url := testServer(t, false, nil)
+	defer srv.Shutdown()
+
+	ui := new(cli.MockUi)
+	cmd := &DebugCommand{Meta: Meta{Ui: ui}}
+
+	// Calculate the output name
+	format := "2006-01-02-150405Z"
+	stamped := "nomad-debug-" + time.Now().UTC().Format(format)
+	path := filepath.Join(os.TempDir(), stamped)
+	defer os.RemoveAll(path)
+
+	code := cmd.Run([]string{
+		"-address", url,
+		"-output", os.TempDir(),
+		"-server-id", "leader",
+		"-duration", "1s",
+		"-interval", "500ms",
+	})
+
+	require.Empty(t, ui.ErrorWriter.String())
+	require.Equal(t, 0, code)
+	ui.ErrorWriter.Reset()
+
+	// Version is always captured
+	require.FileExists(t, filepath.Join(path, "version", "agent-self.json"))
+
+	// Consul and Vault are only captured if they exist
+	_, err := os.Stat(filepath.Join(path, "version", "consul-agent-self.json"))
+	require.Error(t, err)
+	_, err = os.Stat(filepath.Join(path, "version", "vault-sys-health.json"))
+	require.Error(t, err)
+
+	// Monitor files are only created for the selected servers and nodes
+	require.FileExists(t, filepath.Join(path, "server", "leader", "monitor.log"))
+	require.FileExists(t, filepath.Join(path, "server", "leader", "profile.prof"))
+	require.FileExists(t, filepath.Join(path, "server", "leader", "trace.prof"))
+	require.FileExists(t, filepath.Join(path, "server", "leader", "goroutine.prof"))
+
+	// Multiple snapshots are collected, 00 is always created
+	require.FileExists(t, filepath.Join(path, "nomad", "00", "jobs.json"))
+	require.FileExists(t, filepath.Join(path, "nomad", "00", "nodes.json"))
+
+	// Multiple snapshots are collected, 01 requires two intervals
+	require.FileExists(t, filepath.Join(path, "nomad", "01", "jobs.json"))
+	require.FileExists(t, filepath.Join(path, "nomad", "01", "nodes.json"))
+}