diff --git a/command/commands.go b/command/commands.go
index 9a24e4f192e..12a18abe86c 100644
--- a/command/commands.go
+++ b/command/commands.go
@@ -201,6 +201,11 @@ func Commands(metaPtr *Meta, agentUi cli.Ui) map[string]cli.CommandFactory {
 				Meta: meta,
 			}, nil
 		},
+		"debug": func() (cli.Command, error) {
+			return &DebugCommand{
+				Meta: meta,
+			}, nil
+		},
 		"deployment": func() (cli.Command, error) {
 			return &DeploymentCommand{
 				Meta: meta,
diff --git a/command/debug.go b/command/debug.go
new file mode 100644
index 00000000000..46ae6cd9da3
--- /dev/null
+++ b/command/debug.go
@@ -0,0 +1,698 @@
+package command
+
+import (
+	"archive/tar"
+	"compress/gzip"
+	"context"
+	"encoding/json"
+	"fmt"
+	"html/template"
+	"io"
+	"io/ioutil"
+	"net/http"
+	"os"
+	"os/signal"
+	"path/filepath"
+	"strings"
+	"syscall"
+	"time"
+
+	"github.com/hashicorp/nomad/api"
+	"github.com/posener/complete"
+)
+
+type DebugCommand struct {
+	Meta
+
+	timestamp   string
+	collectDir  string
+	duration    time.Duration
+	interval    time.Duration
+	logLevel    string
+	nodeIDs     []string
+	serverIDs   []string
+	consulToken string
+	vaultToken  string
+	manifest    []string
+	ctx         context.Context
+	cancel      context.CancelFunc
+}
+
+const (
+	userAgent = "nomad debug"
+)
+
+func (c *DebugCommand) Help() string {
+	helpText := `
+Usage: nomad debug [options]
+
+  Build an archive containing Nomad cluster configuration and state, plus
+  Consul and Vault status. Includes logs and pprof profiles for the selected
+  servers and client nodes.
+
+General Options:
+
+  ` + generalOptionsUsage() + `
+
+Debug Options:
+
+  -duration=2m
+    The duration of the log monitor command. Defaults to 2m.
+
+  -interval=2m
+    The interval between snapshots of the Nomad state. If unspecified, only
+    one snapshot is captured.
+
+  -log-level=DEBUG
+    The log level to monitor. Defaults to DEBUG.
+
+  -node-id=n1,n2
+    Comma-separated list of Nomad client node IDs to collect logs and pprof
+    profiles from. Accepts ID prefixes.
+
+  -server-id=s1,s2
+    Comma-separated list of Nomad server names, or "leader", to collect logs
+    and pprof profiles from.
+
+  -output=path
+    Path to the parent directory of the output directory. If not specified, an
+    archive is built in the current directory.
+
+  -consul-token
+    Token used to query Consul. Defaults to the CONSUL_TOKEN environment
+    variable.
+
+  -vault-token
+    Token used to query Vault. Defaults to the VAULT_TOKEN environment
+    variable.
+`
+	return strings.TrimSpace(helpText)
+}
+
+func (c *DebugCommand) Synopsis() string {
+	return "Build a debug archive"
+}
+
+func (c *DebugCommand) AutocompleteFlags() complete.Flags {
+	return mergeAutocompleteFlags(c.Meta.AutocompleteFlags(FlagSetClient),
+		complete.Flags{
+			"-duration":     complete.PredictAnything,
+			"-interval":     complete.PredictAnything,
+			"-log-level":    complete.PredictAnything,
+			"-node-id":      complete.PredictAnything,
+			"-server-id":    complete.PredictAnything,
+			"-output":       complete.PredictAnything,
+			"-consul-token": complete.PredictAnything,
+			"-vault-token":  complete.PredictAnything,
+		})
+}
+
+func (c *DebugCommand) AutocompleteArgs() complete.Predictor {
+	return complete.PredictNothing
+}
+
+func (c *DebugCommand) Name() string { return "debug" }
+
+func (c *DebugCommand) Run(args []string) int {
+	flags := c.Meta.FlagSet(c.Name(), FlagSetClient)
+	flags.Usage = func() { c.Ui.Output(c.Help()) }
+
+	var duration, interval, output string
+	var nodeIDs, serverIDs string
+
+	flags.StringVar(&duration, "duration", "2m", "")
+	flags.StringVar(&interval, "interval", "2m", "")
+	flags.StringVar(&c.logLevel, "log-level", "DEBUG", "")
+	flags.StringVar(&nodeIDs, "node-id", "", "")
+	flags.StringVar(&serverIDs, "server-id", "", "")
+	flags.StringVar(&output, "output", "", "")
+	flags.StringVar(&c.consulToken, "consul-token", "", "")
+	flags.StringVar(&c.vaultToken, "vault-token", "", "")
+
+	if err := flags.Parse(args); err != nil {
+		return 1
+	}
+
+	// Parse the time durations
+	d, err := time.ParseDuration(duration)
+	if err != nil {
+		c.Ui.Error(fmt.Sprintf("Error parsing duration: %s: %s", duration, err.Error()))
+		return 1
+	}
+	c.duration = d
+
+	i, err := time.ParseDuration(interval)
+	if err != nil {
+		c.Ui.Error(fmt.Sprintf("Error parsing interval: %s: %s", interval, err.Error()))
+		return 1
+	}
+	c.interval = i
+
+	args = flags.Args()
+	if l := len(args); l != 0 {
+		c.Ui.Error("This command takes no arguments")
+		c.Ui.Error(commandErrorText(c))
+		return 1
+	}
+
+	client, err := c.Meta.Client()
+	if err != nil {
+		c.Ui.Error(fmt.Sprintf("Error initializing client: %s", err.Error()))
+		return 1
+	}
+
+	// Resolve node prefixes
+	for _, id := range argNodes(nodeIDs) {
+		id = sanitizeUUIDPrefix(id)
+		nodes, _, err := client.Nodes().PrefixList(id)
+		if err != nil {
+			c.Ui.Error(fmt.Sprintf("Error querying node info: %s", err))
+			return 1
+		}
+		// Return an error if no nodes are found
+		if len(nodes) == 0 {
+			c.Ui.Error(fmt.Sprintf("No node(s) with prefix %q found", id))
+			return 1
+		}
+
+		for _, n := range nodes {
+			c.nodeIDs = append(c.nodeIDs, n.ID)
+		}
+	}
+
+	// Resolve server prefixes
+	for _, id := range argNodes(serverIDs) {
+		c.serverIDs = append(c.serverIDs, id)
+	}
+
+	c.manifest = make([]string, 0)
+	ctx, cancel := context.WithCancel(context.Background())
+	c.ctx = ctx
+	c.cancel = cancel
+
+	// Set up the output path
+	format := "2006-01-02-150405Z"
+	c.timestamp = time.Now().UTC().Format(format)
+	stamped := "nomad-debug-" + c.timestamp
+
+	var tmp string
+	if output != "" {
+		tmp = filepath.Join(output, stamped)
+		_, err := os.Stat(tmp)
+		if !os.IsNotExist(err) {
+			c.Ui.Error("Output directory already exists")
+			return 2
+		}
+	} else {
+		tmp, err = ioutil.TempDir(os.TempDir(), stamped)
+		if err != nil {
+			c.Ui.Error(fmt.Sprintf("Error creating tmp directory: %s", err.Error()))
+			return 2
+		}
+		defer os.RemoveAll(tmp)
+	}
+
+	c.collectDir = tmp
+
+	// Capture signals so we can shut down the monitor API calls on interrupt
+	c.trap()
+
+	err = c.collect(client)
+	if err != nil {
+		c.Ui.Error(fmt.Sprintf("Error collecting data: %s", err.Error()))
+		return 2
+	}
+
+	c.writeManifest()
+
+	if output != "" {
+		c.Ui.Output(fmt.Sprintf("Created debug directory: %s", c.collectDir))
+		return 0
+	}
+
+	archiveFile := stamped + ".tar.gz"
+	err = TarCZF(archiveFile, tmp)
+	if err != nil {
+		c.Ui.Error(fmt.Sprintf("Error creating archive: %s", err.Error()))
+		return 2
+	}
+
+	c.Ui.Output(fmt.Sprintf("Created debug archive: %s", archiveFile))
+	return 0
+}
+
+// collect collects data from our endpoints and writes the archive bundle
+func (c *DebugCommand) collect(client *api.Client) error {
+	// Version contains cluster meta information
+	dir := "version"
+	err := c.mkdir(dir)
+	if err != nil {
+		return err
+	}
+
+	self, err := client.Agent().Self()
+	if err != nil {
+		return fmt.Errorf("agent self: %s", err.Error())
+	}
+	c.writeJSON(dir, "agent-self.json", self)
+
+	// Fetch data directly from Consul and Vault. Ignore errors
+	var consul, vault string
+
+	m, ok := self.Config["Consul"].(map[string]interface{})
+	if ok {
+		raw := m["Addr"]
+		consul, _ = raw.(string)
+		raw = m["EnableSSL"]
+		ssl, _ := raw.(bool)
+		if ssl {
+			consul = "https://" + consul
+		} else {
+			consul = "http://" + consul
+		}
+	}
+
+	m, ok = self.Config["Vault"].(map[string]interface{})
+	if ok {
+		raw := m["Addr"]
+		vault, _ = raw.(string)
+	}
+
+	c.collectConsul(dir, consul)
+	c.collectVault(dir, vault)
+
+	c.startMonitors(client)
+	c.collectPeriodic(client)
+	c.collectPprofs(client)
+
+	return nil
+}
+
+// path returns platform-specific paths in the tmp root directory
+func (c *DebugCommand) path(paths ...string) string {
+	ps := []string{c.collectDir}
+	ps = append(ps, paths...)
+	return filepath.Join(ps...)
+}
+
+// mkdir creates directories in the tmp root directory
+func (c *DebugCommand) mkdir(paths ...string) error {
+	return os.MkdirAll(c.path(paths...), 0755)
+}
+
+// startMonitors starts a monitor goroutine for each selected client node and server
+func (c *DebugCommand) startMonitors(client *api.Client) {
+	for _, id := range c.nodeIDs {
+		go c.startMonitor("client", "node_id", id, client)
+	}
+
+	for _, id := range c.serverIDs {
+		go c.startMonitor("server", "server_id", id, client)
+	}
+}
+
+// startMonitor starts one monitor API request, writing its output to a file.
+// It blocks and should be called in a goroutine. Errors are ignored; we want
+// to build the archive even if a node is unavailable.
+func (c *DebugCommand) startMonitor(path, idKey, nodeID string, client *api.Client) {
+	c.mkdir(path, nodeID)
+	fh, err := os.Create(c.path(path, nodeID, "monitor.log"))
+	if err != nil {
+		return
+	}
+	defer fh.Close()
+
+	qo := api.QueryOptions{
+		Params: map[string]string{
+			idKey:       nodeID,
+			"log_level": c.logLevel,
+		},
+	}
+
+	outCh, errCh := client.Agent().Monitor(c.ctx.Done(), &qo)
+	for {
+		select {
+		case out := <-outCh:
+			if out == nil {
+				continue
+			}
+			fh.Write(out.Data)
+			fh.WriteString("\n")
+
+		case err := <-errCh:
+			fh.WriteString(fmt.Sprintf("monitor: %s\n", err.Error()))
+			return
+
+		case <-c.ctx.Done():
+			return
+		}
+	}
+}
+
+// collectPprofs captures the /agent/pprof output for each listed node
+func (c *DebugCommand) collectPprofs(client *api.Client) {
+	for _, n := range c.nodeIDs {
+		c.collectPprof("client", n, client)
+	}
+
+	for _, n := range c.serverIDs {
+		c.collectPprof("server", n, client)
+	}
+}
+
+// collectPprof captures pprof data for the node
+func (c *DebugCommand) collectPprof(path, id string, client *api.Client) {
+	opts := api.PprofOptions{Seconds: 1}
+	if path == "server" {
+		opts.ServerID = id
+	} else {
+		opts.NodeID = id
+	}
+
+	path = filepath.Join(path, id)
+
+	bs, err := client.Agent().CPUProfile(opts, nil)
+	if err == nil {
+		c.writeBytes(path, "profile.prof", bs)
+	}
+
+	bs, err = client.Agent().Trace(opts, nil)
+	if err == nil {
+		c.writeBytes(path, "trace.prof", bs)
+	}
+
+	bs, err = client.Agent().Lookup("goroutine", opts, nil)
+	if err == nil {
+		c.writeBytes(path, "goroutine.prof", bs)
+	}
+}
+
+// collectPeriodic runs for the configured duration, capturing the cluster
+// state every interval. It flushes and stops the monitor requests
+func (c *DebugCommand) collectPeriodic(client *api.Client) {
+	// Not monitoring any logs, just capture the Nomad context before exit
+	if len(c.nodeIDs) == 0 && len(c.serverIDs) == 0 {
+		dir := filepath.Join("nomad", "00")
+		c.collectNomad(dir, client)
+		return
+	}
+
+	duration := time.After(c.duration)
+	// Set interval to 0 so that we execute immediately, and wait the interval next time
+	interval := time.After(0 * time.Second)
+	var intervalCount int
+	var dir string
+
+	for {
+		select {
+		case <-duration:
+			c.cancel()
+			return
+
+		case <-interval:
+			dir = filepath.Join("nomad", fmt.Sprintf("%02d", intervalCount))
+			c.collectNomad(dir, client)
+			interval = time.After(c.interval)
+			intervalCount++
+
+		case <-c.ctx.Done():
+			return
+		}
+	}
+}
+
+// collectNomad captures the Nomad cluster state
+func (c *DebugCommand) collectNomad(dir string, client *api.Client) error {
+	err := c.mkdir(dir)
+	if err != nil {
+		return err
+	}
+
+	var qo *api.QueryOptions
+
+	js, _, err := client.Jobs().List(qo)
+	if err != nil {
+		return fmt.Errorf("listing jobs: %s", err.Error())
+	}
+	c.writeJSON(dir, "jobs.json", js)
+
+	ds, _, err := client.Deployments().List(qo)
+	if err != nil {
+		return fmt.Errorf("listing deployments: %s", err.Error())
+	}
+	c.writeJSON(dir, "deployments.json", ds)
+
+	es, _, err := client.Evaluations().List(qo)
+	if err != nil {
+		return fmt.Errorf("listing evaluations: %s", err.Error())
+	}
+	c.writeJSON(dir, "evaluations.json", es)
+
+	as, _, err := client.Allocations().List(qo)
+	if err != nil {
+		return fmt.Errorf("listing allocations: %s", err.Error())
+	}
+	c.writeJSON(dir, "allocations.json", as)
+
+	ns, _, err := client.Nodes().List(qo)
+	if err != nil {
+		return fmt.Errorf("listing nodes: %s", err.Error())
+	}
c.writeJSON(dir, "nodes.json", ns) + + ps, _, err := client.CSIPlugins().List(qo) + if err != nil { + return fmt.Errorf("listing plugins: %s", err.Error()) + } + c.writeJSON(dir, "plugins.json", ps) + + vs, _, err := client.CSIVolumes().List(qo) + if err != nil { + return fmt.Errorf("listing volumes: %s", err.Error()) + } + c.writeJSON(dir, "volumes.json", vs) + + return nil +} + +// collectConsul calls the Consul API directly to collect data +func (c *DebugCommand) collectConsul(dir, consul string) error { + if consul == "" { + return nil + } + + token := c.consulToken + if token == "" { + os.Getenv("CONSUL_TOKEN") + } + + client := http.Client{ + Timeout: 2 * time.Second, + } + + req, _ := http.NewRequest("GET", consul+"/v1/agent/self", nil) + req.Header.Add("X-Consul-Token", token) + req.Header.Add("User-Agent", userAgent) + resp, err := client.Do(req) + c.writeBody(dir, "consul-agent-self.json", resp, err) + + req, _ = http.NewRequest("GET", consul+"/v1/agent/members", nil) + req.Header.Add("X-Consul-Token", token) + req.Header.Add("User-Agent", userAgent) + resp, err = client.Do(req) + c.writeBody(dir, "consul-agent-members.json", resp, err) + + return nil +} + +// collectVault calls the Vault API directly to collect data +func (c *DebugCommand) collectVault(dir, vault string) error { + if vault == "" { + return nil + } + + token := c.vaultToken + if token == "" { + os.Getenv("VAULT_TOKEN") + } + + client := http.Client{ + Timeout: 2 * time.Second, + } + + req, _ := http.NewRequest("GET", vault+"/sys/health", nil) + req.Header.Add("X-Vault-Token", token) + req.Header.Add("User-Agent", userAgent) + resp, err := client.Do(req) + c.writeBody(dir, "vault-sys-health.json", resp, err) + + return nil +} + +// writeBytes writes a file to the archive, recording it in the manifest +func (c *DebugCommand) writeBytes(dir, file string, data []byte) error { + path := filepath.Join(dir, file) + c.manifest = append(c.manifest, path) + path = filepath.Join(c.collectDir, path) + + fh, err := os.Create(path) + if err != nil { + return err + } + defer fh.Close() + + _, err = fh.Write(data) + return err +} + +// writeJSON writes JSON responses from the Nomad API calls to the archive +func (c *DebugCommand) writeJSON(dir, file string, data interface{}) error { + bytes, err := json.Marshal(data) + if err != nil { + return err + } + return c.writeBytes(dir, file, bytes) +} + +// writeBody is a helper that writes the body of an http.Response to the archive +func (c *DebugCommand) writeBody(dir, file string, resp *http.Response, err error) { + if err != nil { + return + } + defer resp.Body.Close() + + if resp.ContentLength == 0 { + return + } + + body, err := ioutil.ReadAll(resp.Body) + if err != nil { + return + } + + c.writeBytes(dir, file, body) +} + +// writeManifest creates the index files +func (c *DebugCommand) writeManifest() error { + // Write the JSON + path := filepath.Join(c.collectDir, "index.json") + jsonFh, err := os.Create(path) + if err != nil { + return err + } + defer jsonFh.Close() + + json.NewEncoder(jsonFh).Encode(c.manifest) + + // Write the HTML + path = filepath.Join(c.collectDir, "index.html") + htmlFh, err := os.Create(path) + if err != nil { + return err + } + defer htmlFh.Close() + + head, _ := template.New("head").Parse("