From b430bafe9031f871e21403fde008bd8494350091 Mon Sep 17 00:00:00 2001 From: Dave May Date: Mon, 21 Jun 2021 14:22:49 -0400 Subject: [PATCH] Add remaining pprof profiles to nomad operator debug (#10748) * Add remaining pprof profiles to debug dump * Refactor pprof profile capture * Add WaitForFilesUntil and WaitForResultUntil utility functions * Add CHANGELOG entry --- CHANGELOG.md | 1 + command/operator_debug.go | 82 +++++++++++++++++++--------------- command/operator_debug_test.go | 7 ++- testutil/wait.go | 46 +++++++++++++++---- testutil/wait_test.go | 42 +++++++++++++++++ 5 files changed, 131 insertions(+), 47 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 47874b38424..579f8dcc22c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,7 @@ IMPROVEMENTS: * cli: Added `-monitor` flag to `deployment status` command and automatically monitor deployments from `job run` command. [[GH-10661](https://github.com/hashicorp/nomad/pull/10661)] +* cli: Added remainder of available pprof profiles to `nomad operator debug` capture. [[GH-10748](https://github.com/hashicorp/nomad/issues/10748)] * consul/connect: Validate Connect service upstream address uniqueness within task group [[GH-7833](https://github.com/hashicorp/nomad/issues/7833)] * deps: Update gopsutil for multisocket cpuinfo detection performance fix [[GH-10761](https://github.com/hashicorp/nomad/pull/10790)] * docker: Tasks using `network.mode = "bridge"` that don't set their `network_mode` will receive a `/etc/hosts` file that includes the pause container's hostname and any `extra_hosts`. [[GH-10766](https://github.com/hashicorp/nomad/issues/10766)] diff --git a/command/operator_debug.go b/command/operator_debug.go index a66d3ff24de..d03098422ba 100644 --- a/command/operator_debug.go +++ b/command/operator_debug.go @@ -654,52 +654,60 @@ func (c *OperatorDebugCommand) collectPprof(path, id string, client *api.Client) } } - bs, err = client.Agent().Trace(opts, nil) - if err != nil { - c.Ui.Error(fmt.Sprintf("%s: Failed to retrieve pprof trace.prof, err: %v", path, err)) - } else { - err := c.writeBytes(path, "trace.prof", bs) - if err != nil { - c.Ui.Error(err.Error()) - } + // goroutine debug type 1 = legacy text format for human readable output + opts.Debug = 1 + c.savePprofProfile(path, "goroutine", opts, client) + + // goroutine debug type 2 = goroutine stacks in panic format + opts.Debug = 2 + c.savePprofProfile(path, "goroutine", opts, client) + + // Reset to pprof binary format + opts.Debug = 0 + + c.savePprofProfile(path, "goroutine", opts, client) // Stack traces of all current goroutines + c.savePprofProfile(path, "trace", opts, client) // A trace of execution of the current program + c.savePprofProfile(path, "heap", opts, client) // A sampling of memory allocations of live objects. You can specify the gc GET parameter to run GC before taking the heap sample. + c.savePprofProfile(path, "allocs", opts, client) // A sampling of all past memory allocations + c.savePprofProfile(path, "threadcreate", opts, client) // Stack traces that led to the creation of new OS threads + + // This profile is disabled by default -- Requires runtime.SetBlockProfileRate to enable + // c.savePprofProfile(path, "block", opts, client) // Stack traces that led to blocking on synchronization primitives + + // This profile is disabled by default -- Requires runtime.SetMutexProfileFraction to enable + // c.savePprofProfile(path, "mutex", opts, client) // Stack traces of holders of contended mutexes +} + +// savePprofProfile retrieves a pprof profile and writes to disk +func (c *OperatorDebugCommand) savePprofProfile(path string, profile string, opts api.PprofOptions, client *api.Client) { + fileName := fmt.Sprintf("%s.prof", profile) + if opts.Debug > 0 { + fileName = fmt.Sprintf("%s-debug%d.txt", profile, opts.Debug) } - bs, err = client.Agent().Lookup("goroutine", opts, nil) + bs, err := retrievePprofProfile(profile, opts, client) if err != nil { - c.Ui.Error(fmt.Sprintf("%s: Failed to retrieve pprof goroutine.prof, err: %v", path, err)) - } else { - err := c.writeBytes(path, "goroutine.prof", bs) - if err != nil { - c.Ui.Error(err.Error()) - } + c.Ui.Error(fmt.Sprintf("%s: Failed to retrieve pprof %s, err: %s", path, fileName, err.Error())) } - // Gather goroutine text output - debug type 1 - // debug type 1 writes the legacy text format for human readable output - opts.Debug = 1 - bs, err = client.Agent().Lookup("goroutine", opts, nil) + err = c.writeBytes(path, fileName, bs) if err != nil { - c.Ui.Error(fmt.Sprintf("%s: Failed to retrieve pprof goroutine-debug1.txt, err: %v", path, err)) - } else { - err := c.writeBytes(path, "goroutine-debug1.txt", bs) - if err != nil { - c.Ui.Error(err.Error()) - } + c.Ui.Error(fmt.Sprintf("%s: Failed to write file %s, err: %s", path, fileName, err.Error())) } +} - // Gather goroutine text output - debug type 2 - // When printing the "goroutine" profile, debug=2 means to print the goroutine - // stacks in the same form that a Go program uses when dying due to an unrecovered panic. - opts.Debug = 2 - bs, err = client.Agent().Lookup("goroutine", opts, nil) - if err != nil { - c.Ui.Error(fmt.Sprintf("%s: Failed to retrieve pprof goroutine-debug2.txt, err: %v", path, err)) - } else { - err := c.writeBytes(path, "goroutine-debug2.txt", bs) - if err != nil { - c.Ui.Error(err.Error()) - } +// retrievePprofProfile gets a pprof profile from the node specified in opts using the API client +func retrievePprofProfile(profile string, opts api.PprofOptions, client *api.Client) (bs []byte, err error) { + switch profile { + case "cpuprofile": + bs, err = client.Agent().CPUProfile(opts, nil) + case "trace": + bs, err = client.Agent().Trace(opts, nil) + default: + bs, err = client.Agent().Lookup(profile, opts, nil) } + + return bs, err } // collectPeriodic runs for duration, capturing the cluster state every interval. It flushes and stops diff --git a/command/operator_debug_test.go b/command/operator_debug_test.go index dff63b5c861..a5aa6f88dcc 100644 --- a/command/operator_debug_test.go +++ b/command/operator_debug_test.go @@ -365,11 +365,16 @@ func TestDebug_CapturedFiles(t *testing.T) { // Monitor files are only created when selected filepath.Join(path, "server", "leader", "monitor.log"), + + // Pprof profiles filepath.Join(path, "server", "leader", "profile.prof"), filepath.Join(path, "server", "leader", "trace.prof"), filepath.Join(path, "server", "leader", "goroutine.prof"), filepath.Join(path, "server", "leader", "goroutine-debug1.txt"), filepath.Join(path, "server", "leader", "goroutine-debug2.txt"), + filepath.Join(path, "server", "leader", "heap.prof"), + filepath.Join(path, "server", "leader", "allocs.prof"), + filepath.Join(path, "server", "leader", "threadcreate.prof"), // Multiple snapshots are collected, 00 is always created filepath.Join(path, "nomad", "0000", "jobs.json"), @@ -382,7 +387,7 @@ func TestDebug_CapturedFiles(t *testing.T) { filepath.Join(path, "nomad", "0001", "metrics.json"), } - testutil.WaitForFiles(t, serverFiles) + testutil.WaitForFilesUntil(t, serverFiles, 2*time.Minute) } func TestDebug_ExistingOutput(t *testing.T) { diff --git a/testutil/wait.go b/testutil/wait.go index 91bce007f66..19215d0ced0 100644 --- a/testutil/wait.go +++ b/testutil/wait.go @@ -8,7 +8,6 @@ import ( "github.com/hashicorp/nomad/nomad/structs" "github.com/kr/pretty" - "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" ) @@ -35,6 +34,23 @@ func WaitForResultRetries(retries int64, test testFn, error errorFn) { } } +// WaitForResultUntil waits the duration for the test to pass. +// Otherwise error is called after the deadline expires. +func WaitForResultUntil(until time.Duration, test testFn, errorFunc errorFn) { + var success bool + var err error + deadline := time.Now().Add(until) + for time.Now().Before(deadline) { + success, err = test() + if success { + return + } + // Sleep some arbitrary fraction of the deadline + time.Sleep(until / 30) + } + errorFunc(err) +} + // AssertUntil asserts the test function passes throughout the given duration. // Otherwise error is called on failure. func AssertUntil(until time.Duration, test testFn, error errorFn) { @@ -217,16 +233,28 @@ func WaitForRunning(t testing.TB, rpc rpcFn, job *structs.Job) []*structs.AllocL // WaitForFiles blocks until all the files in the slice are present func WaitForFiles(t testing.TB, files []string) { - assert := assert.New(t) WaitForResult(func() (bool, error) { - for _, f := range files { - exists := assert.FileExists(f) - if !exists { - return false, fmt.Errorf("expected file to exist %s", f) - } - } - return true, nil + return FilesExist(files), nil }, func(err error) { t.Fatalf("missing expected files: %v", err) }) } + +// WaitForFilesUntil blocks until duration or all the files in the slice are present +func WaitForFilesUntil(t testing.TB, files []string, until time.Duration) { + WaitForResultUntil(until, func() (bool, error) { + return FilesExist(files), nil + }, func(err error) { + t.Fatalf("missing expected files: %v", err) + }) +} + +// FilesExist verifies all files in the slice are present +func FilesExist(files []string) bool { + for _, f := range files { + if _, err := os.Stat(f); os.IsNotExist(err) { + return false + } + } + return true +} diff --git a/testutil/wait_test.go b/testutil/wait_test.go index 110b2e6a797..bcc7ff8c6fd 100644 --- a/testutil/wait_test.go +++ b/testutil/wait_test.go @@ -1 +1,43 @@ package testutil + +import ( + "fmt" + "os" + "path/filepath" + "testing" + "time" + + "github.com/stretchr/testify/require" +) + +func TestWait_WaitForFilesUntil(t *testing.T) { + + var files []string + for i := 1; i < 10; i++ { + filename := fmt.Sprintf("test%d.txt", i) + filepath := filepath.Join(os.TempDir(), filename) + files = append(files, filepath) + + defer os.Remove(filepath) + } + + go func() { + for _, filepath := range files { + t.Logf("Creating file %s...", filepath) + fh, err := os.Create(filepath) + fh.Close() + + require.NoError(t, err, "error creating test file") + require.FileExists(t, filepath) + + time.Sleep(250 * time.Millisecond) + } + }() + + duration := 5 * time.Second + t.Log("Waiting 5 seconds for files...") + WaitForFilesUntil(t, files, duration) + + t.Log("done") + +}