From 6690ec9590088d15ffa5e0df002c6355a15f2918 Mon Sep 17 00:00:00 2001 From: davemay99 Date: Sun, 6 Jun 2021 00:32:38 -0400 Subject: [PATCH 1/4] Add remaining pprof profiles to debug dump --- command/operator_debug.go | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/command/operator_debug.go b/command/operator_debug.go index a66d3ff24de..4dc87decf9b 100644 --- a/command/operator_debug.go +++ b/command/operator_debug.go @@ -674,6 +674,12 @@ func (c *OperatorDebugCommand) collectPprof(path, id string, client *api.Client) } } + err = c.savePprofProfile(path, "heap", opts, client) + err = c.savePprofProfile(path, "allocs", opts, client) + err = c.savePprofProfile(path, "threadcreate", opts, client) + err = c.savePprofProfile(path, "block", opts, client) + err = c.savePprofProfile(path, "mutex", opts, client) + // Gather goroutine text output - debug type 1 // debug type 1 writes the legacy text format for human readable output opts.Debug = 1 @@ -702,6 +708,19 @@ func (c *OperatorDebugCommand) collectPprof(path, id string, client *api.Client) } } +// savePprofProfile collects a pprof profile from the client API and writes to disk +func (c *OperatorDebugCommand) savePprofProfile(path string, profile string, opts api.PprofOptions, client *api.Client) error { + bs, err := client.Agent().Lookup(profile, opts, nil) + if err != nil { + return fmt.Errorf("%s: Failed to retrieve pprof profile %s, err: %v", path, profile, err) + } + + fileName := fmt.Sprintf("%s.prof", profile) + err = c.writeBytes(path, fileName, bs) + + return err +} + // collectPeriodic runs for duration, capturing the cluster state every interval. It flushes and stops // the monitor requests func (c *OperatorDebugCommand) collectPeriodic(client *api.Client) { From fbd6980a71609b6d4319fcbebf1b245048b7e399 Mon Sep 17 00:00:00 2001 From: davemay99 Date: Fri, 11 Jun 2021 10:21:35 -0400 Subject: [PATCH 2/4] Refactor pprof profile capture --- command/operator_debug.go | 95 +++++++++++++++------------------- command/operator_debug_test.go | 5 ++ 2 files changed, 47 insertions(+), 53 deletions(-) diff --git a/command/operator_debug.go b/command/operator_debug.go index 4dc87decf9b..d03098422ba 100644 --- a/command/operator_debug.go +++ b/command/operator_debug.go @@ -654,71 +654,60 @@ func (c *OperatorDebugCommand) collectPprof(path, id string, client *api.Client) } } - bs, err = client.Agent().Trace(opts, nil) - if err != nil { - c.Ui.Error(fmt.Sprintf("%s: Failed to retrieve pprof trace.prof, err: %v", path, err)) - } else { - err := c.writeBytes(path, "trace.prof", bs) - if err != nil { - c.Ui.Error(err.Error()) - } - } + // goroutine debug type 1 = legacy text format for human readable output + opts.Debug = 1 + c.savePprofProfile(path, "goroutine", opts, client) - bs, err = client.Agent().Lookup("goroutine", opts, nil) - if err != nil { - c.Ui.Error(fmt.Sprintf("%s: Failed to retrieve pprof goroutine.prof, err: %v", path, err)) - } else { - err := c.writeBytes(path, "goroutine.prof", bs) - if err != nil { - c.Ui.Error(err.Error()) - } - } + // goroutine debug type 2 = goroutine stacks in panic format + opts.Debug = 2 + c.savePprofProfile(path, "goroutine", opts, client) - err = c.savePprofProfile(path, "heap", opts, client) - err = c.savePprofProfile(path, "allocs", opts, client) - err = c.savePprofProfile(path, "threadcreate", opts, client) - err = c.savePprofProfile(path, "block", opts, client) - err = c.savePprofProfile(path, "mutex", opts, client) + // Reset to pprof binary format + opts.Debug = 0 - // Gather goroutine text output - debug type 1 - // debug type 1 writes the legacy text format for human readable output - opts.Debug = 1 - bs, err = client.Agent().Lookup("goroutine", opts, nil) - if err != nil { - c.Ui.Error(fmt.Sprintf("%s: Failed to retrieve pprof goroutine-debug1.txt, err: %v", path, err)) - } else { - err := c.writeBytes(path, "goroutine-debug1.txt", bs) - if err != nil { - c.Ui.Error(err.Error()) - } + c.savePprofProfile(path, "goroutine", opts, client) // Stack traces of all current goroutines + c.savePprofProfile(path, "trace", opts, client) // A trace of execution of the current program + c.savePprofProfile(path, "heap", opts, client) // A sampling of memory allocations of live objects. You can specify the gc GET parameter to run GC before taking the heap sample. + c.savePprofProfile(path, "allocs", opts, client) // A sampling of all past memory allocations + c.savePprofProfile(path, "threadcreate", opts, client) // Stack traces that led to the creation of new OS threads + + // This profile is disabled by default -- Requires runtime.SetBlockProfileRate to enable + // c.savePprofProfile(path, "block", opts, client) // Stack traces that led to blocking on synchronization primitives + + // This profile is disabled by default -- Requires runtime.SetMutexProfileFraction to enable + // c.savePprofProfile(path, "mutex", opts, client) // Stack traces of holders of contended mutexes +} + +// savePprofProfile retrieves a pprof profile and writes to disk +func (c *OperatorDebugCommand) savePprofProfile(path string, profile string, opts api.PprofOptions, client *api.Client) { + fileName := fmt.Sprintf("%s.prof", profile) + if opts.Debug > 0 { + fileName = fmt.Sprintf("%s-debug%d.txt", profile, opts.Debug) } - // Gather goroutine text output - debug type 2 - // When printing the "goroutine" profile, debug=2 means to print the goroutine - // stacks in the same form that a Go program uses when dying due to an unrecovered panic. - opts.Debug = 2 - bs, err = client.Agent().Lookup("goroutine", opts, nil) + bs, err := retrievePprofProfile(profile, opts, client) if err != nil { - c.Ui.Error(fmt.Sprintf("%s: Failed to retrieve pprof goroutine-debug2.txt, err: %v", path, err)) - } else { - err := c.writeBytes(path, "goroutine-debug2.txt", bs) - if err != nil { - c.Ui.Error(err.Error()) - } + c.Ui.Error(fmt.Sprintf("%s: Failed to retrieve pprof %s, err: %s", path, fileName, err.Error())) } -} -// savePprofProfile collects a pprof profile from the client API and writes to disk -func (c *OperatorDebugCommand) savePprofProfile(path string, profile string, opts api.PprofOptions, client *api.Client) error { - bs, err := client.Agent().Lookup(profile, opts, nil) + err = c.writeBytes(path, fileName, bs) if err != nil { - return fmt.Errorf("%s: Failed to retrieve pprof profile %s, err: %v", path, profile, err) + c.Ui.Error(fmt.Sprintf("%s: Failed to write file %s, err: %s", path, fileName, err.Error())) } +} - fileName := fmt.Sprintf("%s.prof", profile) - err = c.writeBytes(path, fileName, bs) +// retrievePprofProfile gets a pprof profile from the node specified in opts using the API client +func retrievePprofProfile(profile string, opts api.PprofOptions, client *api.Client) (bs []byte, err error) { + switch profile { + case "cpuprofile": + bs, err = client.Agent().CPUProfile(opts, nil) + case "trace": + bs, err = client.Agent().Trace(opts, nil) + default: + bs, err = client.Agent().Lookup(profile, opts, nil) + } - return err + return bs, err } // collectPeriodic runs for duration, capturing the cluster state every interval. It flushes and stops diff --git a/command/operator_debug_test.go b/command/operator_debug_test.go index dff63b5c861..55782662832 100644 --- a/command/operator_debug_test.go +++ b/command/operator_debug_test.go @@ -365,11 +365,16 @@ func TestDebug_CapturedFiles(t *testing.T) { // Monitor files are only created when selected filepath.Join(path, "server", "leader", "monitor.log"), + + // Pprof profiles filepath.Join(path, "server", "leader", "profile.prof"), filepath.Join(path, "server", "leader", "trace.prof"), filepath.Join(path, "server", "leader", "goroutine.prof"), filepath.Join(path, "server", "leader", "goroutine-debug1.txt"), filepath.Join(path, "server", "leader", "goroutine-debug2.txt"), + filepath.Join(path, "server", "leader", "heap.prof"), + filepath.Join(path, "server", "leader", "allocs.prof"), + filepath.Join(path, "server", "leader", "threadcreate.prof"), // Multiple snapshots are collected, 00 is always created filepath.Join(path, "nomad", "0000", "jobs.json"), From 0423c1fae0530226b4fd1188927100d48090099c Mon Sep 17 00:00:00 2001 From: davemay99 Date: Fri, 11 Jun 2021 16:12:55 -0400 Subject: [PATCH 3/4] Add WaitForFilesUntil and WaitForResultUntil utility functions --- command/operator_debug_test.go | 2 +- testutil/wait.go | 46 +++++++++++++++++++++++++++------- testutil/wait_test.go | 42 +++++++++++++++++++++++++++++++ 3 files changed, 80 insertions(+), 10 deletions(-) diff --git a/command/operator_debug_test.go b/command/operator_debug_test.go index 55782662832..a5aa6f88dcc 100644 --- a/command/operator_debug_test.go +++ b/command/operator_debug_test.go @@ -387,7 +387,7 @@ func TestDebug_CapturedFiles(t *testing.T) { filepath.Join(path, "nomad", "0001", "metrics.json"), } - testutil.WaitForFiles(t, serverFiles) + testutil.WaitForFilesUntil(t, serverFiles, 2*time.Minute) } func TestDebug_ExistingOutput(t *testing.T) { diff --git a/testutil/wait.go b/testutil/wait.go index 91bce007f66..19215d0ced0 100644 --- a/testutil/wait.go +++ b/testutil/wait.go @@ -8,7 +8,6 @@ import ( "github.com/hashicorp/nomad/nomad/structs" "github.com/kr/pretty" - "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" ) @@ -35,6 +34,23 @@ func WaitForResultRetries(retries int64, test testFn, error errorFn) { } } +// WaitForResultUntil waits the duration for the test to pass. +// Otherwise error is called after the deadline expires. +func WaitForResultUntil(until time.Duration, test testFn, errorFunc errorFn) { + var success bool + var err error + deadline := time.Now().Add(until) + for time.Now().Before(deadline) { + success, err = test() + if success { + return + } + // Sleep some arbitrary fraction of the deadline + time.Sleep(until / 30) + } + errorFunc(err) +} + // AssertUntil asserts the test function passes throughout the given duration. // Otherwise error is called on failure. func AssertUntil(until time.Duration, test testFn, error errorFn) { @@ -217,16 +233,28 @@ func WaitForRunning(t testing.TB, rpc rpcFn, job *structs.Job) []*structs.AllocL // WaitForFiles blocks until all the files in the slice are present func WaitForFiles(t testing.TB, files []string) { - assert := assert.New(t) WaitForResult(func() (bool, error) { - for _, f := range files { - exists := assert.FileExists(f) - if !exists { - return false, fmt.Errorf("expected file to exist %s", f) - } - } - return true, nil + return FilesExist(files), nil }, func(err error) { t.Fatalf("missing expected files: %v", err) }) } + +// WaitForFilesUntil blocks until duration or all the files in the slice are present +func WaitForFilesUntil(t testing.TB, files []string, until time.Duration) { + WaitForResultUntil(until, func() (bool, error) { + return FilesExist(files), nil + }, func(err error) { + t.Fatalf("missing expected files: %v", err) + }) +} + +// FilesExist verifies all files in the slice are present +func FilesExist(files []string) bool { + for _, f := range files { + if _, err := os.Stat(f); os.IsNotExist(err) { + return false + } + } + return true +} diff --git a/testutil/wait_test.go b/testutil/wait_test.go index 110b2e6a797..bcc7ff8c6fd 100644 --- a/testutil/wait_test.go +++ b/testutil/wait_test.go @@ -1 +1,43 @@ package testutil + +import ( + "fmt" + "os" + "path/filepath" + "testing" + "time" + + "github.com/stretchr/testify/require" +) + +func TestWait_WaitForFilesUntil(t *testing.T) { + + var files []string + for i := 1; i < 10; i++ { + filename := fmt.Sprintf("test%d.txt", i) + filepath := filepath.Join(os.TempDir(), filename) + files = append(files, filepath) + + defer os.Remove(filepath) + } + + go func() { + for _, filepath := range files { + t.Logf("Creating file %s...", filepath) + fh, err := os.Create(filepath) + fh.Close() + + require.NoError(t, err, "error creating test file") + require.FileExists(t, filepath) + + time.Sleep(250 * time.Millisecond) + } + }() + + duration := 5 * time.Second + t.Log("Waiting 5 seconds for files...") + WaitForFilesUntil(t, files, duration) + + t.Log("done") + +} From 5be0076dea6779597bddf0bbed4db62496dc2710 Mon Sep 17 00:00:00 2001 From: davemay99 Date: Fri, 11 Jun 2021 16:31:05 -0400 Subject: [PATCH 4/4] Add CHANGELOG entry --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 47874b38424..579f8dcc22c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,7 @@ IMPROVEMENTS: * cli: Added `-monitor` flag to `deployment status` command and automatically monitor deployments from `job run` command. [[GH-10661](https://github.com/hashicorp/nomad/pull/10661)] +* cli: Added remainder of available pprof profiles to `nomad operator debug` capture. [[GH-10748](https://github.com/hashicorp/nomad/issues/10748)] * consul/connect: Validate Connect service upstream address uniqueness within task group [[GH-7833](https://github.com/hashicorp/nomad/issues/7833)] * deps: Update gopsutil for multisocket cpuinfo detection performance fix [[GH-10761](https://github.com/hashicorp/nomad/pull/10790)] * docker: Tasks using `network.mode = "bridge"` that don't set their `network_mode` will receive a `/etc/hosts` file that includes the pause container's hostname and any `extra_hosts`. [[GH-10766](https://github.com/hashicorp/nomad/issues/10766)]