Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add remaining pprof profiles to nomad operator debug #10748

Merged
merged 4 commits into from
Jun 21, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

IMPROVEMENTS:
* cli: Added `-monitor` flag to `deployment status` command and automatically monitor deployments from `job run` command. [[GH-10661](https://github.com/hashicorp/nomad/pull/10661)]
* cli: Added remainder of available pprof profiles to `nomad operator debug` capture. [[GH-10748](https://github.com/hashicorp/nomad/issues/10748)]
* consul/connect: Validate Connect service upstream address uniqueness within task group [[GH-7833](https://github.com/hashicorp/nomad/issues/7833)]
* deps: Update gopsutil for multisocket cpuinfo detection performance fix [[GH-10761](https://github.com/hashicorp/nomad/pull/10790)]
* docker: Tasks using `network.mode = "bridge"` that don't set their `network_mode` will receive a `/etc/hosts` file that includes the pause container's hostname and any `extra_hosts`. [[GH-10766](https://github.com/hashicorp/nomad/issues/10766)]
Expand Down
82 changes: 45 additions & 37 deletions command/operator_debug.go
Original file line number Diff line number Diff line change
Expand Up @@ -654,52 +654,60 @@ func (c *OperatorDebugCommand) collectPprof(path, id string, client *api.Client)
}
}

bs, err = client.Agent().Trace(opts, nil)
if err != nil {
c.Ui.Error(fmt.Sprintf("%s: Failed to retrieve pprof trace.prof, err: %v", path, err))
} else {
err := c.writeBytes(path, "trace.prof", bs)
if err != nil {
c.Ui.Error(err.Error())
}
// goroutine debug type 1 = legacy text format for human readable output
opts.Debug = 1
c.savePprofProfile(path, "goroutine", opts, client)

// goroutine debug type 2 = goroutine stacks in panic format
opts.Debug = 2
c.savePprofProfile(path, "goroutine", opts, client)

// Reset to pprof binary format
opts.Debug = 0

c.savePprofProfile(path, "goroutine", opts, client) // Stack traces of all current goroutines
c.savePprofProfile(path, "trace", opts, client) // A trace of execution of the current program
c.savePprofProfile(path, "heap", opts, client) // A sampling of memory allocations of live objects. You can specify the gc GET parameter to run GC before taking the heap sample.
c.savePprofProfile(path, "allocs", opts, client) // A sampling of all past memory allocations
c.savePprofProfile(path, "threadcreate", opts, client) // Stack traces that led to the creation of new OS threads

// This profile is disabled by default -- Requires runtime.SetBlockProfileRate to enable
// c.savePprofProfile(path, "block", opts, client) // Stack traces that led to blocking on synchronization primitives

// This profile is disabled by default -- Requires runtime.SetMutexProfileFraction to enable
// c.savePprofProfile(path, "mutex", opts, client) // Stack traces of holders of contended mutexes
}

// savePprofProfile retrieves a pprof profile and writes to disk
func (c *OperatorDebugCommand) savePprofProfile(path string, profile string, opts api.PprofOptions, client *api.Client) {
fileName := fmt.Sprintf("%s.prof", profile)
if opts.Debug > 0 {
fileName = fmt.Sprintf("%s-debug%d.txt", profile, opts.Debug)
}

bs, err = client.Agent().Lookup("goroutine", opts, nil)
bs, err := retrievePprofProfile(profile, opts, client)
if err != nil {
c.Ui.Error(fmt.Sprintf("%s: Failed to retrieve pprof goroutine.prof, err: %v", path, err))
} else {
err := c.writeBytes(path, "goroutine.prof", bs)
if err != nil {
c.Ui.Error(err.Error())
}
c.Ui.Error(fmt.Sprintf("%s: Failed to retrieve pprof %s, err: %s", path, fileName, err.Error()))
}

// Gather goroutine text output - debug type 1
// debug type 1 writes the legacy text format for human readable output
opts.Debug = 1
bs, err = client.Agent().Lookup("goroutine", opts, nil)
err = c.writeBytes(path, fileName, bs)
if err != nil {
c.Ui.Error(fmt.Sprintf("%s: Failed to retrieve pprof goroutine-debug1.txt, err: %v", path, err))
} else {
err := c.writeBytes(path, "goroutine-debug1.txt", bs)
if err != nil {
c.Ui.Error(err.Error())
}
c.Ui.Error(fmt.Sprintf("%s: Failed to write file %s, err: %s", path, fileName, err.Error()))
}
}

// Gather goroutine text output - debug type 2
// When printing the "goroutine" profile, debug=2 means to print the goroutine
// stacks in the same form that a Go program uses when dying due to an unrecovered panic.
opts.Debug = 2
bs, err = client.Agent().Lookup("goroutine", opts, nil)
if err != nil {
c.Ui.Error(fmt.Sprintf("%s: Failed to retrieve pprof goroutine-debug2.txt, err: %v", path, err))
} else {
err := c.writeBytes(path, "goroutine-debug2.txt", bs)
if err != nil {
c.Ui.Error(err.Error())
}
// retrievePprofProfile gets a pprof profile from the node specified in opts using the API client
func retrievePprofProfile(profile string, opts api.PprofOptions, client *api.Client) (bs []byte, err error) {
switch profile {
case "cpuprofile":
bs, err = client.Agent().CPUProfile(opts, nil)
case "trace":
bs, err = client.Agent().Trace(opts, nil)
default:
bs, err = client.Agent().Lookup(profile, opts, nil)
}

return bs, err
}

// collectPeriodic runs for duration, capturing the cluster state every interval. It flushes and stops
Expand Down
7 changes: 6 additions & 1 deletion command/operator_debug_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -365,11 +365,16 @@ func TestDebug_CapturedFiles(t *testing.T) {

// Monitor files are only created when selected
filepath.Join(path, "server", "leader", "monitor.log"),

// Pprof profiles
filepath.Join(path, "server", "leader", "profile.prof"),
filepath.Join(path, "server", "leader", "trace.prof"),
filepath.Join(path, "server", "leader", "goroutine.prof"),
filepath.Join(path, "server", "leader", "goroutine-debug1.txt"),
filepath.Join(path, "server", "leader", "goroutine-debug2.txt"),
filepath.Join(path, "server", "leader", "heap.prof"),
filepath.Join(path, "server", "leader", "allocs.prof"),
filepath.Join(path, "server", "leader", "threadcreate.prof"),

// Multiple snapshots are collected, 00 is always created
filepath.Join(path, "nomad", "0000", "jobs.json"),
Expand All @@ -382,7 +387,7 @@ func TestDebug_CapturedFiles(t *testing.T) {
filepath.Join(path, "nomad", "0001", "metrics.json"),
}

testutil.WaitForFiles(t, serverFiles)
testutil.WaitForFilesUntil(t, serverFiles, 2*time.Minute)
}

func TestDebug_ExistingOutput(t *testing.T) {
Expand Down
46 changes: 37 additions & 9 deletions testutil/wait.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@ import (

"github.com/hashicorp/nomad/nomad/structs"
"github.com/kr/pretty"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)

Expand All @@ -35,6 +34,23 @@ func WaitForResultRetries(retries int64, test testFn, error errorFn) {
}
}

// WaitForResultUntil waits the duration for the test to pass.
// Otherwise error is called after the deadline expires.
func WaitForResultUntil(until time.Duration, test testFn, errorFunc errorFn) {
var success bool
var err error
deadline := time.Now().Add(until)
for time.Now().Before(deadline) {
success, err = test()
if success {
return
}
// Sleep some arbitrary fraction of the deadline
time.Sleep(until / 30)
}
errorFunc(err)
}

// AssertUntil asserts the test function passes throughout the given duration.
// Otherwise error is called on failure.
func AssertUntil(until time.Duration, test testFn, error errorFn) {
Expand Down Expand Up @@ -217,16 +233,28 @@ func WaitForRunning(t testing.TB, rpc rpcFn, job *structs.Job) []*structs.AllocL

// WaitForFiles blocks until all the files in the slice are present
func WaitForFiles(t testing.TB, files []string) {
assert := assert.New(t)
WaitForResult(func() (bool, error) {
for _, f := range files {
exists := assert.FileExists(f)
if !exists {
return false, fmt.Errorf("expected file to exist %s", f)
}
}
return true, nil
return FilesExist(files), nil
}, func(err error) {
t.Fatalf("missing expected files: %v", err)
})
}

// WaitForFilesUntil blocks until duration or all the files in the slice are present
func WaitForFilesUntil(t testing.TB, files []string, until time.Duration) {
WaitForResultUntil(until, func() (bool, error) {
return FilesExist(files), nil
}, func(err error) {
t.Fatalf("missing expected files: %v", err)
})
}

// FilesExist verifies all files in the slice are present
func FilesExist(files []string) bool {
for _, f := range files {
if _, err := os.Stat(f); os.IsNotExist(err) {
return false
}
}
return true
}
42 changes: 42 additions & 0 deletions testutil/wait_test.go
Original file line number Diff line number Diff line change
@@ -1 +1,43 @@
package testutil

import (
"fmt"
"os"
"path/filepath"
"testing"
"time"

"github.com/stretchr/testify/require"
)

func TestWait_WaitForFilesUntil(t *testing.T) {

var files []string
for i := 1; i < 10; i++ {
filename := fmt.Sprintf("test%d.txt", i)
filepath := filepath.Join(os.TempDir(), filename)
files = append(files, filepath)

defer os.Remove(filepath)
}

go func() {
for _, filepath := range files {
t.Logf("Creating file %s...", filepath)
fh, err := os.Create(filepath)
fh.Close()

require.NoError(t, err, "error creating test file")
require.FileExists(t, filepath)

time.Sleep(250 * time.Millisecond)
}
}()

duration := 5 * time.Second
t.Log("Waiting 5 seconds for files...")
WaitForFilesUntil(t, files, duration)

t.Log("done")

}