Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

elastic-agent diagnostics pprof #28798

Merged
merged 20 commits into from
Dec 14, 2021
Merged
Show file tree
Hide file tree
Changes from 13 commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
dd06f1f
Allow -httpprof to bind to sockets/pipes
michel-laterman Nov 3, 2021
b035fe8
Enable pprof debug endpoint on socket for agent and beats
michel-laterman Nov 4, 2021
78069ae
Add new Pprof command to control.proto
michel-laterman Nov 6, 2021
131bf1f
Add pprof option to diagnostics collect
michel-laterman Nov 6, 2021
c61c010
Fix linting issues
michel-laterman Nov 8, 2021
e3fba76
Add diagonstics pprof command allow pprof to collect from agent
michel-laterman Nov 8, 2021
f750e5b
Merge branch 'master' into elastic-agent-profile
michel-laterman Nov 17, 2021
dc66dbd
Revert debug socket changes
michel-laterman Nov 17, 2021
bbaf43a
Cleanup timeout handling
michel-laterman Nov 17, 2021
f29e207
Merge remote-tracking branch 'origin/master' into elastic-agent-profile
michel-laterman Nov 22, 2021
c82f697
Fix linting issue add timeout flag
michel-laterman Nov 23, 2021
a9b3693
Add more command help text.
michel-laterman Nov 23, 2021
b4ffdf0
Add CHANGELOG
michel-laterman Nov 23, 2021
8f145a6
move spec collection for routes to fn
michel-laterman Nov 25, 2021
1b2a9ff
add monitoringCfg reference to control server
michel-laterman Nov 25, 2021
639707c
Merge remote-tracking branch 'origin/master' into elastic-agent-profile
michel-laterman Dec 2, 2021
e6707c4
elastic-agent server only processes pprof requests when enabled
michel-laterman Dec 2, 2021
1bdc078
Fix error message fix commands only on elastic-agent
michel-laterman Dec 2, 2021
91140b7
Add pprof fleet.yml, fix nil reference
michel-laterman Dec 6, 2021
4e6ecbb
Change pprof setting name to monitoring.pprof.enabled
michel-laterman Dec 14, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions x-pack/elastic-agent/CHANGELOG.next.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -150,3 +150,4 @@
- Add diagnostics collect command to gather beat metadata, config, policy, and logs and bundle it into an archive. {pull}28461[28461]
- Add `KIBANA_FLEET_SERVICE_TOKEN` to Elastic Agent container. {pull}28096[28096]
- Enable pprof endpoints for beats processes. Allow pprof endpoints for elastic-agent if enabled. {pull}28983[28983]
- Add `--pprof` flag to `elastic-agent diagnostics collect` and an `elastic-agent diagnostics pprof` command to allow operators to gather pprof data from the agent and beats running under it. {pull}28798[28798]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's make sure we follow up with proper docs.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

already in a pr

42 changes: 42 additions & 0 deletions x-pack/elastic-agent/control.proto
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,19 @@ enum ActionStatus {
FAILURE = 1;
}

// PprofOption enumerates the pprof endpoints that can be requested.
// One or more values are sent in PprofRequest.pprofType and each PprofResult
// carries the value it was collected for.
enum PprofOption {
ALLOCS = 0;
BLOCK = 1;
CMDLINE = 2;
GOROUTINE = 3;
HEAP = 4;
MUTEX = 5;
// PprofRequest.traceDuration applies to this option.
PROFILE = 6;
THREADCREATE = 7;
// PprofRequest.traceDuration applies to this option.
TRACE = 8;
}

// Empty message.
message Empty {
}
Expand Down Expand Up @@ -128,6 +141,32 @@ message ProcMetaResponse {
repeated ProcMeta procs = 1;
}

// PprofRequest is a request for pprof data from an http/pprof endpoint.
message PprofRequest {
// The profiles that are requested.
repeated PprofOption pprofType = 1;
// A string representing a time.Duration to apply to the trace and profile options.
string traceDuration = 2;
// The application that will be profiled; if empty, all applications are profiled.
string appName = 3;
// The route key to match for profiling; if empty, all route keys are profiled.
string routeKey = 4;
}

// PprofResult is the result of a pprof request for a given application/route key.
message PprofResult {
// The application this result belongs to.
string appName = 1;
// The route key this result belongs to.
string routeKey = 2;
// The pprof option this result was collected for.
PprofOption pprofType = 3;
// The raw data returned for the requested pprof option.
bytes result = 4;
// An error message for this result; presumably empty on success (the CLI
// treats any non-empty value as a failure).
string error = 5;
}

// PprofResponse is a wrapper to return all pprof results of a single Pprof call.
message PprofResponse {
// One entry per collected application/route key/pprof option.
repeated PprofResult results = 1;
}

service ElasticAgentControl {
// Fetches the currently running version of the Elastic Agent.
rpc Version(Empty) returns (VersionResponse);
Expand All @@ -143,4 +182,7 @@ service ElasticAgentControl {

// Gather all running process metadata.
rpc ProcMeta(Empty) returns (ProcMetaResponse);

// Gather requested pprof data from specified applications.
rpc Pprof(PprofRequest) returns (PprofResponse);
}
200 changes: 194 additions & 6 deletions x-pack/elastic-agent/pkg/agent/cmd/diagnostics.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ import (
"github.com/elastic/beats/v7/x-pack/elastic-agent/pkg/agent/application/paths"
"github.com/elastic/beats/v7/x-pack/elastic-agent/pkg/agent/configuration"
"github.com/elastic/beats/v7/x-pack/elastic-agent/pkg/agent/control/client"
"github.com/elastic/beats/v7/x-pack/elastic-agent/pkg/agent/control/proto"
"github.com/elastic/beats/v7/x-pack/elastic-agent/pkg/agent/errors"
"github.com/elastic/beats/v7/x-pack/elastic-agent/pkg/cli"
"github.com/elastic/beats/v7/x-pack/elastic-agent/pkg/config/operations"
Expand Down Expand Up @@ -63,6 +64,7 @@ func newDiagnosticsCommand(s []string, streams *cli.IOStreams) *cobra.Command {

cmd.Flags().String("output", "human", "Output the diagnostics information in either human, json, or yaml (default: human)")
cmd.AddCommand(newDiagnosticsCollectCommandWithArgs(s, streams))
cmd.AddCommand(newDiagnosticsPprofCommandWithArgs(s, streams))

return cmd
}
Expand All @@ -72,7 +74,7 @@ func newDiagnosticsCollectCommandWithArgs(_ []string, streams *cli.IOStreams) *c
Use: "collect",
Short: "Collect diagnostics information from the elastic-agent and write it to a zip archive.",
Long: "Collect diagnostics information from the elastic-agent and write it to a zip archive.\nNote that any credentials will appear in plain text.",
Args: cobra.MaximumNArgs(1),
Args: cobra.MaximumNArgs(3),
RunE: func(c *cobra.Command, args []string) error {
file, _ := c.Flags().GetString("file")

Expand All @@ -89,12 +91,58 @@ func newDiagnosticsCollectCommandWithArgs(_ []string, streams *cli.IOStreams) *c
return fmt.Errorf("unsupported output: %s", output)
}

return diagnosticsCollectCmd(streams, file, output)
pprof, _ := c.Flags().GetBool("pprof")
d, _ := c.Flags().GetDuration("pprof-duration")
// get the command timeout value only if one is set explicitly.
// otherwise a value of 30s + pprof-duration will be used.
var timeout time.Duration
if c.Flags().Changed("timeout") {
timeout, _ = c.Flags().GetDuration("timeout")
}

return diagnosticsCollectCmd(streams, file, output, pprof, d, timeout)
},
}

cmd.Flags().StringP("file", "f", "", "name of the output diagnostics zip archive")
cmd.Flags().String("output", "yaml", "Output the collected information in either json, or yaml (default: yaml)") // replace output flag with different options
cmd.Flags().Bool("pprof", false, "Collect all pprof data from all running applications.")
cmd.Flags().Duration("pprof-duration", time.Second*30, "The duration to collect trace and profiling data from the debug/pprof endpoints. (default: 30s)")
cmd.Flags().Duration("timeout", time.Second*30, "The timeout for the diagnostics collect command, will be either 30s or 30s+pprof-duration by default. Should be longer then pprof-duration when pprof is enabled as the command needs time to process/archive the response.")

return cmd
}

// newDiagnosticsPprofCommandWithArgs builds the "pprof" sub-command, which
// collects a single pprof profile from the elastic-agent or one of its
// processes and writes it to stdout or to a file.
//
// The first argument is ignored. streams is handed to diagnosticsPprofCmd,
// which writes the command output to it.
func newDiagnosticsPprofCommandWithArgs(_ []string, streams *cli.IOStreams) *cobra.Command {
	cmd := &cobra.Command{
		Use:   "pprof",
		Short: "Collect pprof information from a running process.",
		Long:  "Collect pprof information from the elastic-agent or one of its processes and write to stdout or a file.\nBy default it will gather a 30s profile of the elastic-agent and output on stdout.",
		Args:  cobra.MaximumNArgs(5),
		RunE: func(c *cobra.Command, args []string) error {
			// Lookup errors are ignored: every flag read here is registered
			// below with the matching type.
			file, _ := c.Flags().GetString("file")
			pprofType, _ := c.Flags().GetString("pprof-type")
			d, _ := c.Flags().GetDuration("pprof-duration")
			// get the command timeout value only if one is set explicitly.
			// otherwise a value of 30s + pprof-duration will be used.
			var timeout time.Duration
			if c.Flags().Changed("timeout") {
				timeout, _ = c.Flags().GetDuration("timeout")
			}

			pprofApp, _ := c.Flags().GetString("pprof-application")
			pprofRK, _ := c.Flags().GetString("pprof-route-key")

			return diagnosticsPprofCmd(streams, d, timeout, file, pprofType, pprofApp, pprofRK)
		},
	}

	cmd.Flags().StringP("file", "f", "", "name of the output file, stdout if unspecified.")
	// This command collects exactly one profile type, so the help text must
	// not promise "all pprof data from all running applications".
	cmd.Flags().String("pprof-type", "profile", "The pprof data to collect. Select one of [allocs, block, cmdline, goroutine, heap, mutex, profile, threadcreate, trace]")
	cmd.Flags().Duration("pprof-duration", time.Second*30, "The duration to collect trace and profiling data from the debug/pprof endpoints. (default: 30s)")
	cmd.Flags().Duration("timeout", time.Second*60, "The timeout for the pprof collect command, defaults to 30s+pprof-duration. Should be longer than pprof-duration as the command needs time to process the response.")
	cmd.Flags().String("pprof-application", "elastic-agent", "Application name to collect pprof data from.")
	cmd.Flags().String("pprof-route-key", "default", "Route key to collect pprof data from.")

	return cmd
}
Expand Down Expand Up @@ -127,14 +175,22 @@ func diagnosticCmd(streams *cli.IOStreams, cmd *cobra.Command, args []string) er
return outputFunc(streams.Out, diag)
}

func diagnosticsCollectCmd(streams *cli.IOStreams, fileName, outputFormat string) error {
func diagnosticsCollectCmd(streams *cli.IOStreams, fileName, outputFormat string, pprof bool, pprofDur, cmdTimeout time.Duration) error {
err := tryContainerLoadPaths()
if err != nil {
return err
}

ctx := handleSignal(context.Background())
innerCtx, cancel := context.WithTimeout(ctx, 30*time.Second)
// set command timeout to 30s or 30s+pprofDur if no timeout is specified
if cmdTimeout == time.Duration(0) {
cmdTimeout = time.Second * 30
if pprof {
cmdTimeout += pprofDur
}

}
innerCtx, cancel := context.WithTimeout(ctx, cmdTimeout)
defer cancel()

diag, err := getDiagnostics(innerCtx)
Expand All @@ -151,7 +207,15 @@ func diagnosticsCollectCmd(streams *cli.IOStreams, fileName, outputFormat string
return fmt.Errorf("unable to gather config data: %w", err)
}

err = createZip(fileName, outputFormat, diag, cfg)
var pprofData map[string][]client.ProcPProf = nil
if pprof {
pprofData, err = getAllPprof(innerCtx, pprofDur)
if err != nil {
return fmt.Errorf("unable to gather pprof data: %w", err)
}
}

err = createZip(fileName, outputFormat, diag, cfg, pprofData)
if err != nil {
return fmt.Errorf("unable to create archive %q: %w", fileName, err)
}
Expand All @@ -160,6 +224,68 @@ func diagnosticsCollectCmd(streams *cli.IOStreams, fileName, outputFormat string
return nil
}

// diagnosticsPprofCmd requests a single pprof profile from the running
// elastic-agent daemon and writes the raw result to outFile, or to streams.Out
// when outFile is empty.
//
// dur is forwarded to the daemon with the request. cmdTimeout bounds the pprof
// request; a zero value means 30s+dur. pType is the case-insensitive name of a
// proto.PprofOption. appName and rk select the application and route key to
// profile; rk is cleared when appName is "elastic-agent".
//
// An error is returned when pType is unknown, the daemon cannot be reached,
// the response does not hold exactly one result for the requested type, the
// result itself carries an error, or the output cannot be written.
func diagnosticsPprofCmd(streams *cli.IOStreams, dur, cmdTimeout time.Duration, outFile, pType, appName, rk string) error {
	pt, ok := proto.PprofOption_value[strings.ToUpper(pType)]
	if !ok {
		return fmt.Errorf("unknown pprof-type %q, select one of [allocs, block, cmdline, goroutine, heap, mutex, profile, threadcreate, trace]", pType)
	}

	// the elastic-agent application does not have a route key
	if appName == "elastic-agent" {
		rk = ""
	}

	ctx := handleSignal(context.Background())
	// set cmdTimeout to 30s+dur if not set.
	if cmdTimeout == time.Duration(0) {
		cmdTimeout = time.Second*30 + dur
	}
	innerCtx, cancel := context.WithTimeout(ctx, cmdTimeout)
	defer cancel()

	// NOTE(review): the connection is never explicitly closed in this
	// function — confirm whether the client needs a disconnect call here.
	daemon := client.New()
	err := daemon.Connect(ctx)
	if err != nil {
		return err
	}

	pprofData, err := daemon.Pprof(innerCtx, dur, []proto.PprofOption{proto.PprofOption(pt)}, appName, rk)
	if err != nil {
		return err
	}

	// validate response: the result map is keyed by pprof type name, so a
	// missing key is reported with the type that was looked up.
	typeName := proto.PprofOption_name[pt]
	pArr, ok := pprofData[typeName]
	if !ok {
		return fmt.Errorf("pprof type %q not found in response data (map length: %d)", typeName, len(pprofData))
	}
	if len(pArr) != 1 {
		return fmt.Errorf("route key application length 1 expected, received %d", len(pArr))
	}
	res := pArr[0]

	if res.Error != "" {
		// The message comes from the remote process; never use it as a
		// format string, or any '%' in it would be misinterpreted.
		return fmt.Errorf("%s", res.Error)
	}

	// handle result
	if outFile == "" {
		_, err = streams.Out.Write(res.Result)
		return err
	}

	f, err := os.Create(outFile)
	if err != nil {
		return err
	}
	if _, err = f.Write(res.Result); err != nil {
		// Best-effort close; the write error is the one worth reporting.
		_ = f.Close()
		return err
	}
	// Close explicitly so a failure to flush the data is not silently lost.
	if err = f.Close(); err != nil {
		return err
	}
	fmt.Fprintf(streams.Out, "pprof data written to %s\n", outFile)
	return nil
}

func getDiagnostics(ctx context.Context) (DiagnosticsInfo, error) {
daemon := client.New()
diag := DiagnosticsInfo{}
Expand Down Expand Up @@ -242,7 +368,7 @@ func gatherConfig() (AgentConfig, error) {
//
// The passed DiagnosticsInfo and AgentConfig data is written in the specified output format.
// Any local log files are collected and copied into the archive.
func createZip(fileName, outputFormat string, diag DiagnosticsInfo, cfg AgentConfig) error {
func createZip(fileName, outputFormat string, diag DiagnosticsInfo, cfg AgentConfig, pprof map[string][]client.ProcPProf) error {
f, err := os.Create(fileName)
if err != nil {
return err
Expand Down Expand Up @@ -298,6 +424,13 @@ func createZip(fileName, outputFormat string, diag DiagnosticsInfo, cfg AgentCon
return closeHandlers(err, zw, f)
}

if pprof != nil {
err := zipProfs(zw, pprof)
if err != nil {
return closeHandlers(err, zw, f)
}
}

return closeHandlers(nil, zw, f)
}

Expand Down Expand Up @@ -371,3 +504,58 @@ func closeHandlers(err error, closers ...io.Closer) error {
}
return mErr.ErrorOrNil()
}

// getAllPprof connects to the elastic-agent daemon and requests every pprof
// type listed below, passing d as the collection duration. The application
// name and route key are left empty, so the request is not narrowed to a
// single application or route key.
//
// The returned map is whatever the client's Pprof call produces; a connection
// failure is returned as-is with a nil map.
func getAllPprof(ctx context.Context, d time.Duration) (map[string][]client.ProcPProf, error) {
	agent := client.New()
	if connErr := agent.Connect(ctx); connErr != nil {
		return nil, connErr
	}

	return agent.Pprof(
		ctx,
		d,
		[]proto.PprofOption{
			proto.PprofOption_ALLOCS,
			proto.PprofOption_BLOCK,
			proto.PprofOption_CMDLINE,
			proto.PprofOption_GOROUTINE,
			proto.PprofOption_HEAP,
			proto.PprofOption_MUTEX,
			proto.PprofOption_PROFILE,
			proto.PprofOption_THREADCREATE,
			proto.PprofOption_TRACE,
		},
		"",
		"",
	)
}

// zipProfs writes the collected pprof data into the archive under "pprof/".
//
// A directory entry is created for "pprof/" and for each pprof type. Every
// result becomes one file named <name>_<routeKey>.pprof holding the raw
// result, or <name>_<routeKey>_error.txt holding the error text when the
// result carries an error. The first failure to create or write an entry
// aborts the walk and is returned.
func zipProfs(zw *zip.Writer, pprof map[string][]client.ProcPProf) error {
	if _, err := zw.Create("pprof/"); err != nil {
		return err
	}

	for kind, results := range pprof {
		dir := "pprof/" + kind + "/"
		if _, err := zw.Create(dir); err != nil {
			return err
		}

		for _, res := range results {
			// Pick the entry name and payload first so the create/write
			// sequence is shared by the success and error cases.
			entry := dir + res.Name + "_" + res.RouteKey
			payload := res.Result
			if res.Error != "" {
				entry += "_error.txt"
				payload = []byte(res.Error)
			} else {
				entry += ".pprof"
			}

			w, err := zw.Create(entry)
			if err != nil {
				return err
			}
			if _, err := w.Write(payload); err != nil {
				return err
			}
		}
	}
	return nil
}
36 changes: 36 additions & 0 deletions x-pack/elastic-agent/pkg/agent/control/client/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,14 @@ type ProcMeta struct {
Error string
}

// ProcPProf holds the pprof data returned for a single process.
type ProcPProf struct {
// Name is the application name reported with the result.
Name string
// RouteKey is the route key reported with the result.
RouteKey string
// Result is the raw pprof payload.
Result []byte
// Error is the error message reported with the result; callers in the
// diagnostics command treat a non-empty value as a failed collection.
Error string
}

// AgentStatus is the current status of the Elastic Agent.
type AgentStatus struct {
Status Status
Expand All @@ -95,6 +103,8 @@ type Client interface {
Upgrade(ctx context.Context, version string, sourceURI string) (string, error)
// ProcMeta gathers running process meta-data.
ProcMeta(ctx context.Context) ([]ProcMeta, error)
// Pprof gathers data from the /debug/pprof/ endpoints specified.
Pprof(ctx context.Context, d time.Duration, pprofTypes []proto.PprofOption, appName, routeKey string) (map[string][]ProcPProf, error)
}

// client manages the state and communication to the Elastic Agent.
Expand Down Expand Up @@ -247,3 +257,29 @@ func (c *client) ProcMeta(ctx context.Context) ([]ProcMeta, error) {
}
return procMeta, nil
}

// Pprof asks the Elastic Agent for /debug/pprof data and groups the results by
// pprof type name.
//
// d is sent as the request's trace duration (in time.Duration string form);
// appName and routeKey are forwarded unchanged. Every requested type gets an
// entry in the returned map, even when the response holds no result for it, so
// callers can distinguish "requested but empty" from "not requested".
func (c *client) Pprof(ctx context.Context, d time.Duration, pprofTypes []proto.PprofOption, appName, routeKey string) (map[string][]ProcPProf, error) {
	req := &proto.PprofRequest{
		PprofType:     pprofTypes,
		TraceDuration: d.String(),
		AppName:       appName,
		RouteKey:      routeKey,
	}
	resp, err := c.client.Pprof(ctx, req)
	if err != nil {
		return nil, err
	}

	byType := map[string][]ProcPProf{}
	// Seed an empty, non-nil slice for each requested type.
	for _, requested := range pprofTypes {
		byType[requested.String()] = []ProcPProf{}
	}
	for _, item := range resp.Results {
		key := item.PprofType.String()
		entry := ProcPProf{
			Name:     item.AppName,
			RouteKey: item.RouteKey,
			Result:   item.Result,
			Error:    item.Error,
		}
		byType[key] = append(byType[key], entry)
	}
	return byType, nil
}
Loading