From 1c7d1acdfe745d924f828a13c77247d82a692820 Mon Sep 17 00:00:00 2001 From: Marco Iorio Date: Tue, 7 May 2024 14:38:54 +0200 Subject: [PATCH] sysdump: collect clustermesh-apiserver profiling data Let's additionally collect gops profiling data for both the clustermesh and kvstoremesh containers of the clustermesh-apiserver, to enable investigating performance issues. Profiling data is collected proxying a connection to the remote gops server, to avoid relying on the gops client. Indeed, it would then save the result to a temporary directory, and we cannot easily retrieve it given that the target is a distroless image lacking shell tools. Signed-off-by: Marco Iorio --- sysdump/client.go | 2 ++ sysdump/constants.go | 8 ++++-- sysdump/sysdump.go | 64 +++++++++++++++++++++++++++++++++++++++-- sysdump/sysdump_test.go | 4 +++ 4 files changed, 73 insertions(+), 5 deletions(-) diff --git a/sysdump/client.go b/sysdump/client.go index 67ab9b7768..56efc1f084 100644 --- a/sysdump/client.go +++ b/sysdump/client.go @@ -6,6 +6,7 @@ package sysdump import ( "bytes" "context" + "io" "github.com/blang/semver/v4" ciliumv2 "github.com/cilium/cilium/pkg/k8s/apis/cilium.io/v2" @@ -40,6 +41,7 @@ type KubernetesClient interface { GetLogs(ctx context.Context, namespace, name, container string, opts corev1.PodLogOptions) (string, error) GetPodsTable(ctx context.Context) (*metav1.Table, error) ProxyGet(ctx context.Context, namespace, name, url string) (string, error) + ProxyTCP(ctx context.Context, namespace, name string, port uint16, handler func(io.ReadWriteCloser) error) error GetSecret(ctx context.Context, namespace, name string, opts metav1.GetOptions) (*corev1.Secret, error) GetCiliumVersion(ctx context.Context, p *corev1.Pod) (*semver.Version, error) GetVersion(ctx context.Context) (string, error) diff --git a/sysdump/constants.go b/sysdump/constants.go index 58dff11aa1..7cce3d0103 100644 --- a/sysdump/constants.go +++ b/sysdump/constants.go @@ -9,6 +9,8 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime/schema" + "github.com/google/gops/signal" + "github.com/cilium/cilium-cli/defaults" ) @@ -149,9 +151,9 @@ var ( "stack", "stats", } - gopsProfiling = []string{ - "pprof-heap", - "pprof-cpu", + gopsProfiling = map[string]byte{ + "pprof-heap": signal.HeapProfile, + "pprof-cpu": signal.CPUProfile, } gopsTrace = "trace" diff --git a/sysdump/sysdump.go b/sysdump/sysdump.go index 1f342d7a6f..81465c3fbc 100644 --- a/sysdump/sysdump.go +++ b/sysdump/sysdump.go @@ -19,6 +19,7 @@ import ( "sync" "time" + ciliumdef "github.com/cilium/cilium/pkg/defaults" "github.com/cilium/cilium/pkg/versioncheck" "github.com/cilium/workerpool" "github.com/spf13/cobra" @@ -1046,11 +1047,21 @@ func (c *Collector) Run() error { return fmt.Errorf("failed to collect the Cilium clustermesh metrics: %w", err) } - for _, container := range []string{defaults.ClusterMeshContainerName, defaults.ClusterMeshKVStoreMeshContainerName} { + for container, port := range map[string]uint16{ + defaults.ClusterMeshContainerName: ciliumdef.GopsPortApiserver, + defaults.ClusterMeshKVStoreMeshContainerName: ciliumdef.GopsPortKVStoreMesh, + } { err = c.SubmitGopsSubtasks(AllPods(pods), container) if err != nil { return fmt.Errorf("failed to collect the Cilium clustermesh gops stats: %w", err) } + + if c.Options.Profiling { + err = c.SubmitStreamProfilingGopsSubtasks(AllPods(pods), container, port) + if err != nil { + return fmt.Errorf("failed to collect the Cilium clustermesh profiles: %w", err) + } + } } return nil @@ -2448,7 +2459,7 @@ func (c *Collector) SubmitGopsSubtasks(pods []*corev1.Pod, containerName string) func (c *Collector) SubmitProfilingGopsSubtasks(pods []*corev1.Pod, containerName string) error { for _, p := range pods { p := p - for _, g := range gopsProfiling { + for g := range gopsProfiling { g := g if err := c.Pool.Submit(fmt.Sprintf("gops-%s-%s", p.Name, g), func(ctx context.Context) error { agentPID, err := c.getGopsPID(ctx, p, containerName) @@ -2485,6 +2496,55 @@ func (c *Collector) SubmitProfilingGopsSubtasks(pods []*corev1.Pod, containerNam return nil } +// SubmitStreamProfilingGopsSubtasks submits tasks to collect profiling data from pods. Differently +// from SubmitProfilingGopsSubtasks, it directly retrieves the profiles from the remote gops server, +// rather than calling the gops client binary. This allows to retrieve the profiles from distroless +// containers as well, as it does not depend on any shell tools. +func (c *Collector) SubmitStreamProfilingGopsSubtasks(pods []*corev1.Pod, containerName string, port uint16) error { + for _, p := range pods { + p := p + + if !podIsRunningAndHasContainer(p, containerName) { + continue + } + + for g, b := range gopsProfiling { + if err := c.Pool.Submit(fmt.Sprintf("gops-%s-%s", p.Name, g), func(ctx context.Context) error { + err := c.Client.ProxyTCP(ctx, p.Namespace, p.Name, port, func(stream io.ReadWriteCloser) error { + defer stream.Close() + + _, err := stream.Write([]byte{b}) + if err != nil { + return fmt.Errorf("requesting profiling data: %w", err) + } + + file := c.AbsoluteTempPath(fmt.Sprintf("%s-%s-%s-.pprof", p.Name, containerName, g)) + outFile, err := os.OpenFile(file, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644) + if err != nil { + return fmt.Errorf("creating target file: %w", err) + } + defer outFile.Close() + + _, err = io.Copy(outFile, stream) + if err != nil { + return fmt.Errorf("saving profiling data: %w", err) + } + + return nil + }) + + if err != nil { + return fmt.Errorf("failed to collect gops profiling data: %w", err) + } + return nil + }); err != nil { + return fmt.Errorf("failed to submit %s gops task for %q: %w", g, p.Name, err) + } + } + } + return nil +} + // SubmitTracingGopsSubtask submits task to collect tracing data from pods. func (c *Collector) SubmitTracingGopsSubtask(pods []*corev1.Pod, containerName string) error { for _, p := range pods { diff --git a/sysdump/sysdump_test.go b/sysdump/sysdump_test.go index 8430586c86..864494f420 100644 --- a/sysdump/sysdump_test.go +++ b/sysdump/sysdump_test.go @@ -345,6 +345,10 @@ func (c *fakeClient) ProxyGet(_ context.Context, namespace, name, url string) (s return fmt.Sprintf("Get from %s/%s/%s", namespace, name, url), nil } +func (c *fakeClient) ProxyTCP(_ context.Context, _, _ string, _ uint16, _ func(io.ReadWriteCloser) error) error { + panic("implement me") +} + func (c *fakeClient) GetSecret(_ context.Context, _, _ string, _ metav1.GetOptions) (*corev1.Secret, error) { panic("implement me") }