Skip to content

Commit

Permalink
sysdump: collect clustermesh-apiserver profiling data
Browse files Browse the repository at this point in the history
Let's additionally collect gops profiling data for both the clustermesh
and kvstoremesh containers of the clustermesh-apiserver, to enable
investigating performance issues. Profiling data is collected proxying
a connection to the remote gops server, to avoid relying on the gops
client. Indeed, it would then save the result to a temporary directory,
and we cannot easily retrieve it given that the target is a distroless
image lacking shell tools.

Signed-off-by: Marco Iorio <[email protected]>
  • Loading branch information
giorio94 authored and michi-covalent committed May 15, 2024
1 parent ca3037f commit 1c7d1ac
Show file tree
Hide file tree
Showing 4 changed files with 73 additions and 5 deletions.
2 changes: 2 additions & 0 deletions sysdump/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ package sysdump
import (
"bytes"
"context"
"io"

"github.com/blang/semver/v4"
ciliumv2 "github.com/cilium/cilium/pkg/k8s/apis/cilium.io/v2"
Expand Down Expand Up @@ -40,6 +41,7 @@ type KubernetesClient interface {
GetLogs(ctx context.Context, namespace, name, container string, opts corev1.PodLogOptions) (string, error)
GetPodsTable(ctx context.Context) (*metav1.Table, error)
ProxyGet(ctx context.Context, namespace, name, url string) (string, error)
ProxyTCP(ctx context.Context, namespace, name string, port uint16, handler func(io.ReadWriteCloser) error) error
GetSecret(ctx context.Context, namespace, name string, opts metav1.GetOptions) (*corev1.Secret, error)
GetCiliumVersion(ctx context.Context, p *corev1.Pod) (*semver.Version, error)
GetVersion(ctx context.Context) (string, error)
Expand Down
8 changes: 5 additions & 3 deletions sysdump/constants.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ import (
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime/schema"

"github.com/google/gops/signal"

"github.com/cilium/cilium-cli/defaults"
)

Expand Down Expand Up @@ -149,9 +151,9 @@ var (
"stack",
"stats",
}
gopsProfiling = []string{
"pprof-heap",
"pprof-cpu",
gopsProfiling = map[string]byte{
"pprof-heap": signal.HeapProfile,
"pprof-cpu": signal.CPUProfile,
}
gopsTrace = "trace"

Expand Down
64 changes: 62 additions & 2 deletions sysdump/sysdump.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ import (
"sync"
"time"

ciliumdef "github.com/cilium/cilium/pkg/defaults"
"github.com/cilium/cilium/pkg/versioncheck"
"github.com/cilium/workerpool"
"github.com/spf13/cobra"
Expand Down Expand Up @@ -1046,11 +1047,21 @@ func (c *Collector) Run() error {
return fmt.Errorf("failed to collect the Cilium clustermesh metrics: %w", err)
}

for _, container := range []string{defaults.ClusterMeshContainerName, defaults.ClusterMeshKVStoreMeshContainerName} {
for container, port := range map[string]uint16{
defaults.ClusterMeshContainerName: ciliumdef.GopsPortApiserver,
defaults.ClusterMeshKVStoreMeshContainerName: ciliumdef.GopsPortKVStoreMesh,
} {
err = c.SubmitGopsSubtasks(AllPods(pods), container)
if err != nil {
return fmt.Errorf("failed to collect the Cilium clustermesh gops stats: %w", err)
}

if c.Options.Profiling {
err = c.SubmitStreamProfilingGopsSubtasks(AllPods(pods), container, port)
if err != nil {
return fmt.Errorf("failed to collect the Cilium clustermesh profiles: %w", err)
}
}
}

return nil
Expand Down Expand Up @@ -2448,7 +2459,7 @@ func (c *Collector) SubmitGopsSubtasks(pods []*corev1.Pod, containerName string)
func (c *Collector) SubmitProfilingGopsSubtasks(pods []*corev1.Pod, containerName string) error {
for _, p := range pods {
p := p
for _, g := range gopsProfiling {
for g := range gopsProfiling {
g := g
if err := c.Pool.Submit(fmt.Sprintf("gops-%s-%s", p.Name, g), func(ctx context.Context) error {
agentPID, err := c.getGopsPID(ctx, p, containerName)
Expand Down Expand Up @@ -2485,6 +2496,55 @@ func (c *Collector) SubmitProfilingGopsSubtasks(pods []*corev1.Pod, containerNam
return nil
}

// SubmitStreamProfilingGopsSubtasks submits tasks to collect profiling data from pods. Differently
// from SubmitProfilingGopsSubtasks, it directly retrieves the profiles from the remote gops server,
// rather than calling the gops client binary. This allows to retrieve the profiles from distroless
// containers as well, as it does not depend on any shell tools.
func (c *Collector) SubmitStreamProfilingGopsSubtasks(pods []*corev1.Pod, containerName string, port uint16) error {
for _, p := range pods {
p := p

if !podIsRunningAndHasContainer(p, containerName) {
continue
}

for g, b := range gopsProfiling {
if err := c.Pool.Submit(fmt.Sprintf("gops-%s-%s", p.Name, g), func(ctx context.Context) error {
err := c.Client.ProxyTCP(ctx, p.Namespace, p.Name, port, func(stream io.ReadWriteCloser) error {
defer stream.Close()

_, err := stream.Write([]byte{b})
if err != nil {
return fmt.Errorf("requesting profiling data: %w", err)
}

file := c.AbsoluteTempPath(fmt.Sprintf("%s-%s-%s-<ts>.pprof", p.Name, containerName, g))
outFile, err := os.OpenFile(file, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644)
if err != nil {
return fmt.Errorf("creating target file: %w", err)
}
defer outFile.Close()

_, err = io.Copy(outFile, stream)
if err != nil {
return fmt.Errorf("saving profiling data: %w", err)
}

return nil
})

if err != nil {
return fmt.Errorf("failed to collect gops profiling data: %w", err)
}
return nil
}); err != nil {
return fmt.Errorf("failed to submit %s gops task for %q: %w", g, p.Name, err)
}
}
}
return nil
}

// SubmitTracingGopsSubtask submits task to collect tracing data from pods.
func (c *Collector) SubmitTracingGopsSubtask(pods []*corev1.Pod, containerName string) error {
for _, p := range pods {
Expand Down
4 changes: 4 additions & 0 deletions sysdump/sysdump_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -345,6 +345,10 @@ func (c *fakeClient) ProxyGet(_ context.Context, namespace, name, url string) (s
return fmt.Sprintf("Get from %s/%s/%s", namespace, name, url), nil
}

func (c *fakeClient) ProxyTCP(_ context.Context, _, _ string, _ uint16, _ func(io.ReadWriteCloser) error) error {
panic("implement me")
}

func (c *fakeClient) GetSecret(_ context.Context, _, _ string, _ metav1.GetOptions) (*corev1.Secret, error) {
panic("implement me")
}
Expand Down

0 comments on commit 1c7d1ac

Please sign in to comment.