From dd840b7a79ba22aca7bda3af5cb925ea208c1dc6 Mon Sep 17 00:00:00 2001 From: Chad Patel Date: Fri, 22 Sep 2023 15:01:07 -0500 Subject: [PATCH] tweaks for enhanced container insights metrics (#96) * tweaks for enhanced container insights metrics * run go mod tidy --- internal/aws/containerinsight/const.go | 93 +++++++++++-------- .../awscontainerinsightreceiver/README.md | 80 +++++++++------- receiver/awscontainerinsightreceiver/go.mod | 2 +- .../k8sapiserver/prometheus_scraper.go | 8 ++ .../internal/stores/podstore.go | 19 ++-- .../internal/stores/podstore_test.go | 52 +++++++++-- 6 files changed, 167 insertions(+), 87 deletions(-) diff --git a/internal/aws/containerinsight/const.go b/internal/aws/containerinsight/const.go index 5dddc4d23117..62228df8ecd4 100644 --- a/internal/aws/containerinsight/const.go +++ b/internal/aws/containerinsight/const.go @@ -76,33 +76,38 @@ const ( FSInodesfree = "filesystem_inodes_free" FSUtilization = "filesystem_utilization" - StatusConditionReady = "status_condition_ready" - StatusConditionDiskPressure = "status_condition_disk_pressure" - StatusConditionMemoryPressure = "status_condition_memory_pressure" - StatusConditionPIDPressure = "status_condition_pid_pressure" - StatusConditionNetworkUnavailable = "status_condition_network_unavailable" - StatusConditionUnknown = "status_condition_unknown" - StatusCapacityPods = "status_capacity_pods" - StatusAllocatablePods = "status_allocatable_pods" - StatusNumberAvailable = "status_number_available" - StatusNumberUnavailable = "status_number_unavailable" - StatusDesiredNumberScheduled = "status_desired_number_scheduled" - StatusCurrentNumberScheduled = "status_current_number_scheduled" - StatusReplicasAvailable = "status_replicas_available" - StatusReplicasUnavailable = "status_replicas_unavailable" - SpecReplicas = "spec_replicas" - StatusRunning = "status_running" - StatusTerminated = "status_terminated" - StatusWaiting = "status_waiting" - StatusWaitingReasonCrashed = "status_waiting_reason_crashed" - StatusPending = "status_pending" - StatusSucceeded = "status_succeeded" - StatusFailed = "status_failed" - StatusUnknown = "status_unknown" - StatusReady = "status_ready" - StatusScheduled = "status_scheduled" - ReplicasDesired = "replicas_desired" - ReplicasReady = "replicas_ready" + StatusConditionReady = "status_condition_ready" + StatusConditionDiskPressure = "status_condition_disk_pressure" + StatusConditionMemoryPressure = "status_condition_memory_pressure" + StatusConditionPIDPressure = "status_condition_pid_pressure" + StatusConditionNetworkUnavailable = "status_condition_network_unavailable" + StatusConditionUnknown = "status_condition_unknown" + StatusCapacityPods = "status_capacity_pods" + StatusAllocatablePods = "status_allocatable_pods" + StatusNumberAvailable = "status_number_available" + StatusNumberUnavailable = "status_number_unavailable" + StatusDesiredNumberScheduled = "status_desired_number_scheduled" + StatusCurrentNumberScheduled = "status_current_number_scheduled" + StatusReplicasAvailable = "status_replicas_available" + StatusReplicasUnavailable = "status_replicas_unavailable" + SpecReplicas = "spec_replicas" + StatusRunning = "status_running" + StatusTerminated = "status_terminated" + StatusWaiting = "status_waiting" + StatusWaitingReasonCrashLoopBackOff = "status_waiting_reason_crash_loop_back_off" + StatusWaitingReasonImagePullError = "status_waiting_reason_image_pull_error" + StatusWaitingReasonStartError = "status_waiting_reason_start_error" + StatusWaitingReasonCreateContainerError = "status_waiting_reason_create_container_error" + StatusWaitingReasonCreateContainerConfigError = "status_waiting_reason_create_container_config_error" + StatusTerminatedReasonOOMKilled = "status_terminated_reason_oom_killed" + StatusPending = "status_pending" + StatusSucceeded = "status_succeeded" + StatusFailed = "status_failed" + StatusUnknown = "status_unknown" + StatusReady = "status_ready" + StatusScheduled = "status_scheduled" + ReplicasDesired = "replicas_desired" + ReplicasReady = "replicas_ready" RunningPodCount = "number_of_running_pods" RunningContainerCount = "number_of_running_containers" @@ -157,6 +162,16 @@ const ( UnitPercent = "Percent" ) +var WaitingReasonLookup = map[string]string{ + "CrashLoopBackOff": StatusWaitingReasonCrashLoopBackOff, + "ErrImagePull": StatusWaitingReasonImagePullError, + "ImagePullBackOff": StatusWaitingReasonImagePullError, + "InvalidImageName": StatusWaitingReasonImagePullError, + "CreateContainerError": StatusWaitingReasonCreateContainerError, + "CreateContainerConfigError": StatusWaitingReasonCreateContainerConfigError, + "StartError": StatusWaitingReasonStartError, +} + var metricToUnitMap map[string]string func init() { @@ -246,16 +261,20 @@ func init() { ReplicasReady: UnitCount, // kube-state-metrics equivalents - StatusRunning: UnitCount, - StatusTerminated: UnitCount, - StatusWaiting: UnitCount, - StatusWaitingReasonCrashed: UnitCount, - StatusFailed: UnitCount, - StatusPending: UnitCount, - StatusSucceeded: UnitCount, - StatusUnknown: UnitCount, - StatusReady: UnitCount, - StatusScheduled: UnitCount, + StatusRunning: UnitCount, + StatusTerminated: UnitCount, + StatusWaiting: UnitCount, + StatusWaitingReasonCrashLoopBackOff: UnitCount, + StatusWaitingReasonImagePullError: UnitCount, + StatusWaitingReasonStartError: UnitCount, + StatusWaitingReasonCreateContainerConfigError: UnitCount, + StatusWaitingReasonCreateContainerError: UnitCount, + StatusFailed: UnitCount, + StatusPending: UnitCount, + StatusSucceeded: UnitCount, + StatusUnknown: UnitCount, + StatusReady: UnitCount, + StatusScheduled: UnitCount, // cluster metrics NodeCount: UnitCount, diff --git a/receiver/awscontainerinsightreceiver/README.md b/receiver/awscontainerinsightreceiver/README.md index b49972362e35..81502bb1954a 100644 --- a/receiver/awscontainerinsightreceiver/README.md +++ b/receiver/awscontainerinsightreceiver/README.md @@ -372,14 +372,23 @@ kubectl apply -f config.yaml ### Cluster ControlPlane | Metric | Unit | |-----------------------------------------------------------|---------| -| apiserver_admission_controller_admission_duration_seconds | Seconds | +| apiserver_admission_controller_admission_duration_seconds | Seconds | +| apiserver_admission_step_admission_duration_seconds | Seconds | +| apiserver_admission_webhook_admission_duration_seconds | Seconds | +| apiserver_current_inflight_requests | Count | +| apiserver_current_inqueue_requests | Count | | apiserver_flowcontrol_rejected_requests_total | Count | | apiserver_flowcontrol_request_concurrency_limit | Count | +| apiserver_longrunning_requests | Count | | apiserver_request_duration_seconds | Seconds | | apiserver_request_total | Count | +| apiserver_request_total_5xx | Count | +| apiserver_requested_deprecated_apis | Count | | apiserver_storage_list_duration_seconds | Seconds | | apiserver_storage_objects | Count | -| etcd_db_total_size_in_bytes | Count | +| apiserver_storage_db_total_size_in_bytes | Bytes | +| apiserver_storage_size_bytes | Bytes | +| etcd_db_total_size_in_bytes | Bytes | | etcd_request_duration_seconds | Seconds | | rest_client_request_duration_seconds | Seconds | | rest_client_requests_total | Count | @@ -780,37 +789,42 @@ kubectl apply -f config.yaml ### Container -| Metric | Unit | -|---------------------------------------------------|--------------| -| container_cpu_limit | Millicore | -| container_cpu_request | Millicore | -| container_cpu_usage_system | Millicore | -| container_cpu_usage_total | Millicore | -| container_cpu_usage_user | Millicore | -| container_cpu_utilization | Percent | -| container_cpu_utilization_over_container_limit | Percent | -| container_memory_cache | Bytes | -| container_memory_failcnt | Count | -| container_memory_hierarchical_pgfault | Count/Second | -| container_memory_hierarchical_pgmajfault | Count/Second | -| container_memory_limit | Bytes | -| container_memory_mapped_file | Bytes | -| container_memory_max_usage | Bytes | -| container_memory_pgfault | Count/Second | -| container_memory_pgmajfault | Count/Second | -| container_memory_failures_total | Count/Second | -| container_memory_request | Bytes | -| container_memory_rss | Bytes | -| container_memory_swap | Bytes | -| container_memory_usage | Bytes | -| container_memory_utilization | Percent | -| container_memory_utilization_over_container_limit | Percent | -| container_memory_working_set | Bytes | -| number_of_container_restarts | Count | -| container_status_running | Count | -| container_status_terminated | Count | -| container_status_waiting | Count | -| container_status_waiting_reason_crashed | Count | +| Metric | Unit | +|---------------------------------------------------------------|--------------| +| container_cpu_limit | Millicore | +| container_cpu_request | Millicore | +| container_cpu_usage_system | Millicore | +| container_cpu_usage_total | Millicore | +| container_cpu_usage_user | Millicore | +| container_cpu_utilization | Percent | +| container_cpu_utilization_over_container_limit | Percent | +| container_memory_cache | Bytes | +| container_memory_failcnt | Count | +| container_memory_hierarchical_pgfault | Count/Second | +| container_memory_hierarchical_pgmajfault | Count/Second | +| container_memory_limit | Bytes | +| container_memory_mapped_file | Bytes | +| container_memory_max_usage | Bytes | +| container_memory_pgfault | Count/Second | +| container_memory_pgmajfault | Count/Second | +| container_memory_failures_total | Count/Second | +| container_memory_request | Bytes | +| container_memory_rss | Bytes | +| container_memory_swap | Bytes | +| container_memory_usage | Bytes | +| container_memory_utilization | Percent | +| container_memory_utilization_over_container_limit | Percent | +| container_memory_working_set | Bytes | +| number_of_container_restarts | Count | +| container_status_running | Count | +| container_status_terminated | Count | +| container_status_waiting | Count | +| container_status_waiting_reason_crash_loop_back_off | Count | +| container_status_waiting_reason_image_pull_error | Count | +| container_status_waiting_reason_start_error | Count | +| container_status_waiting_reason_create_container_error | Count | +| container_status_waiting_reason_create_container_config_error | Count | +| container_status_terminated_reason_oom_killed | Count |

diff --git a/receiver/awscontainerinsightreceiver/go.mod b/receiver/awscontainerinsightreceiver/go.mod index 52fa88e90f9c..59268a7f852c 100644 --- a/receiver/awscontainerinsightreceiver/go.mod +++ b/receiver/awscontainerinsightreceiver/go.mod @@ -25,6 +25,7 @@ require ( go.opentelemetry.io/collector/pdata v1.0.0-rcv0014.0.20230908201109-ab3d6c5b6470 go.opentelemetry.io/collector/receiver v0.84.1-0.20230908201109-ab3d6c5b6470 go.uber.org/zap v1.25.0 + golang.org/x/exp v0.0.0-20230713183714-613f0c0eb8a1 gopkg.in/yaml.v2 v2.4.0 k8s.io/api v0.28.1 k8s.io/apimachinery v0.28.1 @@ -203,7 +204,6 @@ require ( go.uber.org/atomic v1.11.0 // indirect go.uber.org/multierr v1.11.0 // indirect golang.org/x/crypto v0.13.0 // indirect - golang.org/x/exp v0.0.0-20230713183714-613f0c0eb8a1 // indirect golang.org/x/mod v0.12.0 // indirect golang.org/x/net v0.15.0 // indirect golang.org/x/oauth2 v0.11.0 // indirect diff --git a/receiver/awscontainerinsightreceiver/internal/k8sapiserver/prometheus_scraper.go b/receiver/awscontainerinsightreceiver/internal/k8sapiserver/prometheus_scraper.go index 7c39ad80874d..e31f9506350e 100644 --- a/receiver/awscontainerinsightreceiver/internal/k8sapiserver/prometheus_scraper.go +++ b/receiver/awscontainerinsightreceiver/internal/k8sapiserver/prometheus_scraper.go @@ -33,12 +33,20 @@ const ( var ( controlPlaneMetricAllowList = []string{ "apiserver_admission_controller_admission_duration_seconds.*", + "apiserver_admission_step_admission_duration_seconds.*", + "apiserver_admission_webhook_admission_duration_seconds.*", + "apiserver_current_inflight_requests", + "apiserver_current_inqueue_requests", "apiserver_flowcontrol_rejected_requests_total", "apiserver_flowcontrol_request_concurrency_limit", + "apiserver_longrunning_requests", "apiserver_request_duration_seconds.*", "apiserver_request_total", + "apiserver_requested_deprecated_apis", "apiserver_storage_list_duration_seconds.*", "apiserver_storage_objects", + "apiserver_storage_db_total_size_in_bytes.*", + "apiserver_storage_size_bytes.*", "etcd_db_total_size_in_bytes.*", "etcd_request_duration_seconds.*", "rest_client_request_duration_seconds.*", diff --git a/receiver/awscontainerinsightreceiver/internal/stores/podstore.go b/receiver/awscontainerinsightreceiver/internal/stores/podstore.go index 723eeb3e7c95..e907b1b6cc96 100644 --- a/receiver/awscontainerinsightreceiver/internal/stores/podstore.go +++ b/receiver/awscontainerinsightreceiver/internal/stores/podstore.go @@ -518,10 +518,9 @@ func (p *PodStore) addStatus(metric CIMetric, pod *corev1.Pod) { for _, containerStatus := range pod.Status.ContainerStatuses { if containerStatus.Name == containerName { possibleStatuses := map[string]int{ - ci.StatusRunning: 0, - ci.StatusWaiting: 0, - ci.StatusWaitingReasonCrashed: 0, - ci.StatusTerminated: 0, + ci.StatusRunning: 0, + ci.StatusWaiting: 0, + ci.StatusTerminated: 0, } switch { case containerStatus.State.Running != nil: @@ -530,10 +529,11 @@ func (p *PodStore) addStatus(metric CIMetric, pod *corev1.Pod) { case containerStatus.State.Waiting != nil: metric.AddTag(ci.ContainerStatus, "Waiting") possibleStatuses[ci.StatusWaiting] = 1 - if containerStatus.State.Waiting.Reason != "" { - metric.AddTag(ci.ContainerStatusReason, containerStatus.State.Waiting.Reason) - if strings.Contains(containerStatus.State.Waiting.Reason, "Crash") { - possibleStatuses[ci.StatusWaitingReasonCrashed] = 1 + reason := containerStatus.State.Waiting.Reason + if reason != "" { + metric.AddTag(ci.ContainerStatusReason, reason) + if val, ok := ci.WaitingReasonLookup[reason]; ok { + possibleStatuses[val] = 1 } } case containerStatus.State.Terminated != nil: @@ -546,6 +546,9 @@ func (p *PodStore) addStatus(metric CIMetric, pod *corev1.Pod) { if containerStatus.LastTerminationState.Terminated != nil && containerStatus.LastTerminationState.Terminated.Reason != "" { metric.AddTag(ci.ContainerLastTerminationReason, containerStatus.LastTerminationState.Terminated.Reason) + if strings.Contains(containerStatus.LastTerminationState.Terminated.Reason, "OOMKilled") { + possibleStatuses[ci.StatusTerminatedReasonOOMKilled] = 1 + } } containerKey := createContainerKeyFromMetric(metric) if containerKey != "" { diff --git a/receiver/awscontainerinsightreceiver/internal/stores/podstore_test.go b/receiver/awscontainerinsightreceiver/internal/stores/podstore_test.go index e71352252c03..982fca985cdb 100644 --- a/receiver/awscontainerinsightreceiver/internal/stores/podstore_test.go +++ b/receiver/awscontainerinsightreceiver/internal/stores/podstore_test.go @@ -15,6 +15,7 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" "go.uber.org/zap" + "golang.org/x/exp/maps" corev1 "k8s.io/api/core/v1" ci "github.com/open-telemetry/opentelemetry-collector-contrib/internal/aws/containerinsight" @@ -357,9 +358,12 @@ func getPodStore() *PodStore { } func generateMetric(fields map[string]interface{}, tags map[string]string) CIMetric { + tagsCopy := maps.Clone(tags) + fieldsCopy := maps.Clone(fields) + return &mockCIMetric{ - tags: tags, - fields: fields, + tags: tagsCopy, + fields: fieldsCopy, } } @@ -672,8 +676,9 @@ func TestPodStore_addStatus_enhanced_metrics(t *testing.T) { assert.Equal(t, "OOMKilled", metric.GetTag(ci.ContainerLastTerminationReason)) assert.Equal(t, int(1), metric.GetField(ci.ContainerRestartCount).(int)) assert.Equal(t, 1, metric.GetField(ci.MetricName(ci.TypeContainer, ci.StatusTerminated))) + assert.Equal(t, 1, metric.GetField(ci.MetricName(ci.TypeContainer, ci.StatusTerminatedReasonOOMKilled))) - pod.Status.ContainerStatuses[0].State.Terminated = nil + pod.Status.ContainerStatuses[0].LastTerminationState.Terminated = nil pod.Status.ContainerStatuses[0].State.Waiting = &corev1.ContainerStateWaiting{Reason: "CrashLoopBackOff"} tags = map[string]string{ci.MetricType: ci.TypeContainer, ci.K8sNamespace: "default", ci.K8sPodNameKey: "cpu-limit", ci.ContainerNamekey: "ubuntu"} @@ -682,9 +687,15 @@ func TestPodStore_addStatus_enhanced_metrics(t *testing.T) { podStore.addStatus(metric, pod) assert.Equal(t, "Waiting", metric.GetTag(ci.ContainerStatus)) assert.Equal(t, 1, metric.GetField(ci.MetricName(ci.TypeContainer, ci.StatusWaiting))) - assert.Equal(t, 1, metric.GetField(ci.MetricName(ci.TypeContainer, ci.StatusWaitingReasonCrashed))) + assert.Equal(t, 1, metric.GetField(ci.MetricName(ci.TypeContainer, ci.StatusWaitingReasonCrashLoopBackOff))) + // sparse metrics + assert.Nil(t, metric.GetField(ci.MetricName(ci.TypeContainer, ci.StatusWaitingReasonImagePullError))) + assert.Nil(t, metric.GetField(ci.MetricName(ci.TypeContainer, ci.StatusTerminatedReasonOOMKilled))) + assert.Nil(t, metric.GetField(ci.MetricName(ci.TypeContainer, ci.StatusWaitingReasonStartError))) + assert.Nil(t, metric.GetField(ci.MetricName(ci.TypeContainer, ci.StatusWaitingReasonCreateContainerError))) + assert.Nil(t, metric.GetField(ci.MetricName(ci.TypeContainer, ci.StatusWaitingReasonCreateContainerConfigError))) - pod.Status.ContainerStatuses[0].State.Waiting = &corev1.ContainerStateWaiting{Reason: "SomeOtherReason"} + pod.Status.ContainerStatuses[0].State.Waiting = &corev1.ContainerStateWaiting{Reason: "ImagePullBackOff"} tags = map[string]string{ci.MetricType: ci.TypeContainer, ci.K8sNamespace: "default", ci.K8sPodNameKey: "cpu-limit", ci.ContainerNamekey: "ubuntu"} metric = generateMetric(fields, tags) @@ -692,7 +703,32 @@ func TestPodStore_addStatus_enhanced_metrics(t *testing.T) { podStore.addStatus(metric, pod) assert.Equal(t, "Waiting", metric.GetTag(ci.ContainerStatus)) assert.Equal(t, 1, metric.GetField(ci.MetricName(ci.TypeContainer, ci.StatusWaiting))) - assert.Equal(t, 0, metric.GetField(ci.MetricName(ci.TypeContainer, ci.StatusWaitingReasonCrashed))) + assert.Equal(t, 1, metric.GetField(ci.MetricName(ci.TypeContainer, ci.StatusWaitingReasonImagePullError))) + + pod.Status.ContainerStatuses[0].State.Waiting = &corev1.ContainerStateWaiting{Reason: "ErrImagePull"} + metric = generateMetric(fields, tags) + podStore.addStatus(metric, pod) + assert.Equal(t, 1, metric.GetField(ci.MetricName(ci.TypeContainer, ci.StatusWaitingReasonImagePullError))) + + pod.Status.ContainerStatuses[0].State.Waiting = &corev1.ContainerStateWaiting{Reason: "InvalidImageName"} + metric = generateMetric(fields, tags) + podStore.addStatus(metric, pod) + assert.Equal(t, 1, metric.GetField(ci.MetricName(ci.TypeContainer, ci.StatusWaitingReasonImagePullError))) + + pod.Status.ContainerStatuses[0].State.Waiting = &corev1.ContainerStateWaiting{Reason: "CreateContainerError"} + metric = generateMetric(fields, tags) + podStore.addStatus(metric, pod) + assert.Equal(t, 1, metric.GetField(ci.MetricName(ci.TypeContainer, ci.StatusWaitingReasonCreateContainerError))) + + pod.Status.ContainerStatuses[0].State.Waiting = &corev1.ContainerStateWaiting{Reason: "CreateContainerConfigError"} + metric = generateMetric(fields, tags) + podStore.addStatus(metric, pod) + assert.Equal(t, 1, metric.GetField(ci.MetricName(ci.TypeContainer, ci.StatusWaitingReasonCreateContainerConfigError))) + + pod.Status.ContainerStatuses[0].State.Waiting = &corev1.ContainerStateWaiting{Reason: "StartError"} + metric = generateMetric(fields, tags) + podStore.addStatus(metric, pod) + assert.Equal(t, 1, metric.GetField(ci.MetricName(ci.TypeContainer, ci.StatusWaitingReasonStartError))) // test delta of restartCount pod.Status.ContainerStatuses[0].RestartCount = 3 @@ -762,7 +798,7 @@ func TestPodStore_addStatus_without_enhanced_metrics(t *testing.T) { podStore.addStatus(metric, pod) assert.Equal(t, "Waiting", metric.GetTag(ci.ContainerStatus)) assert.False(t, metric.HasField(ci.MetricName(ci.TypeContainer, ci.StatusWaiting))) - assert.False(t, metric.HasField(ci.MetricName(ci.TypeContainer, ci.StatusWaitingReasonCrashed))) + assert.False(t, metric.HasField(ci.MetricName(ci.TypeContainer, ci.StatusWaitingReasonCrashLoopBackOff))) pod.Status.ContainerStatuses[0].State.Waiting = &corev1.ContainerStateWaiting{Reason: "SomeOtherReason"} @@ -772,7 +808,7 @@ func TestPodStore_addStatus_without_enhanced_metrics(t *testing.T) { podStore.addStatus(metric, pod) assert.Equal(t, "Waiting", metric.GetTag(ci.ContainerStatus)) assert.False(t, metric.HasField(ci.MetricName(ci.TypeContainer, ci.StatusWaiting))) - assert.False(t, metric.HasField(ci.MetricName(ci.TypeContainer, ci.StatusWaitingReasonCrashed))) + assert.False(t, metric.HasField(ci.MetricName(ci.TypeContainer, ci.StatusWaitingReasonCrashLoopBackOff))) // test delta of restartCount pod.Status.ContainerStatuses[0].RestartCount = 3