diff --git a/internal/aws/containerinsight/const.go b/internal/aws/containerinsight/const.go
index 5dddc4d23117..62228df8ecd4 100644
--- a/internal/aws/containerinsight/const.go
+++ b/internal/aws/containerinsight/const.go
@@ -76,33 +76,38 @@ const (
FSInodesfree = "filesystem_inodes_free"
FSUtilization = "filesystem_utilization"
- StatusConditionReady = "status_condition_ready"
- StatusConditionDiskPressure = "status_condition_disk_pressure"
- StatusConditionMemoryPressure = "status_condition_memory_pressure"
- StatusConditionPIDPressure = "status_condition_pid_pressure"
- StatusConditionNetworkUnavailable = "status_condition_network_unavailable"
- StatusConditionUnknown = "status_condition_unknown"
- StatusCapacityPods = "status_capacity_pods"
- StatusAllocatablePods = "status_allocatable_pods"
- StatusNumberAvailable = "status_number_available"
- StatusNumberUnavailable = "status_number_unavailable"
- StatusDesiredNumberScheduled = "status_desired_number_scheduled"
- StatusCurrentNumberScheduled = "status_current_number_scheduled"
- StatusReplicasAvailable = "status_replicas_available"
- StatusReplicasUnavailable = "status_replicas_unavailable"
- SpecReplicas = "spec_replicas"
- StatusRunning = "status_running"
- StatusTerminated = "status_terminated"
- StatusWaiting = "status_waiting"
- StatusWaitingReasonCrashed = "status_waiting_reason_crashed"
- StatusPending = "status_pending"
- StatusSucceeded = "status_succeeded"
- StatusFailed = "status_failed"
- StatusUnknown = "status_unknown"
- StatusReady = "status_ready"
- StatusScheduled = "status_scheduled"
- ReplicasDesired = "replicas_desired"
- ReplicasReady = "replicas_ready"
+ StatusConditionReady = "status_condition_ready"
+ StatusConditionDiskPressure = "status_condition_disk_pressure"
+ StatusConditionMemoryPressure = "status_condition_memory_pressure"
+ StatusConditionPIDPressure = "status_condition_pid_pressure"
+ StatusConditionNetworkUnavailable = "status_condition_network_unavailable"
+ StatusConditionUnknown = "status_condition_unknown"
+ StatusCapacityPods = "status_capacity_pods"
+ StatusAllocatablePods = "status_allocatable_pods"
+ StatusNumberAvailable = "status_number_available"
+ StatusNumberUnavailable = "status_number_unavailable"
+ StatusDesiredNumberScheduled = "status_desired_number_scheduled"
+ StatusCurrentNumberScheduled = "status_current_number_scheduled"
+ StatusReplicasAvailable = "status_replicas_available"
+ StatusReplicasUnavailable = "status_replicas_unavailable"
+ SpecReplicas = "spec_replicas"
+ StatusRunning = "status_running"
+ StatusTerminated = "status_terminated"
+ StatusWaiting = "status_waiting"
+ StatusWaitingReasonCrashLoopBackOff = "status_waiting_reason_crash_loop_back_off"
+ StatusWaitingReasonImagePullError = "status_waiting_reason_image_pull_error"
+ StatusWaitingReasonStartError = "status_waiting_reason_start_error"
+ StatusWaitingReasonCreateContainerError = "status_waiting_reason_create_container_error"
+ StatusWaitingReasonCreateContainerConfigError = "status_waiting_reason_create_container_config_error"
+ StatusTerminatedReasonOOMKilled = "status_terminated_reason_oom_killed"
+ StatusPending = "status_pending"
+ StatusSucceeded = "status_succeeded"
+ StatusFailed = "status_failed"
+ StatusUnknown = "status_unknown"
+ StatusReady = "status_ready"
+ StatusScheduled = "status_scheduled"
+ ReplicasDesired = "replicas_desired"
+ ReplicasReady = "replicas_ready"
RunningPodCount = "number_of_running_pods"
RunningContainerCount = "number_of_running_containers"
@@ -157,6 +162,19 @@ const (
UnitPercent = "Percent"
)
+// WaitingReasonLookup maps a container waiting reason to the name of the
+// status_waiting_reason_* metric recorded for it. ErrImagePull,
+// ImagePullBackOff and InvalidImageName all share the image pull error metric.
+var WaitingReasonLookup = map[string]string{
+ "CrashLoopBackOff": StatusWaitingReasonCrashLoopBackOff,
+ "ErrImagePull": StatusWaitingReasonImagePullError,
+ "ImagePullBackOff": StatusWaitingReasonImagePullError,
+ "InvalidImageName": StatusWaitingReasonImagePullError,
+ "CreateContainerError": StatusWaitingReasonCreateContainerError,
+ "CreateContainerConfigError": StatusWaitingReasonCreateContainerConfigError,
+ "StartError": StatusWaitingReasonStartError,
+}
+
var metricToUnitMap map[string]string
func init() {
@@ -246,16 +261,21 @@ func init() {
ReplicasReady: UnitCount,
// kube-state-metrics equivalents
- StatusRunning: UnitCount,
- StatusTerminated: UnitCount,
- StatusWaiting: UnitCount,
- StatusWaitingReasonCrashed: UnitCount,
- StatusFailed: UnitCount,
- StatusPending: UnitCount,
- StatusSucceeded: UnitCount,
- StatusUnknown: UnitCount,
- StatusReady: UnitCount,
- StatusScheduled: UnitCount,
+ StatusRunning: UnitCount,
+ StatusTerminated: UnitCount,
+ StatusWaiting: UnitCount,
+ StatusWaitingReasonCrashLoopBackOff: UnitCount,
+ StatusWaitingReasonImagePullError: UnitCount,
+ StatusWaitingReasonStartError: UnitCount,
+ StatusWaitingReasonCreateContainerConfigError: UnitCount,
+ StatusWaitingReasonCreateContainerError: UnitCount,
+ StatusTerminatedReasonOOMKilled: UnitCount,
+ StatusFailed: UnitCount,
+ StatusPending: UnitCount,
+ StatusSucceeded: UnitCount,
+ StatusUnknown: UnitCount,
+ StatusReady: UnitCount,
+ StatusScheduled: UnitCount,
// cluster metrics
NodeCount: UnitCount,
diff --git a/receiver/awscontainerinsightreceiver/README.md b/receiver/awscontainerinsightreceiver/README.md
index b49972362e35..81502bb1954a 100644
--- a/receiver/awscontainerinsightreceiver/README.md
+++ b/receiver/awscontainerinsightreceiver/README.md
@@ -372,14 +372,23 @@ kubectl apply -f config.yaml
### Cluster ControlPlane
| Metric | Unit |
|-----------------------------------------------------------|---------|
-| apiserver_admission_controller_admission_duration_seconds | Seconds |
+| apiserver_admission_controller_admission_duration_seconds | Seconds |
+| apiserver_admission_step_admission_duration_seconds | Seconds |
+| apiserver_admission_webhook_admission_duration_seconds | Seconds |
+| apiserver_current_inflight_requests | Count |
+| apiserver_current_inqueue_requests | Count |
| apiserver_flowcontrol_rejected_requests_total | Count |
| apiserver_flowcontrol_request_concurrency_limit | Count |
+| apiserver_longrunning_requests | Count |
| apiserver_request_duration_seconds | Seconds |
| apiserver_request_total | Count |
+| apiserver_request_total_5xx | Count |
+| apiserver_requested_deprecated_apis | Count |
| apiserver_storage_list_duration_seconds | Seconds |
| apiserver_storage_objects | Count |
-| etcd_db_total_size_in_bytes | Count |
+| apiserver_storage_db_total_size_in_bytes | Bytes |
+| apiserver_storage_size_bytes | Bytes |
+| etcd_db_total_size_in_bytes | Bytes |
| etcd_request_duration_seconds | Seconds |
| rest_client_request_duration_seconds | Seconds |
| rest_client_requests_total | Count |
@@ -780,37 +789,42 @@ kubectl apply -f config.yaml
### Container
-| Metric | Unit |
-|---------------------------------------------------|--------------|
-| container_cpu_limit | Millicore |
-| container_cpu_request | Millicore |
-| container_cpu_usage_system | Millicore |
-| container_cpu_usage_total | Millicore |
-| container_cpu_usage_user | Millicore |
-| container_cpu_utilization | Percent |
-| container_cpu_utilization_over_container_limit | Percent |
-| container_memory_cache | Bytes |
-| container_memory_failcnt | Count |
-| container_memory_hierarchical_pgfault | Count/Second |
-| container_memory_hierarchical_pgmajfault | Count/Second |
-| container_memory_limit | Bytes |
-| container_memory_mapped_file | Bytes |
-| container_memory_max_usage | Bytes |
-| container_memory_pgfault | Count/Second |
-| container_memory_pgmajfault | Count/Second |
-| container_memory_failures_total | Count/Second |
-| container_memory_request | Bytes |
-| container_memory_rss | Bytes |
-| container_memory_swap | Bytes |
-| container_memory_usage | Bytes |
-| container_memory_utilization | Percent |
-| container_memory_utilization_over_container_limit | Percent |
-| container_memory_working_set | Bytes |
-| number_of_container_restarts | Count |
-| container_status_running | Count |
-| container_status_terminated | Count |
-| container_status_waiting | Count |
-| container_status_waiting_reason_crashed | Count |
+| Metric | Unit |
+|---------------------------------------------------------------|--------------|
+| container_cpu_limit | Millicore |
+| container_cpu_request | Millicore |
+| container_cpu_usage_system | Millicore |
+| container_cpu_usage_total | Millicore |
+| container_cpu_usage_user | Millicore |
+| container_cpu_utilization | Percent |
+| container_cpu_utilization_over_container_limit | Percent |
+| container_memory_cache | Bytes |
+| container_memory_failcnt | Count |
+| container_memory_hierarchical_pgfault | Count/Second |
+| container_memory_hierarchical_pgmajfault | Count/Second |
+| container_memory_limit | Bytes |
+| container_memory_mapped_file | Bytes |
+| container_memory_max_usage | Bytes |
+| container_memory_pgfault | Count/Second |
+| container_memory_pgmajfault | Count/Second |
+| container_memory_failures_total | Count/Second |
+| container_memory_request | Bytes |
+| container_memory_rss | Bytes |
+| container_memory_swap | Bytes |
+| container_memory_usage | Bytes |
+| container_memory_utilization | Percent |
+| container_memory_utilization_over_container_limit | Percent |
+| container_memory_working_set | Bytes |
+| number_of_container_restarts | Count |
+| container_status_running | Count |
+| container_status_terminated | Count |
+| container_status_waiting | Count |
+| container_status_waiting_reason_crash_loop_back_off | Count |
+| container_status_waiting_reason_image_pull_error | Count |
+| container_status_waiting_reason_start_error | Count |
+| container_status_waiting_reason_create_container_error | Count |
+| container_status_waiting_reason_create_container_config_error | Count |
+| container_status_terminated_reason_oom_killed | Count |
diff --git a/receiver/awscontainerinsightreceiver/go.mod b/receiver/awscontainerinsightreceiver/go.mod
index 52fa88e90f9c..59268a7f852c 100644
--- a/receiver/awscontainerinsightreceiver/go.mod
+++ b/receiver/awscontainerinsightreceiver/go.mod
@@ -25,6 +25,7 @@ require (
go.opentelemetry.io/collector/pdata v1.0.0-rcv0014.0.20230908201109-ab3d6c5b6470
go.opentelemetry.io/collector/receiver v0.84.1-0.20230908201109-ab3d6c5b6470
go.uber.org/zap v1.25.0
+ golang.org/x/exp v0.0.0-20230713183714-613f0c0eb8a1
gopkg.in/yaml.v2 v2.4.0
k8s.io/api v0.28.1
k8s.io/apimachinery v0.28.1
@@ -203,7 +204,6 @@ require (
go.uber.org/atomic v1.11.0 // indirect
go.uber.org/multierr v1.11.0 // indirect
golang.org/x/crypto v0.13.0 // indirect
- golang.org/x/exp v0.0.0-20230713183714-613f0c0eb8a1 // indirect
golang.org/x/mod v0.12.0 // indirect
golang.org/x/net v0.15.0 // indirect
golang.org/x/oauth2 v0.11.0 // indirect
diff --git a/receiver/awscontainerinsightreceiver/internal/k8sapiserver/prometheus_scraper.go b/receiver/awscontainerinsightreceiver/internal/k8sapiserver/prometheus_scraper.go
index 7c39ad80874d..e31f9506350e 100644
--- a/receiver/awscontainerinsightreceiver/internal/k8sapiserver/prometheus_scraper.go
+++ b/receiver/awscontainerinsightreceiver/internal/k8sapiserver/prometheus_scraper.go
@@ -33,12 +33,20 @@ const (
var (
controlPlaneMetricAllowList = []string{
"apiserver_admission_controller_admission_duration_seconds.*",
+ "apiserver_admission_step_admission_duration_seconds.*",
+ "apiserver_admission_webhook_admission_duration_seconds.*",
+ "apiserver_current_inflight_requests",
+ "apiserver_current_inqueue_requests",
"apiserver_flowcontrol_rejected_requests_total",
"apiserver_flowcontrol_request_concurrency_limit",
+ "apiserver_longrunning_requests",
"apiserver_request_duration_seconds.*",
"apiserver_request_total",
+ "apiserver_requested_deprecated_apis",
"apiserver_storage_list_duration_seconds.*",
"apiserver_storage_objects",
+ "apiserver_storage_db_total_size_in_bytes.*",
+ "apiserver_storage_size_bytes.*",
"etcd_db_total_size_in_bytes.*",
"etcd_request_duration_seconds.*",
"rest_client_request_duration_seconds.*",
diff --git a/receiver/awscontainerinsightreceiver/internal/stores/podstore.go b/receiver/awscontainerinsightreceiver/internal/stores/podstore.go
index 723eeb3e7c95..e907b1b6cc96 100644
--- a/receiver/awscontainerinsightreceiver/internal/stores/podstore.go
+++ b/receiver/awscontainerinsightreceiver/internal/stores/podstore.go
@@ -518,10 +518,9 @@ func (p *PodStore) addStatus(metric CIMetric, pod *corev1.Pod) {
for _, containerStatus := range pod.Status.ContainerStatuses {
if containerStatus.Name == containerName {
possibleStatuses := map[string]int{
- ci.StatusRunning: 0,
- ci.StatusWaiting: 0,
- ci.StatusWaitingReasonCrashed: 0,
- ci.StatusTerminated: 0,
+ ci.StatusRunning: 0,
+ ci.StatusWaiting: 0,
+ ci.StatusTerminated: 0,
}
switch {
case containerStatus.State.Running != nil:
@@ -530,10 +529,11 @@ func (p *PodStore) addStatus(metric CIMetric, pod *corev1.Pod) {
case containerStatus.State.Waiting != nil:
metric.AddTag(ci.ContainerStatus, "Waiting")
possibleStatuses[ci.StatusWaiting] = 1
- if containerStatus.State.Waiting.Reason != "" {
- metric.AddTag(ci.ContainerStatusReason, containerStatus.State.Waiting.Reason)
- if strings.Contains(containerStatus.State.Waiting.Reason, "Crash") {
- possibleStatuses[ci.StatusWaitingReasonCrashed] = 1
+ reason := containerStatus.State.Waiting.Reason
+ if reason != "" {
+ metric.AddTag(ci.ContainerStatusReason, reason)
+ if val, ok := ci.WaitingReasonLookup[reason]; ok {
+ possibleStatuses[val] = 1
}
}
case containerStatus.State.Terminated != nil:
@@ -546,6 +546,9 @@ func (p *PodStore) addStatus(metric CIMetric, pod *corev1.Pod) {
if containerStatus.LastTerminationState.Terminated != nil && containerStatus.LastTerminationState.Terminated.Reason != "" {
metric.AddTag(ci.ContainerLastTerminationReason, containerStatus.LastTerminationState.Terminated.Reason)
+ if strings.Contains(containerStatus.LastTerminationState.Terminated.Reason, "OOMKilled") {
+ possibleStatuses[ci.StatusTerminatedReasonOOMKilled] = 1
+ }
}
containerKey := createContainerKeyFromMetric(metric)
if containerKey != "" {
diff --git a/receiver/awscontainerinsightreceiver/internal/stores/podstore_test.go b/receiver/awscontainerinsightreceiver/internal/stores/podstore_test.go
index e71352252c03..982fca985cdb 100644
--- a/receiver/awscontainerinsightreceiver/internal/stores/podstore_test.go
+++ b/receiver/awscontainerinsightreceiver/internal/stores/podstore_test.go
@@ -15,6 +15,7 @@ import (
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"go.uber.org/zap"
+ "golang.org/x/exp/maps"
corev1 "k8s.io/api/core/v1"
ci "github.com/open-telemetry/opentelemetry-collector-contrib/internal/aws/containerinsight"
@@ -357,9 +358,11 @@ func getPodStore() *PodStore {
}
func generateMetric(fields map[string]interface{}, tags map[string]string) CIMetric {
+ // Hand the mock its own copies: the tests reuse the same fields/tags maps
+ // across calls, so values written into one metric must not reach the next.
return &mockCIMetric{
- tags: tags,
- fields: fields,
+ tags: maps.Clone(tags),
+ fields: maps.Clone(fields),
}
}
@@ -672,8 +676,10 @@ func TestPodStore_addStatus_enhanced_metrics(t *testing.T) {
assert.Equal(t, "OOMKilled", metric.GetTag(ci.ContainerLastTerminationReason))
assert.Equal(t, int(1), metric.GetField(ci.ContainerRestartCount).(int))
assert.Equal(t, 1, metric.GetField(ci.MetricName(ci.TypeContainer, ci.StatusTerminated)))
+ assert.Equal(t, 1, metric.GetField(ci.MetricName(ci.TypeContainer, ci.StatusTerminatedReasonOOMKilled)))
pod.Status.ContainerStatuses[0].State.Terminated = nil
+ pod.Status.ContainerStatuses[0].LastTerminationState.Terminated = nil
pod.Status.ContainerStatuses[0].State.Waiting = &corev1.ContainerStateWaiting{Reason: "CrashLoopBackOff"}
tags = map[string]string{ci.MetricType: ci.TypeContainer, ci.K8sNamespace: "default", ci.K8sPodNameKey: "cpu-limit", ci.ContainerNamekey: "ubuntu"}
@@ -682,9 +687,15 @@ func TestPodStore_addStatus_enhanced_metrics(t *testing.T) {
podStore.addStatus(metric, pod)
assert.Equal(t, "Waiting", metric.GetTag(ci.ContainerStatus))
assert.Equal(t, 1, metric.GetField(ci.MetricName(ci.TypeContainer, ci.StatusWaiting)))
- assert.Equal(t, 1, metric.GetField(ci.MetricName(ci.TypeContainer, ci.StatusWaitingReasonCrashed)))
+ assert.Equal(t, 1, metric.GetField(ci.MetricName(ci.TypeContainer, ci.StatusWaitingReasonCrashLoopBackOff)))
+ // sparse metrics
+ assert.Nil(t, metric.GetField(ci.MetricName(ci.TypeContainer, ci.StatusWaitingReasonImagePullError)))
+ assert.Nil(t, metric.GetField(ci.MetricName(ci.TypeContainer, ci.StatusTerminatedReasonOOMKilled)))
+ assert.Nil(t, metric.GetField(ci.MetricName(ci.TypeContainer, ci.StatusWaitingReasonStartError)))
+ assert.Nil(t, metric.GetField(ci.MetricName(ci.TypeContainer, ci.StatusWaitingReasonCreateContainerError)))
+ assert.Nil(t, metric.GetField(ci.MetricName(ci.TypeContainer, ci.StatusWaitingReasonCreateContainerConfigError)))
- pod.Status.ContainerStatuses[0].State.Waiting = &corev1.ContainerStateWaiting{Reason: "SomeOtherReason"}
+ pod.Status.ContainerStatuses[0].State.Waiting = &corev1.ContainerStateWaiting{Reason: "ImagePullBackOff"}
tags = map[string]string{ci.MetricType: ci.TypeContainer, ci.K8sNamespace: "default", ci.K8sPodNameKey: "cpu-limit", ci.ContainerNamekey: "ubuntu"}
metric = generateMetric(fields, tags)
@@ -692,7 +703,32 @@ func TestPodStore_addStatus_enhanced_metrics(t *testing.T) {
podStore.addStatus(metric, pod)
assert.Equal(t, "Waiting", metric.GetTag(ci.ContainerStatus))
assert.Equal(t, 1, metric.GetField(ci.MetricName(ci.TypeContainer, ci.StatusWaiting)))
- assert.Equal(t, 0, metric.GetField(ci.MetricName(ci.TypeContainer, ci.StatusWaitingReasonCrashed)))
+ assert.Equal(t, 1, metric.GetField(ci.MetricName(ci.TypeContainer, ci.StatusWaitingReasonImagePullError)))
+
+ pod.Status.ContainerStatuses[0].State.Waiting = &corev1.ContainerStateWaiting{Reason: "ErrImagePull"}
+ metric = generateMetric(fields, tags)
+ podStore.addStatus(metric, pod)
+ assert.Equal(t, 1, metric.GetField(ci.MetricName(ci.TypeContainer, ci.StatusWaitingReasonImagePullError)))
+
+ pod.Status.ContainerStatuses[0].State.Waiting = &corev1.ContainerStateWaiting{Reason: "InvalidImageName"}
+ metric = generateMetric(fields, tags)
+ podStore.addStatus(metric, pod)
+ assert.Equal(t, 1, metric.GetField(ci.MetricName(ci.TypeContainer, ci.StatusWaitingReasonImagePullError)))
+
+ pod.Status.ContainerStatuses[0].State.Waiting = &corev1.ContainerStateWaiting{Reason: "CreateContainerError"}
+ metric = generateMetric(fields, tags)
+ podStore.addStatus(metric, pod)
+ assert.Equal(t, 1, metric.GetField(ci.MetricName(ci.TypeContainer, ci.StatusWaitingReasonCreateContainerError)))
+
+ pod.Status.ContainerStatuses[0].State.Waiting = &corev1.ContainerStateWaiting{Reason: "CreateContainerConfigError"}
+ metric = generateMetric(fields, tags)
+ podStore.addStatus(metric, pod)
+ assert.Equal(t, 1, metric.GetField(ci.MetricName(ci.TypeContainer, ci.StatusWaitingReasonCreateContainerConfigError)))
+
+ pod.Status.ContainerStatuses[0].State.Waiting = &corev1.ContainerStateWaiting{Reason: "StartError"}
+ metric = generateMetric(fields, tags)
+ podStore.addStatus(metric, pod)
+ assert.Equal(t, 1, metric.GetField(ci.MetricName(ci.TypeContainer, ci.StatusWaitingReasonStartError)))
// test delta of restartCount
pod.Status.ContainerStatuses[0].RestartCount = 3
@@ -762,7 +798,7 @@ func TestPodStore_addStatus_without_enhanced_metrics(t *testing.T) {
podStore.addStatus(metric, pod)
assert.Equal(t, "Waiting", metric.GetTag(ci.ContainerStatus))
assert.False(t, metric.HasField(ci.MetricName(ci.TypeContainer, ci.StatusWaiting)))
- assert.False(t, metric.HasField(ci.MetricName(ci.TypeContainer, ci.StatusWaitingReasonCrashed)))
+ assert.False(t, metric.HasField(ci.MetricName(ci.TypeContainer, ci.StatusWaitingReasonCrashLoopBackOff)))
pod.Status.ContainerStatuses[0].State.Waiting = &corev1.ContainerStateWaiting{Reason: "SomeOtherReason"}
@@ -772,7 +808,7 @@ func TestPodStore_addStatus_without_enhanced_metrics(t *testing.T) {
podStore.addStatus(metric, pod)
assert.Equal(t, "Waiting", metric.GetTag(ci.ContainerStatus))
assert.False(t, metric.HasField(ci.MetricName(ci.TypeContainer, ci.StatusWaiting)))
- assert.False(t, metric.HasField(ci.MetricName(ci.TypeContainer, ci.StatusWaitingReasonCrashed)))
+ assert.False(t, metric.HasField(ci.MetricName(ci.TypeContainer, ci.StatusWaitingReasonCrashLoopBackOff)))
// test delta of restartCount
pod.Status.ContainerStatuses[0].RestartCount = 3