Skip to content

Commit

Permalink
tweaks for enhanced container insights metrics (amazon-contributing#96)
Browse files Browse the repository at this point in the history
* tweaks for enhanced container insights metrics

* run go mod tidy
  • Loading branch information
chadpatel authored and lisguo committed Oct 20, 2023
1 parent 6dbca90 commit 06e7538
Show file tree
Hide file tree
Showing 6 changed files with 167 additions and 87 deletions.
93 changes: 56 additions & 37 deletions internal/aws/containerinsight/const.go
Original file line number Diff line number Diff line change
Expand Up @@ -76,33 +76,38 @@ const (
FSInodesfree = "filesystem_inodes_free"
FSUtilization = "filesystem_utilization"

StatusConditionReady = "status_condition_ready"
StatusConditionDiskPressure = "status_condition_disk_pressure"
StatusConditionMemoryPressure = "status_condition_memory_pressure"
StatusConditionPIDPressure = "status_condition_pid_pressure"
StatusConditionNetworkUnavailable = "status_condition_network_unavailable"
StatusConditionUnknown = "status_condition_unknown"
StatusCapacityPods = "status_capacity_pods"
StatusAllocatablePods = "status_allocatable_pods"
StatusNumberAvailable = "status_number_available"
StatusNumberUnavailable = "status_number_unavailable"
StatusDesiredNumberScheduled = "status_desired_number_scheduled"
StatusCurrentNumberScheduled = "status_current_number_scheduled"
StatusReplicasAvailable = "status_replicas_available"
StatusReplicasUnavailable = "status_replicas_unavailable"
SpecReplicas = "spec_replicas"
StatusRunning = "status_running"
StatusTerminated = "status_terminated"
StatusWaiting = "status_waiting"
StatusWaitingReasonCrashed = "status_waiting_reason_crashed"
StatusPending = "status_pending"
StatusSucceeded = "status_succeeded"
StatusFailed = "status_failed"
StatusUnknown = "status_unknown"
StatusReady = "status_ready"
StatusScheduled = "status_scheduled"
ReplicasDesired = "replicas_desired"
ReplicasReady = "replicas_ready"
StatusConditionReady = "status_condition_ready"
StatusConditionDiskPressure = "status_condition_disk_pressure"
StatusConditionMemoryPressure = "status_condition_memory_pressure"
StatusConditionPIDPressure = "status_condition_pid_pressure"
StatusConditionNetworkUnavailable = "status_condition_network_unavailable"
StatusConditionUnknown = "status_condition_unknown"
StatusCapacityPods = "status_capacity_pods"
StatusAllocatablePods = "status_allocatable_pods"
StatusNumberAvailable = "status_number_available"
StatusNumberUnavailable = "status_number_unavailable"
StatusDesiredNumberScheduled = "status_desired_number_scheduled"
StatusCurrentNumberScheduled = "status_current_number_scheduled"
StatusReplicasAvailable = "status_replicas_available"
StatusReplicasUnavailable = "status_replicas_unavailable"
SpecReplicas = "spec_replicas"
StatusRunning = "status_running"
StatusTerminated = "status_terminated"
StatusWaiting = "status_waiting"
StatusWaitingReasonCrashLoopBackOff = "status_waiting_reason_crash_loop_back_off"
StatusWaitingReasonImagePullError = "status_waiting_reason_image_pull_error"
StatusWaitingReasonStartError = "status_waiting_reason_start_error"
StatusWaitingReasonCreateContainerError = "status_waiting_reason_create_container_error"
StatusWaitingReasonCreateContainerConfigError = "status_waiting_reason_create_container_config_error"
StatusTerminatedReasonOOMKilled = "status_terminated_reason_oom_killed"
StatusPending = "status_pending"
StatusSucceeded = "status_succeeded"
StatusFailed = "status_failed"
StatusUnknown = "status_unknown"
StatusReady = "status_ready"
StatusScheduled = "status_scheduled"
ReplicasDesired = "replicas_desired"
ReplicasReady = "replicas_ready"

RunningPodCount = "number_of_running_pods"
RunningContainerCount = "number_of_running_containers"
Expand Down Expand Up @@ -157,6 +162,16 @@ const (
UnitPercent = "Percent"
)

// WaitingReasonLookup translates the reason string reported for a container in
// the Waiting state into the name of the status metric that counts it. Reasons
// that are absent from this map have no dedicated waiting-reason metric.
var WaitingReasonLookup = map[string]string{
	// Crash loops.
	"CrashLoopBackOff": StatusWaitingReasonCrashLoopBackOff,

	// Container creation and start failures.
	"CreateContainerConfigError": StatusWaitingReasonCreateContainerConfigError,
	"CreateContainerError":       StatusWaitingReasonCreateContainerError,
	"StartError":                 StatusWaitingReasonStartError,

	// Every image-related reason is folded into a single image-pull-error metric.
	"ErrImagePull":     StatusWaitingReasonImagePullError,
	"ImagePullBackOff": StatusWaitingReasonImagePullError,
	"InvalidImageName": StatusWaitingReasonImagePullError,
}

// metricToUnitMap is a string-to-string table keyed by metric name whose value
// is the unit string for that metric (for example UnitCount or UnitPercent).
// NOTE(review): presumably populated by init() — the assignment is not visible
// in this view; confirm.
var metricToUnitMap map[string]string

func init() {
Expand Down Expand Up @@ -246,16 +261,20 @@ func init() {
ReplicasReady: UnitCount,

// kube-state-metrics equivalents
StatusRunning: UnitCount,
StatusTerminated: UnitCount,
StatusWaiting: UnitCount,
StatusWaitingReasonCrashed: UnitCount,
StatusFailed: UnitCount,
StatusPending: UnitCount,
StatusSucceeded: UnitCount,
StatusUnknown: UnitCount,
StatusReady: UnitCount,
StatusScheduled: UnitCount,
StatusRunning: UnitCount,
StatusTerminated: UnitCount,
StatusWaiting: UnitCount,
StatusWaitingReasonCrashLoopBackOff: UnitCount,
StatusWaitingReasonImagePullError: UnitCount,
StatusWaitingReasonStartError: UnitCount,
StatusWaitingReasonCreateContainerConfigError: UnitCount,
StatusWaitingReasonCreateContainerError: UnitCount,
StatusFailed: UnitCount,
StatusPending: UnitCount,
StatusSucceeded: UnitCount,
StatusUnknown: UnitCount,
StatusReady: UnitCount,
StatusScheduled: UnitCount,

// cluster metrics
NodeCount: UnitCount,
Expand Down
80 changes: 47 additions & 33 deletions receiver/awscontainerinsightreceiver/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -372,14 +372,23 @@ kubectl apply -f config.yaml
### Cluster ControlPlane
| Metric | Unit |
|-----------------------------------------------------------|---------|
| apiserver_admission_controller_admission_duration_seconds | Seconds |
| apiserver_admission_controller_admission_duration_seconds | Seconds |
| apiserver_admission_step_admission_duration_seconds | Seconds |
| apiserver_admission_webhook_admission_duration_seconds | Seconds |
| apiserver_current_inflight_requests | Count |
| apiserver_current_inqueue_requests | Count |
| apiserver_flowcontrol_rejected_requests_total | Count |
| apiserver_flowcontrol_request_concurrency_limit | Count |
| apiserver_longrunning_requests | Count |
| apiserver_request_duration_seconds | Seconds |
| apiserver_request_total | Count |
| apiserver_request_total_5xx | Count |
| apiserver_requested_deprecated_apis | Count |
| apiserver_storage_list_duration_seconds | Seconds |
| apiserver_storage_objects | Count |
| etcd_db_total_size_in_bytes | Count |
| apiserver_storage_db_total_size_in_bytes | Bytes |
| apiserver_storage_size_bytes | Bytes |
| etcd_db_total_size_in_bytes | Bytes |
| etcd_request_duration_seconds | Seconds |
| rest_client_request_duration_seconds | Seconds |
| rest_client_requests_total | Count |
Expand Down Expand Up @@ -780,37 +789,42 @@ kubectl apply -f config.yaml


### Container
| Metric | Unit |
|---------------------------------------------------|--------------|
| container_cpu_limit | Millicore |
| container_cpu_request | Millicore |
| container_cpu_usage_system | Millicore |
| container_cpu_usage_total | Millicore |
| container_cpu_usage_user | Millicore |
| container_cpu_utilization | Percent |
| container_cpu_utilization_over_container_limit | Percent |
| container_memory_cache | Bytes |
| container_memory_failcnt | Count |
| container_memory_hierarchical_pgfault | Count/Second |
| container_memory_hierarchical_pgmajfault | Count/Second |
| container_memory_limit | Bytes |
| container_memory_mapped_file | Bytes |
| container_memory_max_usage | Bytes |
| container_memory_pgfault | Count/Second |
| container_memory_pgmajfault | Count/Second |
| container_memory_failures_total | Count/Second |
| container_memory_request | Bytes |
| container_memory_rss | Bytes |
| container_memory_swap | Bytes |
| container_memory_usage | Bytes |
| container_memory_utilization | Percent |
| container_memory_utilization_over_container_limit | Percent |
| container_memory_working_set | Bytes |
| number_of_container_restarts | Count |
| container_status_running | Count |
| container_status_terminated | Count |
| container_status_waiting | Count |
| container_status_waiting_reason_crashed | Count |
| Metric | Unit |
|---------------------------------------------------------------|--------------|
| container_cpu_limit | Millicore |
| container_cpu_request | Millicore |
| container_cpu_usage_system | Millicore |
| container_cpu_usage_total | Millicore |
| container_cpu_usage_user | Millicore |
| container_cpu_utilization | Percent |
| container_cpu_utilization_over_container_limit | Percent |
| container_memory_cache | Bytes |
| container_memory_failcnt | Count |
| container_memory_hierarchical_pgfault | Count/Second |
| container_memory_hierarchical_pgmajfault | Count/Second |
| container_memory_limit | Bytes |
| container_memory_mapped_file | Bytes |
| container_memory_max_usage | Bytes |
| container_memory_pgfault | Count/Second |
| container_memory_pgmajfault | Count/Second |
| container_memory_failures_total | Count/Second |
| container_memory_request | Bytes |
| container_memory_rss | Bytes |
| container_memory_swap | Bytes |
| container_memory_usage | Bytes |
| container_memory_utilization | Percent |
| container_memory_utilization_over_container_limit | Percent |
| container_memory_working_set | Bytes |
| number_of_container_restarts | Count |
| container_status_running | Count |
| container_status_terminated | Count |
| container_status_waiting | Count |
| container_status_waiting_reason_crash_loop_back_off | Count |
| container_status_waiting_reason_image_pull_error | Count |
| container_status_waiting_reason_start_error | Count |
| container_status_waiting_reason_create_container_error | Count |
| container_status_waiting_reason_create_container_config_error | Count |
| container_status_terminated_reason_oom_killed | Count |

<br/><br/>

Expand Down
2 changes: 1 addition & 1 deletion receiver/awscontainerinsightreceiver/go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ require (
go.opentelemetry.io/collector/pdata v1.0.0-rcv0014.0.20230908201109-ab3d6c5b6470
go.opentelemetry.io/collector/receiver v0.84.1-0.20230908201109-ab3d6c5b6470
go.uber.org/zap v1.25.0
golang.org/x/exp v0.0.0-20230713183714-613f0c0eb8a1
gopkg.in/yaml.v2 v2.4.0
k8s.io/api v0.28.1
k8s.io/apimachinery v0.28.1
Expand Down Expand Up @@ -203,7 +204,6 @@ require (
go.uber.org/atomic v1.11.0 // indirect
go.uber.org/multierr v1.11.0 // indirect
golang.org/x/crypto v0.13.0 // indirect
golang.org/x/exp v0.0.0-20230713183714-613f0c0eb8a1 // indirect
golang.org/x/mod v0.12.0 // indirect
golang.org/x/net v0.15.0 // indirect
golang.org/x/oauth2 v0.11.0 // indirect
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,12 +33,20 @@ const (
var (
controlPlaneMetricAllowList = []string{
"apiserver_admission_controller_admission_duration_seconds.*",
"apiserver_admission_step_admission_duration_seconds.*",
"apiserver_admission_webhook_admission_duration_seconds.*",
"apiserver_current_inflight_requests",
"apiserver_current_inqueue_requests",
"apiserver_flowcontrol_rejected_requests_total",
"apiserver_flowcontrol_request_concurrency_limit",
"apiserver_longrunning_requests",
"apiserver_request_duration_seconds.*",
"apiserver_request_total",
"apiserver_requested_deprecated_apis",
"apiserver_storage_list_duration_seconds.*",
"apiserver_storage_objects",
"apiserver_storage_db_total_size_in_bytes.*",
"apiserver_storage_size_bytes.*",
"etcd_db_total_size_in_bytes.*",
"etcd_request_duration_seconds.*",
"rest_client_request_duration_seconds.*",
Expand Down
19 changes: 11 additions & 8 deletions receiver/awscontainerinsightreceiver/internal/stores/podstore.go
Original file line number Diff line number Diff line change
Expand Up @@ -518,10 +518,9 @@ func (p *PodStore) addStatus(metric CIMetric, pod *corev1.Pod) {
for _, containerStatus := range pod.Status.ContainerStatuses {
if containerStatus.Name == containerName {
possibleStatuses := map[string]int{
ci.StatusRunning: 0,
ci.StatusWaiting: 0,
ci.StatusWaitingReasonCrashed: 0,
ci.StatusTerminated: 0,
ci.StatusRunning: 0,
ci.StatusWaiting: 0,
ci.StatusTerminated: 0,
}
switch {
case containerStatus.State.Running != nil:
Expand All @@ -530,10 +529,11 @@ func (p *PodStore) addStatus(metric CIMetric, pod *corev1.Pod) {
case containerStatus.State.Waiting != nil:
metric.AddTag(ci.ContainerStatus, "Waiting")
possibleStatuses[ci.StatusWaiting] = 1
if containerStatus.State.Waiting.Reason != "" {
metric.AddTag(ci.ContainerStatusReason, containerStatus.State.Waiting.Reason)
if strings.Contains(containerStatus.State.Waiting.Reason, "Crash") {
possibleStatuses[ci.StatusWaitingReasonCrashed] = 1
reason := containerStatus.State.Waiting.Reason
if reason != "" {
metric.AddTag(ci.ContainerStatusReason, reason)
if val, ok := ci.WaitingReasonLookup[reason]; ok {
possibleStatuses[val] = 1
}
}
case containerStatus.State.Terminated != nil:
Expand All @@ -546,6 +546,9 @@ func (p *PodStore) addStatus(metric CIMetric, pod *corev1.Pod) {

if containerStatus.LastTerminationState.Terminated != nil && containerStatus.LastTerminationState.Terminated.Reason != "" {
metric.AddTag(ci.ContainerLastTerminationReason, containerStatus.LastTerminationState.Terminated.Reason)
if strings.Contains(containerStatus.LastTerminationState.Terminated.Reason, "OOMKilled") {
possibleStatuses[ci.StatusTerminatedReasonOOMKilled] = 1
}
}
containerKey := createContainerKeyFromMetric(metric)
if containerKey != "" {
Expand Down
Loading

0 comments on commit 06e7538

Please sign in to comment.