Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[kubernetes_state] - Collect job metrics #686

Merged
merged 4 commits into from
Aug 21, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions kubernetes_state/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

* [FEATURE] Support for StatefulSet metrics. See [#561][]
* [FEATURE] Support tag renaming via the labels_mapper option. See [#651][]
* [FEATURE] Add basic Job metrics. See [#686][]

1.2.0 / 2017-07-18
==================
Expand Down
60 changes: 56 additions & 4 deletions kubernetes_state/check.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,13 @@
# All rights reserved
# Licensed under Simplified BSD License (see LICENSE)

import re
from collections import defaultdict

from checks import CheckException
from checks.prometheus_check import PrometheusCheck


METRIC_TYPES = ['counter', 'gauge']


Expand Down Expand Up @@ -113,9 +117,7 @@ def __init__(self, name, init_config, agentConfig, instances=None):
'kube_job_spec_parallelism',
'kube_job_status_active',
'kube_job_status_completion_time', # We could compute the duration=completion-start as a gauge
'kube_job_status_failed', # Container number gauge, redundant with job-global kube_job_failed
'kube_job_status_start_time',
'kube_job_status_succeeded', # Container number gauge, redundant with job-global kube_job_complete

]

Expand All @@ -137,8 +139,18 @@ def check(self, instance):
else:
send_buckets = True

# Job counters are monotonic: they increase at every run of the job
# We want to send the delta via the `monotonic_count` method
self.job_succeeded_count = defaultdict(int)
self.job_failed_count = defaultdict(int)

self.process(endpoint, send_histograms_buckets=send_buckets, instance=instance)

for job_tags, job_count in self.job_succeeded_count.iteritems():
self.monotonic_count(self.NAMESPACE + '.job.succeeded', job_count, list(job_tags))
for job_tags, job_count in self.job_failed_count.iteritems():
self.monotonic_count(self.NAMESPACE + '.job.failed', job_count, list(job_tags))

def _condition_to_service_check(self, metric, sc_name, mapping, tags=None):
"""
Some metrics contain conditions, labels that have "condition" as name and "true", "false", or "unknown"
Expand Down Expand Up @@ -194,6 +206,13 @@ def _label_to_tag(self, name, labels, tag_name=None):
else:
return None

def _trim_job_tag(self, name):
    r"""
    Strip a trailing numeric suffix from a job name.

    A final ``-`` followed by 4 to 10 digits at the very end of ``name``
    (regex ``-\d{4,10}$``) is removed; any other name is returned unchanged.

    :param name: the job name, as read from the ``job`` label value
    :return: ``name`` without the trailing ``-<digits>`` suffix, if it had one
    """
    # Raw string: `\d` must reach the regex engine verbatim. In a plain
    # string literal it is an invalid escape sequence that only works by
    # accident and triggers a deprecation warning on newer interpreters.
    pattern = r"(-\d{4,10}$)"
    return re.sub(pattern, '', name)

# Labels attached: namespace, pod, phase=Pending|Running|Succeeded|Failed|Unknown
# The phase is not passed through; rather, it becomes the service check suffix.
def kube_pod_status_phase(self, message, **kwargs):
Expand All @@ -217,17 +236,50 @@ def kube_job_complete(self, message, **kwargs):
for metric in message.metric:
tags = []
for label in metric.label:
tags.append(self._format_tag(label.name, label.value))
if label.name == 'job':
trimmed_job = self._trim_job_tag(label.value)
tags.append(self._format_tag(label.name, trimmed_job))
else:
tags.append(self._format_tag(label.name, label.value))
self.service_check(service_check_name, self.OK, tags=tags)

def kube_job_failed(self, message, **kwargs):
    """
    Emit a CRITICAL ``<NAMESPACE>.job.complete`` service check for every
    sample carried by ``message``.

    Each label of a sample becomes one tag via ``_format_tag``; the value of
    the ``job`` label is first passed through ``_trim_job_tag``.
    """
    check_name = self.NAMESPACE + '.job.complete'
    for sample in message.metric:
        sample_tags = [
            self._format_tag(
                lbl.name,
                # Only the `job` label is trimmed; other labels pass through as-is.
                self._trim_job_tag(lbl.value) if lbl.name == 'job' else lbl.value
            )
            for lbl in sample.label
        ]
        self.service_check(check_name, self.CRITICAL, tags=sample_tags)

def kube_job_status_failed(self, message, **kwargs):
    """
    Accumulate the gauge value of every sample in ``message`` into
    ``self.job_failed_count``, keyed by the sample's tag set.

    Each label becomes one tag via ``_format_tag``; the value of the ``job``
    label is first passed through ``_trim_job_tag``. Samples whose tags are
    identical after trimming are therefore summed under the same key.

    Nothing is submitted here: this method only fills the accumulator.
    """
    for metric in message.metric:
        tags = []
        for label in metric.label:
            # Only the `job` label is trimmed; other labels pass through as-is.
            value = self._trim_job_tag(label.value) if label.name == 'job' else label.value
            tags.append(self._format_tag(label.name, value))
        # frozenset: hashable, so the tag set can be used as a dict key.
        self.job_failed_count[frozenset(tags)] += metric.gauge.value


def kube_job_status_succeeded(self, message, **kwargs):
    """
    Accumulate the gauge value of every sample in ``message`` into
    ``self.job_succeeded_count``, keyed by the sample's tag set.

    Each label becomes one tag via ``_format_tag``; the value of the ``job``
    label is first passed through ``_trim_job_tag``. Samples whose tags are
    identical after trimming are therefore summed under the same key.

    Nothing is submitted here: this method only fills the accumulator.
    """
    for metric in message.metric:
        tags = []
        for label in metric.label:
            # Only the `job` label is trimmed; other labels pass through as-is.
            value = self._trim_job_tag(label.value) if label.name == 'job' else label.value
            tags.append(self._format_tag(label.name, value))
        # frozenset: hashable, so the tag set can be used as a dict key.
        self.job_succeeded_count[frozenset(tags)] += metric.gauge.value

def kube_node_status_ready(self, message, **kwargs):
""" The ready status of a cluster node. """
service_check_name = self.NAMESPACE + '.node.ready'
Expand Down
62 changes: 32 additions & 30 deletions kubernetes_state/metadata.csv
Original file line number Diff line number Diff line change
@@ -1,24 +1,4 @@
metric_name,metric_type,interval,unit_name,per_unit_name,description,orientation,integration,short_name
kubernetes_state.node.cpu_capacity,gauge,,cpu,,The total CPU resources of the node,0,kubernetes,k8s_state.node.cpu_capacity
kubernetes_state.node.memory_capacity,gauge,,byte,,The total memory resources of the node,0,kubernetes,k8s_state.node.memory_capacity
kubernetes_state.node.pods_capacity,gauge,,,,The total pod resources of the node,0,kubernetes,k8s_state.node.pods_capacity
kubernetes_state.node.cpu_allocatable,gauge,,cpu,,The CPU resources of a node that are available for scheduling,0,kubernetes,k8s_state.node.cpu_allocatable
kubernetes_state.node.memory_allocatable,gauge,,byte,,The memory resources of a node that are available for scheduling,0,kubernetes,k8s_state.node.memory_allocatable
kubernetes_state.node.pods_allocatable,gauge,,,,The pod resources of a node that are available for scheduling,0,kubernetes,k8s_state.node.pods_allocatable
kubernetes_state.node.status,gauge,,,,Submitted with a value of 1 for each node and tagged either 'status:schedulable' or 'status:unschedulable'; Sum this metric by either status to get the number of nodes in that status.,0,kubernetes,k8s_state.node.status
kubernetes_state.deployment.replicas,gauge,,,,The number of replicas per deployment,0,kubernetes,k8s_state.deployment.replicas
kubernetes_state.deployment.replicas_available,gauge,,,,The number of available replicas per deployment,0,kubernetes,k8s_state.deployment.replicas_available
kubernetes_state.deployment.replicas_unavailable,gauge,,,,The number of unavailable replicas per deployment,0,kubernetes,k8s_state.deployment.replicas_unavailable
kubernetes_state.deployment.replicas_updated,gauge,,,,The number of updated replicas per deployment,0,kubernetes,k8s_state.deployment.replicas_updated
kubernetes_state.deployment.replicas_desired,gauge,,,,The number of desired replicas per deployment,0,kubernetes,k8s_state.deployment.replicas_desired
kubernetes_state.deployment.paused,gauge,,,,Whether a deployment is paused,0,kubernetes,k8s_state.deployment.paused
kubernetes_state.deployment.rollingupdate.max_unavailable,gauge,,,,Maximum number of unavailable replicas during a rolling update,0,kubernetes,k8s_state.deployment.rollupdate.max_unavail
kubernetes_state.daemonset.scheduled,gauge,,,,The number of nodes running at least one daemon pod and that are supposed to,0,kubernetes,k8s_state.ds.scheduled
kubernetes_state.daemonset.misscheduled,gauge,,,,The number of nodes running a daemon pod but are not supposed to,-1,kubernetes,k8s_state.ds.misscheduled
kubernetes_state.daemonset.desired,gauge,,,,The number of nodes that should be running the daemon pod,0,kubernetes,k8s_state.ds.desired
kubernetes_state.daemonset.ready,gauge,,,,The number of nodes that should be running the daemon pod and have one or more running and ready,0,kubernetes,k8s_state.ds.ready
kubernetes_state.pod.ready,gauge,,,,Whether the pod is ready to serve requests,1,kubernetes,k8s_state.pod.ready
kubernetes_state.pod.scheduled,gauge,,,,Reports the status of the scheduling process for the pod with its tags,0,kubernetes,k8s_state.pod.scheduled
kubernetes_state.container.ready,gauge,,,,Whether the containers readiness check succeeded,0,kubernetes,k8s_state.container.rdy
kubernetes_state.container.running,gauge,,,,Whether the container is currently in running state,0,kubernetes,k8s_state.container.running
kubernetes_state.container.terminated,gauge,,,,Whether the container is currently in terminated state,0,kubernetes,k8s_state.container.term
Expand All @@ -28,6 +8,38 @@ kubernetes_state.container.cpu_requested,gauge,,cpu,,The number of requested cpu
kubernetes_state.container.memory_requested,gauge,,byte,,The number of requested memory bytes by a container,0,kubernetes,k8s_state.container.mem_req
kubernetes_state.container.cpu_limit,gauge,,cpu,,The limit on cpu cores to be used by a container,0,kubernetes,k8s_state.container.cpu_limit
kubernetes_state.container.memory_limit,gauge,,byte,,The limit on memory to be used by a container,0,kubernetes,k8s_state.container.mem_limit
kubernetes_state.daemonset.scheduled,gauge,,,,The number of nodes running at least one daemon pod and that are supposed to,0,kubernetes,k8s_state.ds.scheduled
kubernetes_state.daemonset.misscheduled,gauge,,,,The number of nodes running a daemon pod but are not supposed to,-1,kubernetes,k8s_state.ds.misscheduled
kubernetes_state.daemonset.desired,gauge,,,,The number of nodes that should be running the daemon pod,0,kubernetes,k8s_state.ds.desired
kubernetes_state.daemonset.ready,gauge,,,,The number of nodes that should be running the daemon pod and have one or more running and ready,0,kubernetes,k8s_state.ds.ready
kubernetes_state.deployment.replicas,gauge,,,,The number of replicas per deployment,0,kubernetes,k8s_state.deployment.replicas
kubernetes_state.deployment.replicas_available,gauge,,,,The number of available replicas per deployment,0,kubernetes,k8s_state.deployment.replicas_available
kubernetes_state.deployment.replicas_unavailable,gauge,,,,The number of unavailable replicas per deployment,0,kubernetes,k8s_state.deployment.replicas_unavailable
kubernetes_state.deployment.replicas_updated,gauge,,,,The number of updated replicas per deployment,0,kubernetes,k8s_state.deployment.replicas_updated
kubernetes_state.deployment.replicas_desired,gauge,,,,The number of desired replicas per deployment,0,kubernetes,k8s_state.deployment.replicas_desired
kubernetes_state.deployment.paused,gauge,,,,Whether a deployment is paused,0,kubernetes,k8s_state.deployment.paused
kubernetes_state.deployment.rollingupdate.max_unavailable,gauge,,,,Maximum number of unavailable replicas during a rolling update,0,kubernetes,k8s_state.deployment.rollupdate.max_unavail
kubernetes_state.job.failed,counter,,,,Observed number of failed pods in a job,0,kubernetes,k8s_state.job.failed
kubernetes_state.job.succeeded,counter,,,,Observed number of succeeded pods in a job,0,kubernetes,k8s_state.job.succeeded
kubernetes_state.limitrange.cpu.min,gauge,,,,Minimum CPU request for this type,0,kubernetes,k8s_state.cpu.min
kubernetes_state.limitrange.cpu.max,gauge,,,,Maximum CPU limit for this type,0,kubernetes,k8s_state.cpu.max
kubernetes_state.limitrange.cpu.default,gauge,,,,Default CPU limit if not specified,0,kubernetes,k8s_state.cpu.default
kubernetes_state.limitrange.cpu.default_request,gauge,,,,Default CPU request if not specified,0,kubernetes,k8s_state.cpu.default_request
kubernetes_state.limitrange.cpu.max_limit_request_ratio,gauge,,,,Maximum CPU limit / request ratio,0,kubernetes,k8s_state.cpu.max_ratio
kubernetes_state.limitrange.memory.min,gauge,,,,Minimum memory request for this type,0,kubernetes,k8s_state.mem.min
kubernetes_state.limitrange.memory.max,gauge,,,,Maximum memory limit for this type,0,kubernetes,k8s_state.mem.max
kubernetes_state.limitrange.memory.default,gauge,,,,Default memory limit if not specified,0,kubernetes,k8s_state.mem.default
kubernetes_state.limitrange.memory.default_request,gauge,,,,Default memory request if not specified,0,kubernetes,k8s_state.mem.default_request
kubernetes_state.limitrange.memory.max_limit_request_ratio,gauge,,,,Maximum memory limit / request ratio,0,kubernetes,k8s_state.mem.max_ratio
kubernetes_state.node.cpu_capacity,gauge,,cpu,,The total CPU resources of the node,0,kubernetes,k8s_state.node.cpu_capacity
kubernetes_state.node.memory_capacity,gauge,,byte,,The total memory resources of the node,0,kubernetes,k8s_state.node.memory_capacity
kubernetes_state.node.pods_capacity,gauge,,,,The total pod resources of the node,0,kubernetes,k8s_state.node.pods_capacity
kubernetes_state.node.cpu_allocatable,gauge,,cpu,,The CPU resources of a node that are available for scheduling,0,kubernetes,k8s_state.node.cpu_allocatable
kubernetes_state.node.memory_allocatable,gauge,,byte,,The memory resources of a node that are available for scheduling,0,kubernetes,k8s_state.node.memory_allocatable
kubernetes_state.node.pods_allocatable,gauge,,,,The pod resources of a node that are available for scheduling,0,kubernetes,k8s_state.node.pods_allocatable
kubernetes_state.node.status,gauge,,,,Submitted with a value of 1 for each node and tagged either 'status:schedulable' or 'status:unschedulable'; Sum this metric by either status to get the number of nodes in that status.,0,kubernetes,k8s_state.node.status
kubernetes_state.pod.ready,gauge,,,,Whether the pod is ready to serve requests,1,kubernetes,k8s_state.pod.ready
kubernetes_state.pod.scheduled,gauge,,,,Reports the status of the scheduling process for the pod with its tags,0,kubernetes,k8s_state.pod.scheduled
kubernetes_state.replicaset.replicas,gauge,,,,The number of replicas per ReplicaSet,0,kubernetes,k8s_state.rs.replicas
kubernetes_state.replicaset.fully_labeled_replicas,gauge,,,,The number of fully labeled replicas per ReplicaSet,0,kubernetes,k8s_state.rs.fully_labeled
kubernetes_state.replicaset.replicas_ready,gauge,,,,The number of ready replicas per ReplicaSet,0,kubernetes,k8s_state.rs.replicas_rdy
Expand Down Expand Up @@ -57,15 +69,5 @@ kubernetes_state.resourcequota.requests.memory.limit,gauge,,byte,,Hard limit on
kubernetes_state.resourcequota.requests.storage.limit,gauge,,byte,,Hard limit on the total of storage bytes requested for a resource quota,0,kubernetes,k8s_state.resourcequota.requests.storage.limit
kubernetes_state.resourcequota.limits.cpu.limit,gauge,,cpu,,Hard limit on the sum of CPU core limits for a resource quota,0,kubernetes,k8s_state.resourcequota.limits.cpu.limit
kubernetes_state.resourcequota.limits.memory.limit,gauge,,byte,,Hard limit on the sum of memory bytes limits for a resource quota,0,kubernetes,k8s_state.resourcequota.limits.mem.limit
kubernetes_state.limitrange.cpu.min,gauge,,,,Minimum CPU request for this type,0,kubernetes,k8s_state.cpu.min
kubernetes_state.limitrange.cpu.max,gauge,,,,Maximum CPU limit for this type,0,kubernetes,k8s_state.cpu.max
kubernetes_state.limitrange.cpu.default,gauge,,,,Default CPU limit if not specified,0,kubernetes,k8s_state.cpu.default
kubernetes_state.limitrange.cpu.default_request,gauge,,,,Default CPU request if not specified,0,kubernetes,k8s_state.cpu.default_request
kubernetes_state.limitrange.cpu.max_limit_request_ratio,gauge,,,,Maximum CPU limit / request ratio,0,kubernetes,k8s_state.cpu.max_ratio
kubernetes_state.limitrange.memory.min,gauge,,,,Minimum memory request for this type,0,kubernetes,k8s_state.mem.min
kubernetes_state.limitrange.memory.max,gauge,,,,Maximum memory limit for this type,0,kubernetes,k8s_state.mem.max
kubernetes_state.limitrange.memory.default,gauge,,,,Default memory limit if not specified,0,kubernetes,k8s_state.mem.default
kubernetes_state.limitrange.memory.default_request,gauge,,,,Default memory request if not specified,0,kubernetes,k8s_state.mem.default_request
kubernetes_state.limitrange.memory.max_limit_request_ratio,gauge,,,,Maximum memory limit / request ratio,0,kubernetes,k8s_state.mem.max_ratio
kubernetes_state.statefulset.replicas,gauge,,,,The number of replicas per statefulset,0,kubernetes,k8s_state.statefulset.replicas
kubernetes_state.statefulset.replicas_desired,gauge,,,,The number of desired replicas per statefulset,0,kubernetes,k8s_state.statefulset.replicas_desired