From 4ac6d7fe669d35cb2703b9220c09c55d06efdcf0 Mon Sep 17 00:00:00 2001 From: Chris Stein Date: Thu, 17 Aug 2017 09:20:20 -0500 Subject: [PATCH 1/4] Collect job metrics --- kubernetes_state/CHANGELOG.md | 1 + kubernetes_state/check.py | 34 ++++++++++++++++--- kubernetes_state/metadata.csv | 62 ++++++++++++++++++----------------- 3 files changed, 63 insertions(+), 34 deletions(-) diff --git a/kubernetes_state/CHANGELOG.md b/kubernetes_state/CHANGELOG.md index a3b38c6be4b35..f9885088646e4 100644 --- a/kubernetes_state/CHANGELOG.md +++ b/kubernetes_state/CHANGELOG.md @@ -7,6 +7,7 @@ * [FEATURE] Support for StatefulSet metrics. See [#561][] * [FEATURE] Support tag renaming via the labels_mapper option. See [#651][] +* [FEATURE] Add basic Job metrics. See [#6..][] 1.2.0 / 2017-07-18 ================== diff --git a/kubernetes_state/check.py b/kubernetes_state/check.py index 6173a289df5f6..3882c11be89fc 100644 --- a/kubernetes_state/check.py +++ b/kubernetes_state/check.py @@ -4,6 +4,7 @@ from checks import CheckException from checks.prometheus_check import PrometheusCheck +import re METRIC_TYPES = ['counter', 'gauge'] @@ -113,9 +114,7 @@ def __init__(self, name, init_config, agentConfig, instances=None): 'kube_job_spec_parallelism', 'kube_job_status_active', 'kube_job_status_completion_time', # We could compute the duration=completion-start as a gauge - 'kube_job_status_failed', # Container number gauge, redundant with job-global kube_job_failed 'kube_job_status_start_time', - 'kube_job_status_succeeded', # Container number gauge, redundant with job-global kube_job_complete ] @@ -194,6 +193,13 @@ def _label_to_tag(self, name, labels, tag_name=None): else: return None + def _trim_job_tag(self, name): + """ + Trims suffix of job names if they match -(\d{4,10}$) + """ + pattern = "(-\d{4,10}$)" + return re.sub(pattern, '', name) + # Labels attached: namespace, pod, phase=Pending|Running|Succeeded|Failed|Unknown # The phase gets not passed through; rather, it becomes the 
service check suffix. def kube_pod_status_phase(self, message, **kwargs): @@ -217,7 +223,8 @@ def kube_job_complete(self, message, **kwargs): for metric in message.metric: tags = [] for label in metric.label: - tags.append(self._format_tag(label.name, label.value)) + trimmed_job = self._trim_job_tag(label.value) + tags.append(self._format_tag(label.name, trimmed_job)) self.service_check(service_check_name, self.OK, tags=tags) def kube_job_failed(self, message, **kwargs): @@ -225,9 +232,28 @@ def kube_job_failed(self, message, **kwargs): for metric in message.metric: tags = [] for label in metric.label: - tags.append(self._format_tag(label.name, label.value)) + trimmed_job = self._trim_job_tag(label.value) + tags.append(self._format_tag(label.name, trimmed_job)) self.service_check(service_check_name, self.CRITICAL, tags=tags) + def kube_job_status_failed(self, message, **kwargs): + metric_name = self.NAMESPACE + '.job.failed' + for metric in message.metric: + tags = [] + for label in metric.label: + trimmed_job = self._trim_job_tag(label.value) + tags.append(self._format_tag(label.name, trimmed_job)) + self.increment(metric_name, metric.gauge.value, tags=tags) + + def kube_job_status_succeeded(self, message, **kwargs): + metric_name = self.NAMESPACE + '.job.succeeded' + for metric in message.metric: + tags = [] + for label in metric.label: + trimmed_job = self._trim_job_tag(label.value) + tags.append(self._format_tag(label.name, trimmed_job)) + self.increment(metric_name, metric.gauge.value, tags=tags) + def kube_node_status_ready(self, message, **kwargs): """ The ready status of a cluster node. 
""" service_check_name = self.NAMESPACE + '.node.ready' diff --git a/kubernetes_state/metadata.csv b/kubernetes_state/metadata.csv index 9d93ac7bd384f..1016b76af133c 100644 --- a/kubernetes_state/metadata.csv +++ b/kubernetes_state/metadata.csv @@ -1,24 +1,4 @@ metric_name,metric_type,interval,unit_name,per_unit_name,description,orientation,integration,short_name -kubernetes_state.node.cpu_capacity,gauge,,cpu,,The total CPU resources of the node,0,kubernetes,k8s_state.node.cpu_capacity -kubernetes_state.node.memory_capacity,gauge,,byte,,The total memory resources of the node,0,kubernetes,k8s_state.node.memory_capacity -kubernetes_state.node.pods_capacity,gauge,,,,The total pod resources of the node,0,kubernetes,k8s_state.node.pods_capacity -kubernetes_state.node.cpu_allocatable,gauge,,cpu,,The CPU resources of a node that are available for scheduling,0,kubernetes,k8s_state.node.cpu_allocatable -kubernetes_state.node.memory_allocatable,gauge,,byte,,The memory resources of a node that are available for scheduling,0,kubernetes,k8s_state.node.memory_allocatable -kubernetes_state.node.pods_allocatable,gauge,,,,The pod resources of a node that are available for scheduling,0,kubernetes,k8s_state.node.pods_allocatable -kubernetes_state.node.status,gauge,,,,Submitted with a value of 1 for each node and tagged either 'status:schedulable' or 'status:unschedulable'; Sum this metric by either status to get the number of nodes in that status.,0,kubernetes,k8s_state.node.status -kubernetes_state.deployment.replicas,gauge,,,,The number of replicas per deployment,0,kubernetes,k8s_state.deployment.replicas -kubernetes_state.deployment.replicas_available,gauge,,,,The number of available replicas per deployment,0,kubernetes,k8s_state.deployment.replicas_available -kubernetes_state.deployment.replicas_unavailable,gauge,,,,The number of unavailable replicas per deployment,0,kubernetes,k8s_state.deployment.replicas_unavailable -kubernetes_state.deployment.replicas_updated,gauge,,,,The 
number of updated replicas per deployment,0,kubernetes,k8s_state.deployment.replicas_updated -kubernetes_state.deployment.replicas_desired,gauge,,,,The number of desired replicas per deployment,0,kubernetes,k8s_state.deployment.replicas_desired -kubernetes_state.deployment.paused,gauge,,,,Whether a deployment is paused,0,kubernetes,k8s_state.deployment.paused -kubernetes_state.deployment.rollingupdate.max_unavailable,gauge,,,,Maximum number of unavailable replicas during a rolling update,0,kubernetes,k8s_state.deployment.rollupdate.max_unavail -kubernetes_state.daemonset.scheduled,gauge,,,,The number of nodes running at least one daemon pod and that are supposed to,0,kubernetes,k8s_state.ds.scheduled -kubernetes_state.daemonset.misscheduled,gauge,,,,The number of nodes running a daemon pod but are not supposed to,-1,kubernetes,k8s_state.ds.misscheduled -kubernetes_state.daemonset.desired,gauge,,,,The number of nodes that should be running the daemon pod,0,kubernetes,k8s_state.ds.desired -kubernetes_state.daemonset.ready,gauge,,,,The number of nodes that should be running the daemon pod and have one or more running and ready,0,kubernetes,k8s_state.ds.ready -kubernetes_state.pod.ready,gauge,,,,Whether the pod is ready to serve requests,1,kubernetes,k8s_state.pod.ready -kubernetes_state.pod.scheduled,gauge,,,,Reports the status of the scheduling process for the pod with its tags,0,kubernetes,k8s_state.pod.scheduled kubernetes_state.container.ready,gauge,,,,Whether the containers readiness check succeeded,0,kubernetes,k8s_state.container.rdy kubernetes_state.container.running,gauge,,,,Whether the container is currently in running state,0,kubernetes,k8s_state.container.running kubernetes_state.container.terminated,gauge,,,,Whether the container is currently in terminated state,0,kubernetes,k8s_state.container.term @@ -28,6 +8,38 @@ kubernetes_state.container.cpu_requested,gauge,,cpu,,The number of requested cpu 
kubernetes_state.container.memory_requested,gauge,,byte,,The number of requested memory bytes by a container,0,kubernetes,k8s_state.container.mem_req kubernetes_state.container.cpu_limit,gauge,,cpu,,The limit on cpu cores to be used by a container,0,kubernetes,k8s_state.container.cpu_limit kubernetes_state.container.memory_limit,gauge,,byte,,The limit on memory to be used by a container,0,kubernetes,k8s_state.container.mem_limit +kubernetes_state.daemonset.scheduled,gauge,,,,The number of nodes running at least one daemon pod and that are supposed to,0,kubernetes,k8s_state.ds.scheduled +kubernetes_state.daemonset.misscheduled,gauge,,,,The number of nodes running a daemon pod but are not supposed to,-1,kubernetes,k8s_state.ds.misscheduled +kubernetes_state.daemonset.desired,gauge,,,,The number of nodes that should be running the daemon pod,0,kubernetes,k8s_state.ds.desired +kubernetes_state.daemonset.ready,gauge,,,,The number of nodes that should be running the daemon pod and have one or more running and ready,0,kubernetes,k8s_state.ds.ready +kubernetes_state.deployment.replicas,gauge,,,,The number of replicas per deployment,0,kubernetes,k8s_state.deployment.replicas +kubernetes_state.deployment.replicas_available,gauge,,,,The number of available replicas per deployment,0,kubernetes,k8s_state.deployment.replicas_available +kubernetes_state.deployment.replicas_unavailable,gauge,,,,The number of unavailable replicas per deployment,0,kubernetes,k8s_state.deployment.replicas_unavailable +kubernetes_state.deployment.replicas_updated,gauge,,,,The number of updated replicas per deployment,0,kubernetes,k8s_state.deployment.replicas_updated +kubernetes_state.deployment.replicas_desired,gauge,,,,The number of desired replicas per deployment,0,kubernetes,k8s_state.deployment.replicas_desired +kubernetes_state.deployment.paused,gauge,,,,Whether a deployment is paused,0,kubernetes,k8s_state.deployment.paused 
+kubernetes_state.deployment.rollingupdate.max_unavailable,gauge,,,,Maximum number of unavailable replicas during a rolling update,0,kubernetes,k8s_state.deployment.rollupdate.max_unavail +kubernetes_state.job.failed,counter,,,,Observed number of failed pods in a job,0,kubernetes,k8s_state.job.failed +kubernetes_state.job.succeeded,counter,,,,Observed number of succeeded pods in a job,0,kubernetes,k8s_state.job.succeeded +kubernetes_state.limitrange.cpu.min,gauge,,,,Minimum CPU request for this type,0,kubernetes,k8s_state.cpu.min +kubernetes_state.limitrange.cpu.max,gauge,,,,Maximum CPU limit for this type,0,kubernetes,k8s_state.cpu.max +kubernetes_state.limitrange.cpu.default,gauge,,,,Default CPU limit if not specified,0,kubernetes,k8s_state.cpu.default +kubernetes_state.limitrange.cpu.default_request,gauge,,,,Default CPU request if not specified,0,kubernetes,k8s_state.cpu.default_request +kubernetes_state.limitrange.cpu.max_limit_request_ratio,gauge,,,,Maximum CPU limit / request ratio,0,kubernetes,k8s_state.cpu.max_ratio +kubernetes_state.limitrange.memory.min,gauge,,,,Minimum memory request for this type,0,kubernetes,k8s_state.mem.min +kubernetes_state.limitrange.memory.max,gauge,,,,Maximum memory limit for this type,0,kubernetes,k8s_state.mem.max +kubernetes_state.limitrange.memory.default,gauge,,,,Default memory limit if not specified,0,kubernetes,k8s_state.mem.default +kubernetes_state.limitrange.memory.default_request,gauge,,,,Default memory request if not specified,0,kubernetes,k8s_state.mem.default_request +kubernetes_state.limitrange.memory.max_limit_request_ratio,gauge,,,,Maximum memory limit / request ratio,0,kubernetes,k8s_state.mem.max_ratio +kubernetes_state.node.cpu_capacity,gauge,,cpu,,The total CPU resources of the node,0,kubernetes,k8s_state.node.cpu_capacity +kubernetes_state.node.memory_capacity,gauge,,byte,,The total memory resources of the node,0,kubernetes,k8s_state.node.memory_capacity 
+kubernetes_state.node.pods_capacity,gauge,,,,The total pod resources of the node,0,kubernetes,k8s_state.node.pods_capacity +kubernetes_state.node.cpu_allocatable,gauge,,cpu,,The CPU resources of a node that are available for scheduling,0,kubernetes,k8s_state.node.cpu_allocatable +kubernetes_state.node.memory_allocatable,gauge,,byte,,The memory resources of a node that are available for scheduling,0,kubernetes,k8s_state.node.memory_allocatable +kubernetes_state.node.pods_allocatable,gauge,,,,The pod resources of a node that are available for scheduling,0,kubernetes,k8s_state.node.pods_allocatable +kubernetes_state.node.status,gauge,,,,Submitted with a value of 1 for each node and tagged either 'status:schedulable' or 'status:unschedulable'; Sum this metric by either status to get the number of nodes in that status.,0,kubernetes,k8s_state.node.status +kubernetes_state.pod.ready,gauge,,,,Whether the pod is ready to serve requests,1,kubernetes,k8s_state.pod.ready +kubernetes_state.pod.scheduled,gauge,,,,Reports the status of the scheduling process for the pod with its tags,0,kubernetes,k8s_state.pod.scheduled kubernetes_state.replicaset.replicas,gauge,,,,The number of replicas per ReplicaSet,0,kubernetes,k8s_state.rs.replicas kubernetes_state.replicaset.fully_labeled_replicas,gauge,,,,The number of fully labeled replicas per ReplicaSet,0,kubernetes,k8s_state.rs.fully_labeled kubernetes_state.replicaset.replicas_ready,gauge,,,,The number of ready replicas per ReplicaSet,0,kubernetes,k8s_state.rs.replicas_rdy @@ -57,15 +69,5 @@ kubernetes_state.resourcequota.requests.memory.limit,gauge,,byte,,Hard limit on kubernetes_state.resourcequota.requests.storage.limit,gauge,,byte,,Hard limit on the total of storage bytes requested for a resource quota,0,kubernetes,k8s_state.resourcequota.requests.storage.limit kubernetes_state.resourcequota.limits.cpu.limit,gauge,,cpu,,Hard limit on the sum of CPU core limits for a resource 
quota,0,kubernetes,k8s_state.resourcequota.limits.cpu.limit kubernetes_state.resourcequota.limits.memory.limit,gauge,,byte,,Hard limit on the sum of memory bytes limits for a resource quota,0,kubernetes,k8s_state.resourcequota.limits.mem.limit -kubernetes_state.limitrange.cpu.min,gauge,,,,Minimum CPU request for this type,0,kubernetes,k8s_state.cpu.min -kubernetes_state.limitrange.cpu.max,gauge,,,,Maximum CPU limit for this type,0,kubernetes,k8s_state.cpu.max -kubernetes_state.limitrange.cpu.default,gauge,,,,Default CPU limit if not specified,0,kubernetes,k8s_state.cpu.default -kubernetes_state.limitrange.cpu.default_request,gauge,,,,Default CPU request if not specified,0,kubernetes,k8s_state.cpu.default_request -kubernetes_state.limitrange.cpu.max_limit_request_ratio,gauge,,,,Maximum CPU limit / request ratio,0,kubernetes,k8s_state.cpu.max_ratio -kubernetes_state.limitrange.memory.min,gauge,,,,Minimum memory request for this type,0,kubernetes,k8s_state.mem.min -kubernetes_state.limitrange.memory.max,gauge,,,,Maximum memory limit for this type,0,kubernetes,k8s_state.mem.max -kubernetes_state.limitrange.memory.default,gauge,,,,Default memory limit if not specified,0,kubernetes,k8s_state.mem.default -kubernetes_state.limitrange.memory.default_request,gauge,,,,Default memory request if not specified,0,kubernetes,k8s_state.mem.default_request -kubernetes_state.limitrange.memory.max_limit_request_ratio,gauge,,,,Maximum memory limit / request ratio,0,kubernetes,k8s_state.mem.max_ratio kubernetes_state.statefulset.replicas,gauge,,,,The number of replicas per statefulset,0,kubernetes,k8s_state.statefulset.replicas kubernetes_state.statefulset.replicas_desired,gauge,,,,The number of desired replicas per statefulset,0,kubernetes,k8s_state.statefulset.replicas_desired From f0db0780a682745e6c27ffe5180c1811ee401a11 Mon Sep 17 00:00:00 2001 From: Chris Stein Date: Thu, 17 Aug 2017 09:25:09 -0500 Subject: [PATCH 2/4] Add PR number to changelog --- kubernetes_state/CHANGELOG.md 
| 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kubernetes_state/CHANGELOG.md b/kubernetes_state/CHANGELOG.md index f9885088646e4..befc9bccb91f9 100644 --- a/kubernetes_state/CHANGELOG.md +++ b/kubernetes_state/CHANGELOG.md @@ -7,7 +7,7 @@ * [FEATURE] Support for StatefulSet metrics. See [#561][] * [FEATURE] Support tag renaming via the labels_mapper option. See [#651][] -* [FEATURE] Add basic Job metrics. See [#6..][] +* [FEATURE] Add basic Job metrics. See [#686][] 1.2.0 / 2017-07-18 ================== From cad480f197267556887fdbcbb82c1bbe2294ae28 Mon Sep 17 00:00:00 2001 From: Chris Stein Date: Thu, 17 Aug 2017 13:43:09 -0500 Subject: [PATCH 3/4] Fixed conditional logic --- kubernetes_state/check.py | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/kubernetes_state/check.py b/kubernetes_state/check.py index 3882c11be89fc..b07cb375d7bd6 100644 --- a/kubernetes_state/check.py +++ b/kubernetes_state/check.py @@ -223,8 +223,11 @@ def kube_job_complete(self, message, **kwargs): for metric in message.metric: tags = [] for label in metric.label: - trimmed_job = self._trim_job_tag(label.value) - tags.append(self._format_tag(label.name, trimmed_job)) + if label.name == 'job': + trimmed_job = self._trim_job_tag(label.value) + tags.append(self._format_tag(label.name, trimmed_job)) + else: + tags.append(self._format_tag(label.name, label.value)) self.service_check(service_check_name, self.OK, tags=tags) def kube_job_failed(self, message, **kwargs): @@ -232,8 +235,11 @@ def kube_job_failed(self, message, **kwargs): for metric in message.metric: tags = [] for label in metric.label: - trimmed_job = self._trim_job_tag(label.value) - tags.append(self._format_tag(label.name, trimmed_job)) + if label.name == 'job': + trimmed_job = self._trim_job_tag(label.value) + tags.append(self._format_tag(label.name, trimmed_job)) + else: + tags.append(self._format_tag(label.name, label.value)) 
self.service_check(service_check_name, self.CRITICAL, tags=tags) def kube_job_status_failed(self, message, **kwargs): @@ -241,8 +247,11 @@ def kube_job_status_failed(self, message, **kwargs): for metric in message.metric: tags = [] for label in metric.label: - trimmed_job = self._trim_job_tag(label.value) - tags.append(self._format_tag(label.name, trimmed_job)) + if label.name == 'job': + trimmed_job = self._trim_job_tag(label.value) + tags.append(self._format_tag(label.name, trimmed_job)) + else: + tags.append(self._format_tag(label.name, label.value)) self.increment(metric_name, metric.gauge.value, tags=tags) def kube_job_status_succeeded(self, message, **kwargs): @@ -250,8 +259,11 @@ def kube_job_status_succeeded(self, message, **kwargs): for metric in message.metric: tags = [] for label in metric.label: - trimmed_job = self._trim_job_tag(label.value) - tags.append(self._format_tag(label.name, trimmed_job)) + if label.name == 'job': + trimmed_job = self._trim_job_tag(label.value) + tags.append(self._format_tag(label.name, trimmed_job)) + else: + tags.append(self._format_tag(label.name, label.value)) self.increment(metric_name, metric.gauge.value, tags=tags) def kube_node_status_ready(self, message, **kwargs): From 3d5f757a37887c3f20d40e87c823dce9994353e7 Mon Sep 17 00:00:00 2001 From: Xavier Vello Date: Mon, 21 Aug 2017 16:15:25 +0200 Subject: [PATCH 4/4] use monotonic_count for job stats, to allow for finer reporting --- kubernetes_state/check.py | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/kubernetes_state/check.py b/kubernetes_state/check.py index b07cb375d7bd6..8b70f53bab926 100644 --- a/kubernetes_state/check.py +++ b/kubernetes_state/check.py @@ -2,9 +2,12 @@ # All rights reserved # Licensed under Simplified BSD License (see LICENSE) +import re +from collections import defaultdict + from checks import CheckException from checks.prometheus_check import PrometheusCheck -import re + METRIC_TYPES = ['counter', 
'gauge'] @@ -136,8 +139,18 @@ def check(self, instance): else: send_buckets = True + # Job counters are monotonic: they increase at every run of the job + # We want to send the delta via the `monotonic_count` method + self.job_succeeded_count = defaultdict(int) + self.job_failed_count = defaultdict(int) + self.process(endpoint, send_histograms_buckets=send_buckets, instance=instance) + for job_tags, job_count in self.job_succeeded_count.iteritems(): + self.monotonic_count(self.NAMESPACE + '.job.succeeded', job_count, list(job_tags)) + for job_tags, job_count in self.job_failed_count.iteritems(): + self.monotonic_count(self.NAMESPACE + '.job.failed', job_count, list(job_tags)) + def _condition_to_service_check(self, metric, sc_name, mapping, tags=None): """ Some metrics contains conditions, labels that have "condition" as name and "true", "false", or "unknown" @@ -252,7 +265,8 @@ def kube_job_status_failed(self, message, **kwargs): tags.append(self._format_tag(label.name, trimmed_job)) else: tags.append(self._format_tag(label.name, label.value)) - self.increment(metric_name, metric.gauge.value, tags=tags) + self.job_failed_count[frozenset(tags)] += metric.gauge.value + def kube_job_status_succeeded(self, message, **kwargs): metric_name = self.NAMESPACE + '.job.succeeded' @@ -264,7 +278,7 @@ def kube_job_status_succeeded(self, message, **kwargs): tags.append(self._format_tag(label.name, trimmed_job)) else: tags.append(self._format_tag(label.name, label.value)) - self.increment(metric_name, metric.gauge.value, tags=tags) + self.job_succeeded_count[frozenset(tags)] += metric.gauge.value def kube_node_status_ready(self, message, **kwargs): """ The ready status of a cluster node. """