From e36321f9082abcfccff37d98a7b2dc33f00f4ced Mon Sep 17 00:00:00 2001 From: Mike Tougeron Date: Fri, 30 Oct 2015 11:30:48 -0700 Subject: [PATCH 1/6] The consul.catalog.nodes_up should reflect the # of nodes actually up so it is now checking the serfHealth for each service node. Also track the actual status of the nodes in the service. --- checks.d/consul.py | 51 +++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 46 insertions(+), 5 deletions(-) diff --git a/checks.d/consul.py b/checks.d/consul.py index c5a9747e03..0a2af06228 100644 --- a/checks.d/consul.py +++ b/checks.d/consul.py @@ -142,7 +142,7 @@ def get_services_in_cluster(self, instance): return self.consul_request(instance, '/v1/catalog/services') def get_nodes_with_service(self, instance, service): - consul_request_url = '/v1/catalog/service/{0}'.format(service) + consul_request_url = '/v1/health/service/{0}'.format(service) return self.consul_request(instance, consul_request_url) @@ -218,15 +218,56 @@ def check(self, instance): services = self._cull_services_list(services, service_whitelist) for service in services: - nodes_with_service = self.get_nodes_with_service(instance, service) node_tags = ['consul_service_id:{0}'.format(service)] + nodes_with_service = self.get_nodes_with_service(instance, service) + node_status = {'passing': 0, 'warning': 0, 'critical': 0} + nodes_up = [] + for node in nodes_with_service: + # If there is no Check for the node then Consul considers it up + if 'Checks' not in node: + nodes_up.append(node) + node_status['passing'] += 1 + else: + # For backwards compatibility, the consul.catalog.nodes_up count needs to be + # based on the total # of nodes 'running' at part of the service. + # If the serfHealth is critical it means the agent isn't even responding + for check in node['Checks']: + if check['CheckID'] == 'serfHealth' and check['Status'] != 'critical': + nodes_up.append(node) + break + + # Now loop the Checks again to get the count of the actual health of each + # node in the service. + found_critical = False + found_warning = False + for check in node['Checks']: + if check['Status'] == 'critical': + found_critical = True + break + elif check['Status'] == 'warning': + found_warning = True + # Keep looping in case there is a critical status + + # Now increment the counter based on what was found in Checks + if found_critical: + node_status['critical'] += 1 + elif found_warning: + node_status['warning'] += 1 + else: + node_status['passing'] += 1 + self.gauge('{0}.nodes_up'.format(self.CONSUL_CATALOG_CHECK), - len(nodes_with_service), + len(nodes_up), tags=main_tags+node_tags) - for n in nodes_with_service: - node_id = n.get('Node') or None + for status_key in ['passing', 'warning', 'critical']: + self.gauge('{0}.nodes_{1}'.format(self.CONSUL_CATALOG_CHECK, status_key), + node_status[status_key], + tags=main_tags+node_tags) + + for n in nodes_up: + node_id = n.get('Node', {}).get('Node') or None if not node_id: continue From 60f0d4d8cec06cfbdb633f33069732b25697083f Mon Sep 17 00:00:00 2001 From: Mike Tougeron Date: Fri, 30 Oct 2015 14:31:43 -0700 Subject: [PATCH 2/6] Python 2.6 doesn't like the way I did the n.get --- checks.d/consul.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/checks.d/consul.py b/checks.d/consul.py index 0a2af06228..e6d5144dfa 100644 --- a/checks.d/consul.py +++ b/checks.d/consul.py @@ -267,7 +267,13 @@ def check(self, instance): tags=main_tags+node_tags) for n in nodes_up: - node_id = n.get('Node', {}).get('Node') or None + node = n.get('Node') or None + + if not node: + continue + + # The node_id is n['Node']['Node'] + node_id = node.get('Node') or None if not node_id: continue From 25037d8c16a16e4e5fb81a0c6fb359dd56126a99 Mon Sep 17 00:00:00 2001 From: Mike Tougeron Date: Mon, 2 Nov 2015 08:17:20 -0800 Subject: [PATCH 3/6] Update the mock to reflect the new API call --- tests/checks/mock/test_consul.py | 34 +++++++++++++++++++++++--------- 1 file changed, 25 insertions(+), 9 deletions(-) diff --git a/tests/checks/mock/test_consul.py b/tests/checks/mock/test_consul.py index 639e9c33f1..d5923e66f7 100644 --- a/tests/checks/mock/test_consul.py +++ b/tests/checks/mock/test_consul.py @@ -121,15 +121,31 @@ def _get_random_ip(): return [ { - "Address": _get_random_ip(), - "Node": "node-1", - "ServiceAddress": "", - "ServiceID": service, - "ServiceName": service, - "ServicePort": 80, - "ServiceTags": [ - "az-us-east-1a" - ] + "Checks": [ + { + "CheckID": "serfHealth", + "Name": "Serf Health Status", + "Node": "node-1", + "Notes": "", + "Output": "Agent alive and reachable", + "ServiceID": "", + "ServiceName": "", + "Status": "passing" + } + ], + "Node": { + "Address": _get_random_ip(), + "Node": "node-1" + }, + "Service": { + "Address": "", + "ID": service, + "Port": 80, + "Service": service, + "Tags": [ + "az-us-east-1a" + ] + } } ] From ad300271b1048900dc79b7754009bb9b52f6caac Mon Sep 17 00:00:00 2001 From: talwai Date: Sun, 6 Dec 2015 22:59:35 -0500 Subject: [PATCH 4/6] [consul] Separate services_* metrics --- checks.d/consul.py | 122 ++++++++++++++++++++++++++++----------------- 1 file changed, 76 insertions(+), 46 deletions(-) diff --git a/checks.d/consul.py b/checks.d/consul.py index e6d5144dfa..17e3483069 100644 --- a/checks.d/consul.py +++ b/checks.d/consul.py @@ -187,6 +187,7 @@ def check(self, instance): self.init_config.get('catalog_checks')) try: + # Make service checks from health checks for all services in catalog health_state = self.consul_request(instance, '/v1/health/state/any') for check in health_state: @@ -198,7 +199,7 @@ def check(self, instance): if check["ServiceName"]: tags.append("service:{0}".format(check["ServiceName"])) if check["ServiceID"]: - tags.append("service-id:{0}".format(check["ServiceID"])) + tags.append("consul_service_id:{0}".format(check["ServiceID"])) self.service_check(self.HEALTH_CHECK, status, tags=main_tags+tags) @@ -210,38 +211,61 @@ def check(self, instance): tags=service_check_tags) if perform_catalog_checks: - services = self.get_services_in_cluster(instance) - nodes_to_services = defaultdict(list) + # Collect node by service, and service by node counts for a whitelist of services + services = self.get_services_in_cluster(instance) service_whitelist = instance.get('service_whitelist', self.init_config.get('service_whitelist', [])) services = self._cull_services_list(services, service_whitelist) + + # {node_id: {"up: 0, "passing": 0, "warning": 0, "critical": 0} + nodes_to_service_status = defaultdict(lambda: defaultdict(int)) + for service in services: - node_tags = ['consul_service_id:{0}'.format(service)] + # For every service in the cluster, + # Gauge the following: + # `consul.catalog.nodes_up` : # of Nodes registered with that service + # `consul.catalog.nodes_passing` : # of Nodes with service status `passing` from those registered + # `consul.catalog.nodes_warning` : # of Nodes with service status `warning` from those registered + # `consul.catalog.nodes_critical` : # of Nodes with service status `critical` from those registered + + service_tags = ['consul_service_id:{0}'.format(service)] nodes_with_service = self.get_nodes_with_service(instance, service) - node_status = {'passing': 0, 'warning': 0, 'critical': 0} - nodes_up = [] + + # {'up': 0, 'passing': 0, 'warning': 0, 'critical': 0} + node_status = defaultdict(int) + for node in nodes_with_service: - # If there is no Check for the node then Consul considers it up + # The node_id is n['Node']['Node'] + node_id = node.get('Node', {}).get("Node") + + # An additional service is registered on this node. Bump up the counter + nodes_to_service_status[node_id]["up"] += 1 + + # If there is no Check for the node then Consul and dd-agent consider it up if 'Checks' not in node: - nodes_up.append(node) node_status['passing'] += 1 + node_status['up'] += 1 else: - # For backwards compatibility, the consul.catalog.nodes_up count needs to be - # based on the total # of nodes 'running' at part of the service. - # If the serfHealth is critical it means the agent isn't even responding - for check in node['Checks']: - if check['CheckID'] == 'serfHealth' and check['Status'] != 'critical': - nodes_up.append(node) - break - - # Now loop the Checks again to get the count of the actual health of each - # node in the service. found_critical = False found_warning = False + found_serf_health = False + for check in node['Checks']: + if check['CheckID'] == 'serfHealth': + found_serf_health = True + + # For backwards compatibility, the "up" node_status is computed + # based on the total # of nodes 'running' as part of the service. + + # If the serfHealth is `critical` it means the Consul agent isn't even responding, + # and we don't register the node as `up` + if check['Status'] != 'critical': + node_status["up"] += 1 + continue + if check['Status'] == 'critical': found_critical = True break @@ -249,39 +273,45 @@ def check(self, instance): found_warning = True # Keep looping in case there is a critical status - # Now increment the counter based on what was found in Checks + # Increment the counters based on what was found in Checks + # `critical` checks override `warning`s, and if neither are found, register the node as `passing` if found_critical: node_status['critical'] += 1 + nodes_to_service_status[node_id]["critical"] += 1 elif found_warning: node_status['warning'] += 1 + nodes_to_service_status[node_id]["warning"] += 1 else: - node_status['passing'] += 1 - - self.gauge('{0}.nodes_up'.format(self.CONSUL_CATALOG_CHECK), - len(nodes_up), - tags=main_tags+node_tags) - - for status_key in ['passing', 'warning', 'critical']: - self.gauge('{0}.nodes_{1}'.format(self.CONSUL_CATALOG_CHECK, status_key), - node_status[status_key], - tags=main_tags+node_tags) - - for n in nodes_up: - node = n.get('Node') or None - - if not node: - continue + if not found_serf_health: + # We have not found a serfHealth check for this node, which is unexpected + # If we get here assume this node's status is "up", since we register it as 'passing' + node_status['up'] += 1 - # The node_id is n['Node']['Node'] - node_id = node.get('Node') or None - - if not node_id: - continue - - nodes_to_services[node_id].append(service) - - for node, services in nodes_to_services.iteritems(): - tags = ['consul_node_id:{0}'.format(node)] + node_status['passing'] += 1 + nodes_to_service_status[node_id]["passing"] += 1 + + for status_key, status_value in node_status.items(): + self.gauge( + '{0}.nodes_{1}'.format(self.CONSUL_CATALOG_CHECK, status_key), + status_value, + tags=main_tags+service_tags + ) + + for node, service_status in nodes_to_service_status.iteritems(): + # For every node discovered for whitelisted services, gauge the following: + # `consul.catalog.services_up` : Total services registered on node + # `consul.catalog.services_passing` : Total passing services on node + # `consul.catalog.services_warning` : Total warning services on node + # `consul.catalog.services_critical` : Total critical services on node + + node_tags = ['consul_node_id:{0}'.format(node)] self.gauge('{0}.services_up'.format(self.CONSUL_CATALOG_CHECK), len(services), - tags=main_tags+tags) + tags=main_tags+node_tags) + + for status_key, status_value in service_status.items(): + self.gauge( + '{0}.services_{1}'.format(self.CONSUL_CATALOG_CHECK, status_key), + status_value, + tags=main_tags+node_tags + ) From b71d7a376145e8ab34bccd85929b64dac88b8fab Mon Sep 17 00:00:00 2001 From: talwai Date: Sun, 6 Dec 2015 23:00:35 -0500 Subject: [PATCH 5/6] [consul] comment out non-required config --- conf.d/consul.yaml.example | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/conf.d/consul.yaml.example b/conf.d/consul.yaml.example index 8491c83183..5b6cb87029 100644 --- a/conf.d/consul.yaml.example +++ b/conf.d/consul.yaml.example @@ -32,7 +32,7 @@ instances: # The default settings query up to 50 services. So if you have more than # this many in your Consul service catalog, you will want to fill in the # whitelist - service_whitelist: - - zookeeper - - gunicorn - - redis + # service_whitelist: + # - zookeeper + # - gunicorn + # - redis From 687ad69ea6416a43b1cdaeeedc8db5b79a72d2ed Mon Sep 17 00:00:00 2001 From: talwai Date: Mon, 7 Dec 2015 12:13:40 -0500 Subject: [PATCH 6/6] [consul] update tests for service status metrics --- checks.d/consul.py | 7 +- tests/checks/mock/test_consul.py | 152 +++++++++++++++++++++++++++++-- 2 files changed, 149 insertions(+), 10 deletions(-) diff --git a/checks.d/consul.py b/checks.d/consul.py index 17e3483069..7e444255e2 100644 --- a/checks.d/consul.py +++ b/checks.d/consul.py @@ -23,6 +23,7 @@ class ConsulCheck(AgentCheck): MAX_SERVICES = 50 # cap on distinct Consul ServiceIDs to interrogate STATUS_SC = { + 'up': AgentCheck.OK, 'passing': AgentCheck.OK, 'warning': AgentCheck.WARNING, 'critical': AgentCheck.CRITICAL, @@ -290,7 +291,8 @@ def check(self, instance): node_status['passing'] += 1 nodes_to_service_status[node_id]["passing"] += 1 - for status_key, status_value in node_status.items(): + for status_key in self.STATUS_SC: + status_value = node_status[status_key] self.gauge( '{0}.nodes_{1}'.format(self.CONSUL_CATALOG_CHECK, status_key), status_value, @@ -309,7 +311,8 @@ def check(self, instance): len(services), tags=main_tags+node_tags) - for status_key, status_value in service_status.items(): + for status_key in self.STATUS_SC: + status_value = service_status[status_key] self.gauge( '{0}.services_{1}'.format(self.CONSUL_CATALOG_CHECK, status_key), status_value, diff --git a/tests/checks/mock/test_consul.py b/tests/checks/mock/test_consul.py index d5923e66f7..5f5db69390 100644 --- a/tests/checks/mock/test_consul.py +++ b/tests/checks/mock/test_consul.py @@ -41,6 +41,10 @@ }] } +def _get_random_ip(): + rand_int = int(15 * random.random()) + 10 + return "10.0.2.{0}".format(rand_int) + class TestCheckConsul(AgentCheckTest): CHECK_NAME = 'consul' @@ -115,9 +119,6 @@ def mock_get_nodes_in_cluster(self, instance): def mock_get_nodes_with_service(self, instance, service): - def _get_random_ip(): - rand_int = int(15 * random.random()) + 10 - return "10.0.2.{0}".format(rand_int) return [ { @@ -131,6 +132,111 @@ def _get_random_ip(): "ServiceID": "", "ServiceName": "", "Status": "passing" + }, + { + "CheckID": "service:{0}".format(service), + "Name": "service check {0}".format(service), + "Node": "node-1", + "Notes": "", + "Output": "Service {0} alive".format(service), + "ServiceID": service, + "ServiceName": "", + "Status": "passing" + } + ], + "Node": { + "Address": _get_random_ip(), + "Node": "node-1" + }, + "Service": { + "Address": "", + "ID": service, + "Port": 80, + "Service": service, + "Tags": [ + "az-us-east-1a" + ] + } + } + ] + + def mock_get_nodes_with_service_warning(self, instance, service): + + return [ + { + "Checks": [ + { + "CheckID": "serfHealth", + "Name": "Serf Health Status", + "Node": "node-1", + "Notes": "", + "Output": "Agent alive and reachable", + "ServiceID": "", + "ServiceName": "", + "Status": "passing" + }, + { + "CheckID": "service:{0}".format(service), + "Name": "service check {0}".format(service), + "Node": "node-1", + "Notes": "", + "Output": "Service {0} alive".format(service), + "ServiceID": service, + "ServiceName": "", + "Status": "warning" + } + ], + "Node": { + "Address": _get_random_ip(), + "Node": "node-1" + }, + "Service": { + "Address": "", + "ID": service, + "Port": 80, + "Service": service, + "Tags": [ + "az-us-east-1a" + ] + } + } + ] + + + def mock_get_nodes_with_service_critical(self, instance, service): + + return [ + { + "Checks": [ + { + "CheckID": "serfHealth", + "Name": "Serf Health Status", + "Node": "node-1", + "Notes": "", + "Output": "Agent alive and reachable", + "ServiceID": "", + "ServiceName": "", + "Status": "passing" + }, + { + "CheckID": "service:{0}".format(service), + "Name": "service check {0}".format(service), + "Node": "node-1", + "Notes": "", + "Output": "Service {0} alive".format(service), + "ServiceID": service, + "ServiceName": "", + "Status": "warning" + }, + { + "CheckID": "service:{0}".format(service), + "Name": "service check {0}".format(service), + "Node": "node-1", + "Notes": "", + "Output": "Service {0} alive".format(service), + "ServiceID": service, + "ServiceName": "", + "Status": "critical" } ], "Node": { @@ -170,6 +276,41 @@ def test_bad_config(self): def test_get_nodes_with_service(self): self.run_check(MOCK_CONFIG, mocks=self._get_consul_mocks()) self.assertMetric('consul.catalog.nodes_up', value=1, tags=['consul_datacenter:dc1', 'consul_service_id:service-1']) + self.assertMetric('consul.catalog.nodes_passing', value=1, tags=['consul_datacenter:dc1', 'consul_service_id:service-1']) + self.assertMetric('consul.catalog.nodes_warning', value=0, tags=['consul_datacenter:dc1', 'consul_service_id:service-1']) + self.assertMetric('consul.catalog.nodes_critical', value=0, tags=['consul_datacenter:dc1', 'consul_service_id:service-1']) + self.assertMetric('consul.catalog.services_up', value=6, tags=['consul_datacenter:dc1', 'consul_node_id:node-1']) + self.assertMetric('consul.catalog.services_passing', value=6, tags=['consul_datacenter:dc1', 'consul_node_id:node-1']) + self.assertMetric('consul.catalog.services_warning', value=0, tags=['consul_datacenter:dc1', 'consul_node_id:node-1']) + self.assertMetric('consul.catalog.services_critical', value=0, tags=['consul_datacenter:dc1', 'consul_node_id:node-1']) + + def test_get_nodes_with_service_warning(self): + my_mocks = self._get_consul_mocks() + my_mocks['get_nodes_with_service'] = self.mock_get_nodes_with_service_warning + + self.run_check(MOCK_CONFIG, mocks=my_mocks) + self.assertMetric('consul.catalog.nodes_up', value=1, tags=['consul_datacenter:dc1', 'consul_service_id:service-1']) + self.assertMetric('consul.catalog.nodes_passing', value=0, tags=['consul_datacenter:dc1', 'consul_service_id:service-1']) + self.assertMetric('consul.catalog.nodes_warning', value=1, tags=['consul_datacenter:dc1', 'consul_service_id:service-1']) + self.assertMetric('consul.catalog.nodes_critical', value=0, tags=['consul_datacenter:dc1', 'consul_service_id:service-1']) + self.assertMetric('consul.catalog.services_up', value=6, tags=['consul_datacenter:dc1', 'consul_node_id:node-1']) + self.assertMetric('consul.catalog.services_passing', value=0, tags=['consul_datacenter:dc1', 'consul_node_id:node-1']) + self.assertMetric('consul.catalog.services_warning', value=6, tags=['consul_datacenter:dc1', 'consul_node_id:node-1']) + self.assertMetric('consul.catalog.services_critical', value=0, tags=['consul_datacenter:dc1', 'consul_node_id:node-1']) + + def test_get_nodes_with_service_critical(self): + my_mocks = self._get_consul_mocks() + my_mocks['get_nodes_with_service'] = self.mock_get_nodes_with_service_critical + + self.run_check(MOCK_CONFIG, mocks=my_mocks) + self.assertMetric('consul.catalog.nodes_up', value=1, tags=['consul_datacenter:dc1', 'consul_service_id:service-1']) + self.assertMetric('consul.catalog.nodes_passing', value=0, tags=['consul_datacenter:dc1', 'consul_service_id:service-1']) + self.assertMetric('consul.catalog.nodes_warning', value=0, tags=['consul_datacenter:dc1', 'consul_service_id:service-1']) + self.assertMetric('consul.catalog.nodes_critical', value=1, tags=['consul_datacenter:dc1', 'consul_service_id:service-1']) + self.assertMetric('consul.catalog.services_up', value=6, tags=['consul_datacenter:dc1', 'consul_node_id:node-1']) + self.assertMetric('consul.catalog.services_passing', value=0, tags=['consul_datacenter:dc1', 'consul_node_id:node-1']) + self.assertMetric('consul.catalog.services_warning', value=0, tags=['consul_datacenter:dc1', 'consul_node_id:node-1']) + self.assertMetric('consul.catalog.services_critical', value=6, tags=['consul_datacenter:dc1', 'consul_node_id:node-1']) def test_get_peers_in_cluster(self): mocks = self._get_consul_mocks() @@ -184,11 +325,6 @@ def test_get_peers_in_cluster(self): self.run_check(MOCK_CONFIG, mocks=mocks) self.assertMetric('consul.peers', value=3, tags=['consul_datacenter:dc1', 'mode:follower']) - - def test_get_services_on_node(self): - self.run_check(MOCK_CONFIG, mocks=self._get_consul_mocks()) - self.assertMetric('consul.catalog.services_up', value=6, tags=['consul_datacenter:dc1', 'consul_node_id:node-1']) - def test_cull_services_list(self): self.check = load_check(self.CHECK_NAME, MOCK_CONFIG_LEADER_CHECK, self.DEFAULT_AGENT_CONFIG)