class BackendStatus(object):
    """Backend state names and their translation to AgentCheck statuses."""

    HEALTHY = 'healthy'
    SICK = 'sick'
    ALL = (HEALTHY, SICK)

    @classmethod
    def to_check_status(cls, status):
        """Return the AgentCheck status matching a backend state string.

        `HEALTHY` maps to `AgentCheck.OK`, `SICK` maps to
        `AgentCheck.CRITICAL`, and any other value maps to
        `AgentCheck.UNKNOWN`.
        """
        known_states = (
            (cls.HEALTHY, AgentCheck.OK),
            (cls.SICK, AgentCheck.CRITICAL),
        )
        for state, check_status in known_states:
            if status == state:
                return check_status
        return AgentCheck.UNKNOWN
+ arg = '-x' if use_xml else '-1' + cmd = [varnishstat_path, arg] + + if name is not None: + cmd.extend(['-n', name]) + tags += [u'varnish_name:%s' % name] + else: + tags += [u'varnish_name:default'] + proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + output, error = proc.communicate() + if error and len(error) > 0: + self.log.error(error) + self._parse_varnishstat_metrics(varnishstat_path, use_xml, tags) + + # Parse service checks from varnishadm. + varnishadm_path = instance.get('varnishadm') + secretfile_path = instance.get('secretfile', '/etc/varnish/secret') + cmd = [varnishadm_path, '-S', secretfile_path, 'debug.health'] + proc = subprocess.Popen(cmd, stdout=subprocess.PIPE) + output, _ = proc.communicate() + if varnishadm_path: + self._parse_varnishadm(varnishadm_path) + + def _get_version_info(self, varnishstat_path): # Get the varnish version from varnishstat - output, error = subprocess.Popen([instance.get("varnishstat"), "-V"], + output, error = subprocess.Popen([varnishstat_path, "-V"], stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate() # Assumptions regarding varnish's version use_xml = True - arg = "-x" # varnishstat argument version = 3 m1 = re.search(r"varnish-(\d+)", output, re.MULTILINE) @@ -118,26 +131,44 @@ def check(self, instance): # Location of varnishstat if version <= 2: use_xml = False - arg = "-1" - cmd = [instance.get("varnishstat"), arg] - if name is not None: - cmd.extend(['-n', name]) - tags += [u'varnish_name:%s' % name] - else: - tags += [u'varnish_name:default'] - try: - proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, - stderr=subprocess.PIPE) - output, error = proc.communicate() - except Exception: - self.log.error(u"Failed to run %s" % repr(cmd)) - raise - if error and len(error) > 0: - self.log.error(error) - self._parse_varnishstat(output, use_xml, tags) + return version, use_xml def _parse_varnishstat(self, output, use_xml, tags=None): + """Extract stats from varnishstat -x + + 
The text option (-1) is not reliable enough when counters get large. + VBE.media_video_prd_services_01(10.93.67.16,,8080).happy18446744073709551615 + + 2 types of data, "a" for counter ("c" in newer versions of varnish), "i" for gauge ("g") + https://github.com/varnish/Varnish-Cache/blob/master/include/tbl/vsc_fields.h + + Bitmaps are not supported. + + Example XML output (with `use_xml=True`) + + + fetch_304 + 0 + a + Fetch no body (304) + + + n_sess_mem + 334 + i + N struct sess_mem + + + LCK + vcl + creat + 1 + a + Created locks + + + """ tags = tags or [] if use_xml: p = xml.parsers.expat.ParserCreate() @@ -165,4 +196,50 @@ def _parse_varnishstat(self, output, use_xml, tags=None): self.log.debug("Varnish (rate) %s %d" % (metric_name, int(gauge_val))) self.rate(metric_name, float(gauge_val), tags=tags) - \ No newline at end of file + def _parse_varnishadm(self, output): + """ Parse out service checks from varnishadm. + + Example output: + + Backend b0 is Sick + Current states good: 2 threshold: 3 window: 5 + Average responsetime of good probes: 0.000000 + Oldest Newest + ================================================================ + -------------------------------------------------------------444 Good IPv4 + -------------------------------------------------------------XXX Good Xmit + -------------------------------------------------------------RRR Good Recv + ----------------------------------------------------------HHH--- Happy + Backend b1 is Sick + Current states good: 2 threshold: 3 window: 5 + Average responsetime of good probes: 0.000000 + Oldest Newest + ================================================================ + ----------------------------------------------------------HHH--- Happy + + """ + # Process status by backend. 
+ backends_by_status = defaultdict(list) + backend, status, message = None, None, None + for line in output.split("\n"): + tokens = line.strip().split(' ') + if len(tokens) > 0: + if tokens[0] == 'Backend': + backend = tokens[1] + status = tokens[1].lower() + elif tokens[0] == 'Current' and backend is not None: + try: + message = ' '.join(tokens[2:]).strip() + except Exception: + # If we can't parse a message still send a status. + self.log.exception('Error when parsing message from varnishadm') + message = '' + backends_by_status[status].append((backend, message)) + + for status, backends in backends_by_status.iteritems(): + check_status = BackendStatus.to_check_status(status) + for backend, message in backends: + tags = ['backend:%s' % backend] + self.service_check(self.SERVICE_CHECK_NAME, check_status, + tags=tags, message=message) + diff --git a/conf.d/varnish.yaml.example b/conf.d/varnish.yaml.example index f1af203b70..ee08a50baf 100644 --- a/conf.d/varnish.yaml.example +++ b/conf.d/varnish.yaml.example @@ -1,13 +1,23 @@ init_config: instances: -# - varnishstat: (required) String path to varnishstat binary -# name: (optional) String name of varnish instance. Passed to the -n parameter of varnishstat. Will also tag each metric with this name. -# tags: (optional) Additional tags to tag each metric with -# -# Example: -# - - varnishstat: /usr/bin/varnishstat - name: myvarnishinstance - tags: - - instance:production + # The full path to the varnishstat binary +# - varnishstat: /usr/bin/varnishstat + + # The (optional) name will be used in the varnishstat command for the + # -n argument and will add a name:$instancename tag to all metrics. +# name: myvarnishinstance + + # The (optional) list of tags will be applied to every emitted metric. +# tags: +# - instance:production + + # The (optional) path to the varnishadm binary will signal the check to + # emit a service check status on backend health using `debug.health`. 
+ # The service check will be tagged by backend. NOTE: The Agent must + # be able to access the Varnish secretfile for this to work. +# varnishadm: /usr/bin/varnishadm + + # The (optional) path to the varnish secretfile will be used in the + # varnishadm command, if enabled. +# secretfile: /etc/varnish/secret diff --git a/tests/test_varnish.py b/tests/test_varnish.py index 29d5261f5b..5074b2fef7 100644 --- a/tests/test_varnish.py +++ b/tests/test_varnish.py @@ -1,8 +1,9 @@ -import logging import os import time import unittest +from nose.plugins.attrib import attr + from tests.common import get_check @@ -1853,7 +1854,7 @@ def setUp(self): """ - def testParsing(self): + def test_parsing(self): v, instances = get_check('varnish', self.config) v._parse_varnishstat(self.v_dump, False) metrics = v.get_metrics() @@ -1868,7 +1869,7 @@ def testParsing(self): if m[0] == "varnish.SMA.s0.g_space"][0], 120606) assert "varnish.SMA.transient.c_bytes" not in [m[0] for m in metrics] - def testCheck(self): + def test_check(self): v, instances = get_check('varnish', self.config) import pprint try: @@ -1879,5 +1880,36 @@ def testCheck(self): except Exception: pass + def test_service_check(self): + varnishadm_dump = """ +Backend b0 is Sick +Current states good: 0 threshold: 3 window: 5 +Average responsetime of good probes: 0.000000 +Oldest Newest +================================================================ +4444444444444444444444444444444444444444444444444444444444444444 Good IPv4 +XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX Good Xmit +RRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRR Good Recv +---------------------------------------------------------------- Happy +Backend b1 is Sick +Current states good: 0 threshold: 3 window: 5 +Average responsetime of good probes: 0.000000 +Oldest Newest +================================================================ +---------------------------------------------------------------- Happy + """ + v, 
instances = get_check('varnish', self.config) + v._parse_varnishadm(varnishadm_dump) + service_checks = v.get_service_checks() + self.assertEquals(len(service_checks), 2) + + b0_check = service_checks[0] + self.assertEquals(b0_check['check'], v.SERVICE_CHECK_NAME) + self.assertEquals(b0_check['tags'], ['backend:b0']) + + b1_check = service_checks[1] + self.assertEquals(b1_check['check'], v.SERVICE_CHECK_NAME) + self.assertEquals(b1_check['tags'], ['backend:b1']) + if __name__ == '__main__': unittest.main()