diff --git a/checks.d/varnish.py b/checks.d/varnish.py
index a9cb3fe61a..be0e3df1ed 100644
--- a/checks.d/varnish.py
+++ b/checks.d/varnish.py
@@ -1,12 +1,29 @@
# stdlib
-import xml.parsers.expat # python 2.4 compatible
+from collections import defaultdict
import re
import subprocess
+import xml.parsers.expat # python 2.4 compatible
# project
from checks import AgentCheck
+
+class BackendStatus(object):
+    """Backend health states used to derive service check statuses."""
+
+    HEALTHY = 'healthy'
+    SICK = 'sick'
+    ALL = (HEALTHY, SICK)
+
+    @classmethod
+    def to_check_status(cls, status):
+        """Translate a backend state string into an AgentCheck status.
+
+        Returns AgentCheck.OK for HEALTHY, AgentCheck.CRITICAL for SICK and
+        AgentCheck.UNKNOWN for any other value.
+        """
+        if status == cls.HEALTHY:
+            return AgentCheck.OK
+        return AgentCheck.CRITICAL if status == cls.SICK else AgentCheck.UNKNOWN
+
class Varnish(AgentCheck):
+ SERVICE_CHECK_NAME = 'varnish.backend_healthy'
+
# XML parsing bits, a.k.a. Kafka in Code
def _reset(self):
self._current_element = ""
@@ -47,39 +64,6 @@ def _char_data(self, data):
self._current_str = data
def check(self, instance):
- """Extract stats from varnishstat -x
-
- The text option (-1) is not reliable enough when counters get large.
- VBE.media_video_prd_services_01(10.93.67.16,,8080).happy18446744073709551615
-
- 2 types of data, "a" for counter ("c" in newer versions of varnish), "i" for gauge ("g")
- https://github.com/varnish/Varnish-Cache/blob/master/include/tbl/vsc_fields.h
-
- Bitmaps are not supported.
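-
-        <varnishstat>
-            <stat>
-                <name>fetch_304</name>
-                <value>0</value>
-                <flag>a</flag>
-                <description>Fetch no body (304)</description>
-            </stat>
-            <stat>
-                <name>n_sess_mem</name>
-                <value>334</value>
-                <flag>i</flag>
-                <description>N struct sess_mem</description>
-            </stat>
-            <stat>
-                <type>LCK</type>
-                <ident>vcl</ident>
-                <name>creat</name>
-                <value>1</value>
-                <flag>a</flag>
-                <description>Created locks</description>
-            </stat>
-        </varnishstat>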
- """
# Not configured? Not a problem.
if instance.get("varnishstat", None) is None:
raise Exception("varnishstat is not configured")
@@ -88,16 +72,45 @@ def check(self, instance):
tags = []
else:
tags = list(set(tags))
+ varnishstat_path = instance.get("varnishstat")
name = instance.get('name')
+        # Get version-specific args from varnishstat -V. Only the output
+        # format matters here, so the version number itself is discarded.
+        _, use_xml = self._get_version_info(varnishstat_path)
+
+        # Parse metrics from varnishstat: XML output (-x) when supported,
+        # otherwise the plain-text listing (-1).
+        arg = '-x' if use_xml else '-1'
+        cmd = [varnishstat_path, arg]
+
+        if name is not None:
+            cmd.extend(['-n', name])
+            tags += [u'varnish_name:%s' % name]
+        else:
+            tags += [u'varnish_name:default']
+        try:
+            proc = subprocess.Popen(cmd, stdout=subprocess.PIPE,
+                                    stderr=subprocess.PIPE)
+            output, error = proc.communicate()
+        except Exception:
+            # Log which command failed before propagating the error.
+            self.log.error(u"Failed to run %s" % repr(cmd))
+            raise
+        if error:
+            self.log.error(error)
+        # Feed the captured stdout (not the binary path) to the parser.
+        self._parse_varnishstat(output, use_xml, tags)
+
+        # Parse service checks from varnishadm. This is optional: only run
+        # the command when a varnishadm path is configured, otherwise Popen
+        # would be handed a None argument.
+        varnishadm_path = instance.get('varnishadm')
+        if varnishadm_path:
+            secretfile_path = instance.get('secretfile', '/etc/varnish/secret')
+            cmd = [varnishadm_path, '-S', secretfile_path, 'debug.health']
+            proc = subprocess.Popen(cmd, stdout=subprocess.PIPE)
+            output, _ = proc.communicate()
+            if output:
+                self._parse_varnishadm(output)
+
+    def _get_version_info(self, varnishstat_path):
+        """Detect the Varnish major version by running `varnishstat -V`.
+
+        Returns a `(version, use_xml)` tuple. `version` starts from an
+        assumed default of 3 and `use_xml` from True; `use_xml` is switched
+        off when the detected version is <= 2.
+        """
# Get the varnish version from varnishstat
- output, error = subprocess.Popen([instance.get("varnishstat"), "-V"],
+ output, error = subprocess.Popen([varnishstat_path, "-V"],
stdout=subprocess.PIPE,
stderr=subprocess.PIPE).communicate()
# Assumptions regarding varnish's version
use_xml = True
- arg = "-x" # varnishstat argument
version = 3
m1 = re.search(r"varnish-(\d+)", output, re.MULTILINE)
@@ -118,26 +131,44 @@ def check(self, instance):
# Location of varnishstat
if version <= 2:
use_xml = False
- arg = "-1"
- cmd = [instance.get("varnishstat"), arg]
- if name is not None:
- cmd.extend(['-n', name])
- tags += [u'varnish_name:%s' % name]
- else:
- tags += [u'varnish_name:default']
- try:
- proc = subprocess.Popen(cmd, stdout=subprocess.PIPE,
- stderr=subprocess.PIPE)
- output, error = proc.communicate()
- except Exception:
- self.log.error(u"Failed to run %s" % repr(cmd))
- raise
- if error and len(error) > 0:
- self.log.error(error)
- self._parse_varnishstat(output, use_xml, tags)
+ return version, use_xml
def _parse_varnishstat(self, output, use_xml, tags=None):
+        """Extract stats from varnishstat -x
+
+        The text option (-1) is not reliable enough when counters get large.
+        VBE.media_video_prd_services_01(10.93.67.16,,8080).happy18446744073709551615
+
+        2 types of data, "a" for counter ("c" in newer versions of varnish), "i" for gauge ("g")
+        https://github.com/varnish/Varnish-Cache/blob/master/include/tbl/vsc_fields.h
+
+        Bitmaps are not supported.
+
+        Example XML output (with `use_xml=True`)
+        <varnishstat>
+            <stat>
+                <name>fetch_304</name>
+                <value>0</value>
+                <flag>a</flag>
+                <description>Fetch no body (304)</description>
+            </stat>
+            <stat>
+                <name>n_sess_mem</name>
+                <value>334</value>
+                <flag>i</flag>
+                <description>N struct sess_mem</description>
+            </stat>
+            <stat>
+                <type>LCK</type>
+                <ident>vcl</ident>
+                <name>creat</name>
+                <value>1</value>
+                <flag>a</flag>
+                <description>Created locks</description>
+            </stat>
+        </varnishstat>
+        """
tags = tags or []
if use_xml:
p = xml.parsers.expat.ParserCreate()
@@ -165,4 +196,50 @@ def _parse_varnishstat(self, output, use_xml, tags=None):
self.log.debug("Varnish (rate) %s %d" % (metric_name, int(gauge_val)))
self.rate(metric_name, float(gauge_val), tags=tags)
-
\ No newline at end of file
+    def _parse_varnishadm(self, output):
+        """ Parse out service checks from varnishadm.
+
+        `output` is the text printed by `varnishadm debug.health`. One
+        service check is emitted per backend, tagged `backend:<name>`, with
+        the status derived from the "Backend <name> is <State>" line and the
+        message taken from the following "Current states ..." line.
+
+        Example output:
+
+            Backend b0 is Sick
+            Current states good: 2 threshold: 3 window: 5
+            Average responsetime of good probes: 0.000000
+            Oldest Newest
+            ================================================================
+            -------------------------------------------------------------444 Good IPv4
+            -------------------------------------------------------------XXX Good Xmit
+            -------------------------------------------------------------RRR Good Recv
+            ----------------------------------------------------------HHH--- Happy
+            Backend b1 is Sick
+            Current states good: 2 threshold: 3 window: 5
+            Average responsetime of good probes: 0.000000
+            Oldest Newest
+            ================================================================
+            ----------------------------------------------------------HHH--- Happy
+
+        """
+        if not output:
+            # Nothing to parse (e.g. varnishadm printed nothing).
+            return
+
+        # Process status by backend.
+        backends_by_status = defaultdict(list)
+        backend, status = None, None
+        for line in output.split("\n"):
+            tokens = line.strip().split(' ')
+            if tokens[0] == 'Backend' and len(tokens) > 1:
+                # "Backend <name> is <State>": the name is the second token
+                # and the state is the LAST token, not the second.
+                backend = tokens[1]
+                status = tokens[-1].lower()
+            elif tokens[0] == 'Current' and backend is not None:
+                # "Current states good: 2 threshold: 3 window: 5" -> keep
+                # everything after "Current states" as the message. An empty
+                # message still results in a status being sent.
+                message = ' '.join(tokens[2:]).strip()
+                backends_by_status[status].append((backend, message))
+
+        for status, backends in backends_by_status.items():
+            check_status = BackendStatus.to_check_status(status)
+            for backend, message in backends:
+                tags = ['backend:%s' % backend]
+                self.service_check(self.SERVICE_CHECK_NAME, check_status,
+                                   tags=tags, message=message)
+
diff --git a/conf.d/varnish.yaml.example b/conf.d/varnish.yaml.example
index f1af203b70..ee08a50baf 100644
--- a/conf.d/varnish.yaml.example
+++ b/conf.d/varnish.yaml.example
@@ -1,13 +1,23 @@
init_config:
instances:
-# - varnishstat: (required) String path to varnishstat binary
-# name: (optional) String name of varnish instance. Passed to the -n parameter of varnishstat. Will also tag each metric with this name.
-# tags: (optional) Additional tags to tag each metric with
-#
-# Example:
-#
- - varnishstat: /usr/bin/varnishstat
- name: myvarnishinstance
- tags:
- - instance:production
+ # The full path to the varnishstat binary
+# - varnishstat: /usr/bin/varnishstat
+
+ # The (optional) name will be used in the varnishstat command for the
+ # -n argument and will add a varnish_name:$instancename tag to all metrics
+ # (varnish_name:default is used when no name is set).
+# name: myvarnishinstance
+
+ # The (optional) list of tags will be applied to every emitted metric.
+# tags:
+# - instance:production
+
+ # The (optional) path to the varnishadm binary will signal the check to
+ # emit a service check status on backend health using `debug.health`.
+ # The service check will be tagged by backend. NOTE: The Agent must
+ # be able to access the Varnish secretfile for this to work.
+# varnishadm: /usr/bin/varnishadm
+
+ # The (optional) path to the varnish secretfile will be used in the
+ # varnishadm command, if enabled. Defaults to /etc/varnish/secret.
+# secretfile: /etc/varnish/secret
diff --git a/tests/test_varnish.py b/tests/test_varnish.py
index 29d5261f5b..5074b2fef7 100644
--- a/tests/test_varnish.py
+++ b/tests/test_varnish.py
@@ -1,8 +1,9 @@
-import logging
import os
import time
import unittest
+from nose.plugins.attrib import attr
+
from tests.common import get_check
@@ -1853,7 +1854,7 @@ def setUp(self):
"""
- def testParsing(self):
+ def test_parsing(self):
v, instances = get_check('varnish', self.config)
v._parse_varnishstat(self.v_dump, False)
metrics = v.get_metrics()
@@ -1868,7 +1869,7 @@ def testParsing(self):
if m[0] == "varnish.SMA.s0.g_space"][0], 120606)
assert "varnish.SMA.transient.c_bytes" not in [m[0] for m in metrics]
- def testCheck(self):
+ def test_check(self):
v, instances = get_check('varnish', self.config)
import pprint
try:
@@ -1879,5 +1880,36 @@ def testCheck(self):
except Exception:
pass
+    def test_service_check(self):
+        """Sick backends must yield one CRITICAL service check each."""
+        varnishadm_dump = """
+Backend b0 is Sick
+Current states good: 0 threshold: 3 window: 5
+Average responsetime of good probes: 0.000000
+Oldest Newest
+================================================================
+4444444444444444444444444444444444444444444444444444444444444444 Good IPv4
+XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX Good Xmit
+RRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRR Good Recv
+---------------------------------------------------------------- Happy
+Backend b1 is Sick
+Current states good: 0 threshold: 3 window: 5
+Average responsetime of good probes: 0.000000
+Oldest Newest
+================================================================
+---------------------------------------------------------------- Happy
+        """
+        v, instances = get_check('varnish', self.config)
+        v._parse_varnishadm(varnishadm_dump)
+        service_checks = v.get_service_checks()
+        self.assertEquals(len(service_checks), 2)
+
+        b0_check = service_checks[0]
+        self.assertEquals(b0_check['check'], v.SERVICE_CHECK_NAME)
+        self.assertEquals(b0_check['tags'], ['backend:b0'])
+        # The status must reflect the "Sick" state, not merely exist.
+        self.assertEquals(b0_check['status'], v.CRITICAL)
+
+        b1_check = service_checks[1]
+        self.assertEquals(b1_check['check'], v.SERVICE_CHECK_NAME)
+        self.assertEquals(b1_check['tags'], ['backend:b1'])
+        self.assertEquals(b1_check['status'], v.CRITICAL)
+
if __name__ == '__main__':
unittest.main()