Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add a varnish service check. #1213

Merged
merged 1 commit into from
Dec 10, 2014
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
187 changes: 133 additions & 54 deletions checks.d/varnish.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,29 @@
# stdlib
import xml.parsers.expat # python 2.4 compatible
from collections import defaultdict
import re
import subprocess
import xml.parsers.expat # python 2.4 compatible

# project
from checks import AgentCheck


class BackendStatus(object):
    """Backend health states and their mapping to Agent service check statuses."""

    HEALTHY = 'healthy'
    SICK = 'sick'
    ALL = (HEALTHY, SICK)

    @classmethod
    def to_check_status(cls, status):
        """Translate a backend state string into an AgentCheck status.

        `SICK` maps to `AgentCheck.CRITICAL`, `HEALTHY` maps to
        `AgentCheck.OK`; any other value maps to `AgentCheck.UNKNOWN`.
        """
        if status == cls.SICK:
            return AgentCheck.CRITICAL
        if status == cls.HEALTHY:
            return AgentCheck.OK
        # Anything that is not one of the known states.
        return AgentCheck.UNKNOWN

class Varnish(AgentCheck):
SERVICE_CHECK_NAME = 'varnish.backend_healthy'

# XML parsing bits, a.k.a. Kafka in Code
def _reset(self):
self._current_element = ""
Expand Down Expand Up @@ -47,39 +64,6 @@ def _char_data(self, data):
self._current_str = data

def check(self, instance):
"""Extract stats from varnishstat -x

The text option (-1) is not reliable enough when counters get large.
VBE.media_video_prd_services_01(10.93.67.16,,8080).happy18446744073709551615

2 types of data, "a" for counter ("c" in newer versions of varnish), "i" for gauge ("g")
https://github.com/varnish/Varnish-Cache/blob/master/include/tbl/vsc_fields.h

Bitmaps are not supported.

<varnishstat>
<stat>
<name>fetch_304</name>
<value>0</value>
<flag>a</flag>
<description>Fetch no body (304)</description>
</stat>
<stat>
<name>n_sess_mem</name>
<value>334</value>
<flag>i</flag>
<description>N struct sess_mem</description>
</stat>
<stat>
<type>LCK</type>
<ident>vcl</ident>
<name>creat</name>
<value>1</value>
<flag>a</flag>
<description>Created locks</description>
</stat>
</varnishstat>
"""
# Not configured? Not a problem.
if instance.get("varnishstat", None) is None:
raise Exception("varnishstat is not configured")
Expand All @@ -88,16 +72,47 @@ def check(self, instance):
tags = []
else:
tags = list(set(tags))
varnishstat_path = instance.get("varnishstat")
name = instance.get('name')

# Get version and version-specific args from varnishstat -V.
version, use_xml = self._get_version_info(varnishstat_path)

# Parse metrics from varnishstat.
arg = '-x' if use_xml else '-1'
cmd = [varnishstat_path, arg]

if name is not None:
cmd.extend(['-n', name])
tags += [u'varnish_name:%s' % name]
else:
tags += [u'varnish_name:default']
proc = subprocess.Popen(cmd, stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
output, error = proc.communicate()
if error and len(error) > 0:
self.log.error(error)
self._parse_varnishstat_metrics(varnishstat_path, use_xml, tags)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

you should pass "output" instead of "varnishstat_path" here.
See pullrequest #1377


# Parse service checks from varnishadm.
varnishadm_path = instance.get('varnishadm')
if varnishadm_path:
secretfile_path = instance.get('secretfile', '/etc/varnish/secret')
varnishadm_path = 'sudo %s' % varnishadm_path
cmd = [varnishadm_path, '-S', secretfile_path, 'debug.health']
proc = subprocess.Popen(cmd, stdout=subprocess.PIPE)
output, _ = proc.communicate()
if output:
self._parse_varnishadm(output)

def _get_version_info(self, varnishstat_path):
# Get the varnish version from varnishstat
output, error = subprocess.Popen([instance.get("varnishstat"), "-V"],
output, error = subprocess.Popen([varnishstat_path, "-V"],
stdout=subprocess.PIPE,
stderr=subprocess.PIPE).communicate()

# Assumptions regarding varnish's version
use_xml = True
arg = "-x" # varnishstat argument
version = 3

m1 = re.search(r"varnish-(\d+)", output, re.MULTILINE)
Expand All @@ -118,26 +133,44 @@ def check(self, instance):
# Location of varnishstat
if version <= 2:
use_xml = False
arg = "-1"

cmd = [instance.get("varnishstat"), arg]
if name is not None:
cmd.extend(['-n', name])
tags += [u'varnish_name:%s' % name]
else:
tags += [u'varnish_name:default']
try:
proc = subprocess.Popen(cmd, stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
output, error = proc.communicate()
except Exception:
self.log.error(u"Failed to run %s" % repr(cmd))
raise
if error and len(error) > 0:
self.log.error(error)
self._parse_varnishstat(output, use_xml, tags)
return version, use_xml

def _parse_varnishstat(self, output, use_xml, tags=None):
"""Extract stats from varnishstat -x

The text option (-1) is not reliable enough when counters get large.
VBE.media_video_prd_services_01(10.93.67.16,,8080).happy18446744073709551615

2 types of data, "a" for counter ("c" in newer versions of varnish), "i" for gauge ("g")
https://github.com/varnish/Varnish-Cache/blob/master/include/tbl/vsc_fields.h

Bitmaps are not supported.

Example XML output (with `use_xml=True`)
<varnishstat>
<stat>
<name>fetch_304</name>
<value>0</value>
<flag>a</flag>
<description>Fetch no body (304)</description>
</stat>
<stat>
<name>n_sess_mem</name>
<value>334</value>
<flag>i</flag>
<description>N struct sess_mem</description>
</stat>
<stat>
<type>LCK</type>
<ident>vcl</ident>
<name>creat</name>
<value>1</value>
<flag>a</flag>
<description>Created locks</description>
</stat>
</varnishstat>
"""
tags = tags or []
if use_xml:
p = xml.parsers.expat.ParserCreate()
Expand Down Expand Up @@ -165,4 +198,50 @@ def _parse_varnishstat(self, output, use_xml, tags=None):
self.log.debug("Varnish (rate) %s %d" % (metric_name, int(gauge_val)))
self.rate(metric_name, float(gauge_val), tags=tags)


def _parse_varnishadm(self, output):
    """Parse out service checks from varnishadm.

    Every backend section starts with a ``Backend <name> is <State>`` line
    and is followed by a ``Current states ...`` line. The backend name is
    the second word of the first line and the state is its last word
    (lower-cased). The service check message is the ``Current`` line with
    its first two words removed. One service check named
    ``SERVICE_CHECK_NAME`` is emitted per backend, tagged
    ``backend:<name>``; the state is translated with
    ``BackendStatus.to_check_status``.

    Example output:

        Backend b0 is Sick
        Current states  good:  2 threshold:  3 window:  5
        Average responsetime of good probes: 0.000000
        Oldest                                                    Newest
        ================================================================
        -------------------------------------------------------------444 Good IPv4
        -------------------------------------------------------------XXX Good Xmit
        -------------------------------------------------------------RRR Good Recv
        ----------------------------------------------------------HHH--- Happy
        Backend b1 is Sick
        Current states  good:  2 threshold:  3 window:  5
        Average responsetime of good probes: 0.000000
        Oldest                                                    Newest
        ================================================================
        ----------------------------------------------------------HHH--- Happy

    :param output: raw text printed by ``varnishadm ... debug.health``
    """
    # Process status by backend.
    backends_by_status = defaultdict(list)
    backend, status = None, None
    for line in output.split("\n"):
        # str.split(' ') always returns at least one token, so tokens[0]
        # is safe even for blank lines.
        tokens = line.strip().split(' ')
        if tokens[0] == 'Backend':
            if len(tokens) < 2:
                # Malformed header: forget the current backend so a
                # following "Current" line is not attributed to it.
                backend, status = None, None
                continue
            backend = tokens[1]
            # The state is the last word ("Backend b0 is Sick"), not the
            # backend name at index 1.
            status = tokens[-1].lower()
        elif tokens[0] == 'Current' and backend is not None:
            message = ' '.join(tokens[2:]).strip()
            backends_by_status[status].append((backend, message))

    # .items() works on both Python 2 and Python 3.
    for status, backends in backends_by_status.items():
        check_status = BackendStatus.to_check_status(status)
        for backend, message in backends:
            tags = ['backend:%s' % backend]
            self.service_check(self.SERVICE_CHECK_NAME, check_status,
                               tags=tags, message=message)

35 changes: 25 additions & 10 deletions conf.d/varnish.yaml.example
Original file line number Diff line number Diff line change
@@ -1,13 +1,28 @@
init_config:

instances:
# - varnishstat: (required) String path to varnishstat binary
# name: (optional) String name of varnish instance. Passed to the -n parameter of varnishstat. Will also tag each metric with this name.
# tags: (optional) Additional tags to tag each metric with
#
# Example:
#
- varnishstat: /usr/bin/varnishstat
name: myvarnishinstance
tags:
- instance:production
# The full path to the varnishstat binary
# - varnishstat: /usr/bin/varnishstat

# The (optional) name will be used in the varnishstat command for the
# -n argument and will add a varnish_name:$instancename tag to all metrics.
# name: myvarnishinstance

# The (optional) list of tags will be applied to every emitted metric.
# tags:
# - instance:production

# The (optional) path to the varnishadm binary will signal the check to
# emit a service check status on backend health using `debug.health`.
# The service check will be tagged by backend.
# NOTE: The Agent must be able to run varnishadm with root
# privileges. You can configure your sudoers file for this:
#
# example /etc/sudoers entry:
# dd-agent ALL=(ALL) NOPASSWD:/usr/bin/varnishadm
#
# varnishadm: /usr/bin/varnishadm

# The (optional) path to the varnish secretfile will be used in the
# varnishadm command, if enabled.
# secretfile: /etc/varnish/secret
38 changes: 35 additions & 3 deletions tests/test_varnish.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
import logging
import os
import time
import unittest

from nose.plugins.attrib import attr

from tests.common import get_check


Expand Down Expand Up @@ -1853,7 +1854,7 @@ def setUp(self):
"""


def testParsing(self):
def test_parsing(self):
v, instances = get_check('varnish', self.config)
v._parse_varnishstat(self.v_dump, False)
metrics = v.get_metrics()
Expand All @@ -1868,7 +1869,7 @@ def testParsing(self):
if m[0] == "varnish.SMA.s0.g_space"][0], 120606)
assert "varnish.SMA.transient.c_bytes" not in [m[0] for m in metrics]

def testCheck(self):
def test_check(self):
v, instances = get_check('varnish', self.config)
import pprint
try:
Expand All @@ -1879,5 +1880,36 @@ def testCheck(self):
except Exception:
pass

def test_service_check(self):
    """Check that _parse_varnishadm emits one service check per backend.

    Feeds a two-backend `debug.health` dump to the check and verifies
    the name and the `backend:<name>` tag of each emitted service check.
    """
    varnishadm_dump = """
Backend b0 is Sick
Current states good: 0 threshold: 3 window: 5
Average responsetime of good probes: 0.000000
Oldest Newest
================================================================
4444444444444444444444444444444444444444444444444444444444444444 Good IPv4
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX Good Xmit
RRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRR Good Recv
---------------------------------------------------------------- Happy
Backend b1 is Sick
Current states good: 0 threshold: 3 window: 5
Average responsetime of good probes: 0.000000
Oldest Newest
================================================================
---------------------------------------------------------------- Happy
"""
    v, instances = get_check('varnish', self.config)
    v._parse_varnishadm(varnishadm_dump)
    service_checks = v.get_service_checks()
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(len(service_checks), 2)

    b0_check = service_checks[0]
    self.assertEqual(b0_check['check'], v.SERVICE_CHECK_NAME)
    self.assertEqual(b0_check['tags'], ['backend:b0'])

    b1_check = service_checks[1]
    self.assertEqual(b1_check['check'], v.SERVICE_CHECK_NAME)
    self.assertEqual(b1_check['tags'], ['backend:b1'])

if __name__ == '__main__':
unittest.main()