Skip to content

Commit

Permalink
Merge pull request #1447 from DataDog/remh/repl_sc
Browse files: browse the repository at this point in the history
[Redis] add replication service check and 'down since' metric
  • Loading branch information
Remi Hakim committed Mar 19, 2015
2 parents 68d7e4a + 721e6f5 commit 2619a7c
Show file tree
Hide file tree
Showing 6 changed files with 90 additions and 1 deletion.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -30,3 +30,4 @@ Vagrantfile
.vagrant/*
embedded/*
.pip-cache/*
dump.rdb
23 changes: 22 additions & 1 deletion checks.d/redisdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,16 @@
# NOTE(review): the code using these two slowlog constants is outside this
# hunk; presumably 128 is the fallback when the "slowlog-max-len" setting
# cannot be read from the server -- confirm against _check_slowlog.
DEFAULT_MAX_SLOW_ENTRIES = 128
MAX_SLOW_ENTRIES_KEY = "slowlog-max-len"

# Keys looked up in the `info` dict by Redis._check_replication:
# - REPL_KEY: master link state; the value 'up' is reported as OK, any other
#   value as CRITICAL.
# - LINK_DOWN_KEY: read for the "down since seconds" gauge when the link is
#   not 'up'.
REPL_KEY = 'master_link_status'
LINK_DOWN_KEY = 'master_link_down_since_seconds'

class Redis(AgentCheck):
db_key_pattern = re.compile(r'^db\d+')
slave_key_pattern = re.compile(r'^slave\d+')
subkeys = ['keys', 'expires']



SOURCE_TYPE_NAME = 'redis'

GAUGE_KEYS = {
Expand Down Expand Up @@ -230,6 +235,11 @@ def _check_db(self, instance, custom_tags=None):
self.warning("{0} key not found in redis".format(key))
self.gauge('redis.key.length', 0, tags=key_tags)


self._check_replication(info, tags)

def _check_replication(self, info, tags):

# Save the replication delay for each slave
for key in info:
if self.slave_key_pattern.match(key) and isinstance(info[key], dict):
Expand All @@ -244,7 +254,18 @@ def _check_db(self, instance, custom_tags=None):
slave_tags.append('slave_{0}:{1}'.format(slave_tag, info[key][slave_tag]))
slave_tags.append('slave_id:%s' % key.lstrip('slave'))
self.gauge('redis.replication.delay', delay, tags=slave_tags)


if REPL_KEY in info:
if info[REPL_KEY] == 'up':
status = AgentCheck.OK
down_seconds = 0
else:
status = AgentCheck.CRITICAL
down_seconds = info[LINK_DOWN_KEY]

self.service_check('redis.replication.master_link_status', status, tags=tags)
self.gauge('redis.replication.master_link_down_since_seconds', down_seconds, tags=tags)


def _check_slowlog(self, instance, custom_tags):
"""Retrieve length and entries from Redis' SLOWLOG
Expand Down
4 changes: 4 additions & 0 deletions ci/redis.rb
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,10 @@ def redis_rootdir
$TRAVIS_BUILD_DIR/ci/resources/redis/auth.conf)
sh %(#{redis_rootdir}/src/redis-server\
$TRAVIS_BUILD_DIR/ci/resources/redis/noauth.conf)
sh %(#{redis_rootdir}/src/redis-server\
$TRAVIS_BUILD_DIR/ci/resources/redis/slave_healthy.conf)
sh %(#{redis_rootdir}/src/redis-server\
$TRAVIS_BUILD_DIR/ci/resources/redis/slave_unhealthy.conf)
end

task :script => ['ci:common:script'] do
Expand Down
5 changes: 5 additions & 0 deletions ci/resources/redis/slave_healthy.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Redis instance on port 36379 configured as a replica of 127.0.0.1:16379
# (the port the tests call NOAUTH_PORT); tests reach it as SLAVE_HEALTHY_PORT.
# NOTE(review): the pidfile path below is identical to the one in
# slave_unhealthy.conf, and its name refers to the noauth instance -- confirm
# the daemons do not overwrite each other's pid.
daemonize yes
pidfile /tmp/dd-redis-noauth.pid
bind 127.0.0.1
port 36379
slaveof 127.0.0.1 16379
5 changes: 5 additions & 0 deletions ci/resources/redis/slave_unhealthy.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Redis instance on port 46379 configured as a replica of 127.0.0.1:55555;
# tests reach it as SLAVE_UNHEALTHY_PORT and expect its master link to be down.
# NOTE(review): presumably nothing listens on 55555 in CI -- confirm.
# NOTE(review): the pidfile path below is identical to the one in
# slave_healthy.conf, and its name refers to the noauth instance -- confirm
# the daemons do not overwrite each other's pid.
daemonize yes
pidfile /tmp/dd-redis-noauth.pid
bind 127.0.0.1
port 46379
slaveof 127.0.0.1 55555
53 changes: 53 additions & 0 deletions tests/test_redis.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from checks import AgentCheck
import logging
import unittest
from nose.plugins.attrib import attr
Expand All @@ -12,6 +13,8 @@
# NOTE(review): MAX_WAIT and MISSING_KEY_TOLERANCE are used outside this hunk;
# their units/meaning are not visible here.
MAX_WAIT = 20
# Ports of the Redis instances the tests connect to on localhost.
NOAUTH_PORT = 16379
AUTH_PORT = 26379
# Replica declared in ci/resources/redis/slave_healthy.conf (slaveof 127.0.0.1 16379)
SLAVE_HEALTHY_PORT = 36379
# Replica declared in ci/resources/redis/slave_unhealthy.conf (slaveof 127.0.0.1 55555)
SLAVE_UNHEALTHY_PORT = 46379
DEFAULT_PORT = 6379
MISSING_KEY_TOLERANCE = 0.5

Expand Down Expand Up @@ -139,6 +142,56 @@ def assert_key_present(expected, present, tolerance):
keys = [m[0] for m in metrics]
assert 'redis.net.commands' in keys

def test_redis_replication_link_metric(self):
    # Verifies the 'master link down since' gauge: it must be 0 on the healthy
    # replica and strictly positive on the replica whose master link is down.
    # (Plain comments rather than a docstring so test runners keep reporting
    # the method name.)
    metric_name = 'redis.replication.master_link_down_since_seconds'
    r = load_check('redisdb', {}, {})

    def extract_metric(instance):
        """Run the check on `instance`; return the first `metric_name` sample, or None."""
        r.check(instance)
        metrics = [m for m in r.get_metrics() if m[0] == metric_name]
        return metrics[0] if metrics else None

    # Healthy host
    metric = extract_metric({
        'host': 'localhost',
        'port': SLAVE_HEALTHY_PORT
    })
    self.assertTrue(metric is not None, "%s metric not returned" % metric_name)
    self.assertEqual(metric[2], 0, "Value of %s should be 0" % metric_name)

    # Unhealthy host
    time.sleep(5)  # Give time for the replication failure metrics to build up
    metric = extract_metric({
        'host': 'localhost',
        'port': SLAVE_UNHEALTHY_PORT
    })
    # Guard first: without it a missing sample surfaces as a TypeError on
    # `None[2]` instead of a readable assertion failure.
    self.assertTrue(metric is not None, "%s metric not returned" % metric_name)
    self.assertTrue(metric[2] > 0, "Value of %s should be greater than 0" % metric_name)

def test_redis_replication_service_check(self):
    # Verifies the master-link service check: OK on the healthy replica,
    # CRITICAL on the replica whose master link is down.
    # (Plain comments rather than a docstring so test runners keep reporting
    # the method name.)
    check_name = 'redis.replication.master_link_status'
    r = load_check('redisdb', {}, {})

    def extract_check(instance):
        """Run the check on `instance`; return the first `check_name` service check, or None."""
        r.check(instance)
        checks = [c for c in r.get_service_checks() if c['check'] == check_name]
        return checks[0] if checks else None

    # Healthy host
    # NOTE(review): presumably this pause lets the healthy replica finish
    # connecting to its master before it is queried -- confirm.
    time.sleep(5)
    check = extract_check({
        'host': 'localhost',
        'port': SLAVE_HEALTHY_PORT
    })
    self.assertTrue(check is not None, "%s service check not returned" % check_name)
    self.assertEqual(check['status'], AgentCheck.OK, "Value of %s service check should be OK" % check_name)

    # Unhealthy host
    check = extract_check({
        'host': 'localhost',
        'port': SLAVE_UNHEALTHY_PORT
    })
    # Guard first: without it a missing service check surfaces as a TypeError
    # on `None['status']` instead of a readable assertion failure.
    self.assertTrue(check is not None, "%s service check not returned" % check_name)
    self.assertEqual(check['status'], AgentCheck.CRITICAL, "Value of %s service check should be CRITICAL" % check_name)

def test_redis_repl(self):
master_instance = {
'host': 'localhost',
Expand Down

0 comments on commit 2619a7c

Please sign in to comment.