Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

redisdb: add replication service check and 'down since' metric #1329

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -30,3 +30,4 @@ Vagrantfile
.vagrant/*
embedded/*
.pip-cache/*
dump.rdb
15 changes: 15 additions & 0 deletions checks.d/redisdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -215,6 +215,21 @@ def _check_db(self, instance, custom_tags=None):
self.warning("{0} key not found in redis".format(key))
self.gauge('redis.key.length', 0, tags=key_tags)

self._check_replication(info, tags)

def _check_replication(self, info, tags):
    """Report the state of this instance's replication link to its master.

    Sends the ``custom_check.redis.replication.master_link_status`` service
    check (OK when the link status is ``'up'``, CRITICAL otherwise) and the
    ``redis.replication.master_link_down_since_seconds`` gauge (0 while the
    link is up). Nothing is reported when ``info`` carries no
    ``master_link_status`` entry.

    :param info: mapping of stats for the instance; presumably the parsed
        reply of the Redis INFO command -- confirm against the caller.
    :param tags: tags attached to both the service check and the gauge.
    """
    status_key = 'master_link_status'
    if status_key not in info:
        # No replication link is reported for this instance: nothing to check.
        return

    if info[status_key] == 'up':
        status = AgentCheck.OK
        down_seconds = 0
    else:
        status = AgentCheck.CRITICAL
        # Tolerate a missing counter: a KeyError here would abort the whole
        # check before the CRITICAL status below is ever reported.
        down_seconds = info.get('master_link_down_since_seconds')

    self.service_check('custom_check.redis.replication.master_link_status', status, tags=tags)
    if down_seconds is not None:
        self.gauge('redis.replication.master_link_down_since_seconds', down_seconds, tags=tags)

def check(self, instance):
if (not "host" in instance or not "port" in instance) and not "unix_socket_path" in instance:
raise Exception("You must specify a host/port couple or a unix_socket_path")
Expand Down
4 changes: 4 additions & 0 deletions ci/redis.rb
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,10 @@ def redis_rootdir
$TRAVIS_BUILD_DIR/ci/resources/redis/auth.conf)
sh %(#{redis_rootdir}/src/redis-server\
$TRAVIS_BUILD_DIR/ci/resources/redis/noauth.conf)
sh %(#{redis_rootdir}/src/redis-server\
$TRAVIS_BUILD_DIR/ci/resources/redis/slave_healthy.conf)
sh %(#{redis_rootdir}/src/redis-server\
$TRAVIS_BUILD_DIR/ci/resources/redis/slave_unhealthy.conf)
end

task :script => ['ci:common:script'] do
Expand Down
5 changes: 5 additions & 0 deletions ci/resources/redis/slave_healthy.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# CI Redis slave that replicates from the instance on 127.0.0.1:16379.
daemonize yes
# Dedicated pid file: sharing one path with another daemonized CI instance
# would make the instances overwrite each other's pid.
pidfile /tmp/dd-redis-slave-healthy.pid
bind 127.0.0.1
port 36379
slaveof 127.0.0.1 16379
5 changes: 5 additions & 0 deletions ci/resources/redis/slave_unhealthy.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# CI Redis slave pointed at 127.0.0.1:55555.
# NOTE(review): presumably no Redis server listens on that port, so the
# master link stays down -- confirm.
daemonize yes
# Dedicated pid file: sharing one path with another daemonized CI instance
# would make the instances overwrite each other's pid.
pidfile /tmp/dd-redis-slave-unhealthy.pid
bind 127.0.0.1
port 46379
slaveof 127.0.0.1 55555
53 changes: 53 additions & 0 deletions tests/test_redis.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from checks import AgentCheck
import logging
import unittest
from nose.plugins.attrib import attr
Expand All @@ -11,6 +12,8 @@
MAX_WAIT = 20
# Ports of the Redis instances used by the tests.
# NOTE(review): NOAUTH/AUTH presumably match ci/resources/redis/noauth.conf
# and auth.conf -- confirm.
NOAUTH_PORT = 16379
AUTH_PORT = 26379
# Same port as ci/resources/redis/slave_healthy.conf (slave of 127.0.0.1:16379).
SLAVE_HEALTHY_PORT = 36379
# Same port as ci/resources/redis/slave_unhealthy.conf (slave of 127.0.0.1:55555).
SLAVE_UNHEALTHY_PORT = 46379
DEFAULT_PORT = 6379
MISSING_KEY_TOLERANCE = 0.5

Expand Down Expand Up @@ -129,6 +132,56 @@ def assert_key_present(expected, present, tolerance):
keys = [m[0] for m in metrics]
assert 'redis.net.commands' in keys

def test_redis_replication_link_metric(self):
    """Check the master-link 'down since' gauge on a healthy and an unhealthy slave.

    The healthy slave must report 0; the unhealthy one must report a
    strictly positive number of seconds.
    """
    metric_name = 'redis.replication.master_link_down_since_seconds'
    r = load_check('redisdb', {}, {})

    def extract_metric(instance):
        """Run the check against ``instance``; return its first ``metric_name`` metric, or None."""
        r.check(instance)
        metrics = [m for m in r.get_metrics() if m[0] == metric_name]
        return metrics[0] if metrics else None

    # Healthy host
    metric = extract_metric({
        'host': 'localhost',
        'port': SLAVE_HEALTHY_PORT
    })
    self.assertTrue(metric, "%s metric not returned" % metric_name)
    self.assertEqual(metric[2], 0, "Value of %s should be 0" % metric_name)

    # Unhealthy host
    time.sleep(5)  # Give time for the replication failure metrics to build up
    metric = extract_metric({
        'host': 'localhost',
        'port': SLAVE_UNHEALTHY_PORT
    })
    # Fail with a readable message instead of a TypeError when the metric is missing.
    self.assertTrue(metric, "%s metric not returned" % metric_name)
    self.assertTrue(metric[2] > 0, "Value of %s should be greater than 0" % metric_name)

def test_redis_replication_service_check(self):
    """Check the master-link service check on a healthy and an unhealthy slave.

    The healthy slave must report OK; the unhealthy one must report CRITICAL.
    """
    check_name = 'custom_check.redis.replication.master_link_status'
    r = load_check('redisdb', {}, {})

    def extract_check(instance):
        """Run the check against ``instance``; return its first ``check_name`` service check, or None."""
        r.check(instance)
        checks = [c for c in r.get_service_checks() if c['check'] == check_name]
        return checks[0] if checks else None

    # Healthy host
    # NOTE(review): pause before querying, presumably so that each slave's
    # link status has settled -- confirm the intent of this delay.
    time.sleep(5)
    check = extract_check({
        'host': 'localhost',
        'port': SLAVE_HEALTHY_PORT
    })
    self.assertTrue(check, "%s service check not returned" % check_name)
    self.assertEqual(check['status'], AgentCheck.OK, "Value of %s service check should be OK" % check_name)

    # Unhealthy host
    check = extract_check({
        'host': 'localhost',
        'port': SLAVE_UNHEALTHY_PORT
    })
    # Fail with a readable message instead of a TypeError when the check is missing.
    self.assertTrue(check, "%s service check not returned" % check_name)
    self.assertEqual(check['status'], AgentCheck.CRITICAL, "Value of %s service check should be CRITICAL" % check_name)

def _sort_metrics(self, metrics):
def sort_by(m):
return m[0], m[1], m[3]
Expand Down