Add replication-link monitoring (service check + gauge) to the redisdb check.

Reviewer notes:
- Line breaks and indentation were reconstructed from a whitespace-collapsed
  copy of this patch; verify context-line indentation against the target tree
  before applying.
- The post-image blob ids on the `index` lines predate the review fixes below
  and are informational only.
- Each slave config now writes its own pidfile; make sure any CI cleanup step
  also handles /tmp/dd-redis-slave-healthy.pid and
  /tmp/dd-redis-slave-unhealthy.pid.

diff --git a/.gitignore b/.gitignore
index 0c30821f4e..6cf39f7be9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -30,3 +30,4 @@ Vagrantfile
 .vagrant/*
 embedded/*
 .pip-cache/*
+dump.rdb
diff --git a/checks.d/redisdb.py b/checks.d/redisdb.py
index 95dc3f9d52..19954448ef 100644
--- a/checks.d/redisdb.py
+++ b/checks.d/redisdb.py
@@ -215,6 +215,33 @@ def _check_db(self, instance, custom_tags=None):
                         self.warning("{0} key not found in redis".format(key))
                         self.gauge('redis.key.length', 0, tags=key_tags)
 
+        self._check_replication(info, tags)
+
+    def _check_replication(self, info, tags):
+        """Report the state of the replication link between a slave and its master.
+
+        Does nothing when `info` has no 'master_link_status' entry. Otherwise
+        sends a service check (OK when the status is 'up', CRITICAL for any
+        other value) and, when known, a gauge of how many seconds the link
+        has been down (0 while it is up).
+        """
+        status_key = 'master_link_status'
+        if status_key not in info:
+            return
+
+        if info[status_key] == 'up':
+            status = AgentCheck.OK
+            down_seconds = 0
+        else:
+            status = AgentCheck.CRITICAL
+            # Use .get() so a missing counter cannot raise KeyError and
+            # swallow the CRITICAL service check sent below.
+            down_seconds = info.get('master_link_down_since_seconds')
+
+        self.service_check('custom_check.redis.replication.master_link_status', status, tags=tags)
+        if down_seconds is not None:
+            self.gauge('redis.replication.master_link_down_since_seconds', down_seconds, tags=tags)
+
     def check(self, instance):
         if (not "host" in instance or not "port" in instance) and not "unix_socket_path" in instance:
             raise Exception("You must specify a host/port couple or a unix_socket_path")
diff --git a/ci/redis.rb b/ci/redis.rb
index eedbadf3c9..4d05cd8e66 100644
--- a/ci/redis.rb
+++ b/ci/redis.rb
@@ -31,6 +31,10 @@ def redis_rootdir
            $TRAVIS_BUILD_DIR/ci/resources/redis/auth.conf)
       sh %(#{redis_rootdir}/src/redis-server\
            $TRAVIS_BUILD_DIR/ci/resources/redis/noauth.conf)
+      sh %(#{redis_rootdir}/src/redis-server\
+           $TRAVIS_BUILD_DIR/ci/resources/redis/slave_healthy.conf)
+      sh %(#{redis_rootdir}/src/redis-server\
+           $TRAVIS_BUILD_DIR/ci/resources/redis/slave_unhealthy.conf)
     end
 
     task :script => ['ci:common:script'] do
diff --git a/ci/resources/redis/slave_healthy.conf b/ci/resources/redis/slave_healthy.conf
new file mode 100644
index 0000000000..1b0ea9c5c3
--- /dev/null
+++ b/ci/resources/redis/slave_healthy.conf
@@ -0,0 +1,5 @@
+daemonize yes
+pidfile /tmp/dd-redis-slave-healthy.pid
+bind 127.0.0.1
+port 36379
+slaveof 127.0.0.1 16379
diff --git a/ci/resources/redis/slave_unhealthy.conf b/ci/resources/redis/slave_unhealthy.conf
new file mode 100644
index 0000000000..afd3e3f0c6
--- /dev/null
+++ b/ci/resources/redis/slave_unhealthy.conf
@@ -0,0 +1,5 @@
+daemonize yes
+pidfile /tmp/dd-redis-slave-unhealthy.pid
+bind 127.0.0.1
+port 46379
+slaveof 127.0.0.1 55555
diff --git a/tests/test_redis.py b/tests/test_redis.py
index d9ca3c2e4b..b9f3e93278 100644
--- a/tests/test_redis.py
+++ b/tests/test_redis.py
@@ -1,3 +1,4 @@
+from checks import AgentCheck
 import logging
 import unittest
 from nose.plugins.attrib import attr
@@ -11,6 +12,8 @@
 MAX_WAIT = 20
 NOAUTH_PORT = 16379
 AUTH_PORT = 26379
+SLAVE_HEALTHY_PORT = 36379
+SLAVE_UNHEALTHY_PORT = 46379
 DEFAULT_PORT = 6379
 MISSING_KEY_TOLERANCE = 0.5
 
@@ -129,6 +132,62 @@ def assert_key_present(expected, present, tolerance):
         keys = [m[0] for m in metrics]
         assert 'redis.net.commands' in keys
 
+    def test_redis_replication_link_metric(self):
+        # The link-down gauge must be 0 on the healthy slave and positive on the broken one.
+        metric_name = 'redis.replication.master_link_down_since_seconds'
+        r = load_check('redisdb', {}, {})
+
+        def extract_metric(instance):
+            r.check(instance)
+            metrics = [m for m in r.get_metrics() if m[0] == metric_name]
+            return metrics[0] if metrics else None
+
+        # Healthy host
+        metric = extract_metric({
+            'host': 'localhost',
+            'port': SLAVE_HEALTHY_PORT
+        })
+        assert metric, "%s metric not returned" % metric_name
+        self.assertEqual(metric[2], 0, "Value of %s should be 0" % metric_name)
+
+        # Unhealthy host
+        time.sleep(5)  # Give time for the replication failure metrics to build up
+        metric = extract_metric({
+            'host': 'localhost',
+            'port': SLAVE_UNHEALTHY_PORT
+        })
+        assert metric, "%s metric not returned" % metric_name
+        self.assertTrue(metric[2] > 0, "Value of %s should be greater than 0" % metric_name)
+
+    def test_redis_replication_service_check(self):
+        # The service check must be OK on the healthy slave and CRITICAL on the broken one.
+        check_name = 'custom_check.redis.replication.master_link_status'
+        r = load_check('redisdb', {}, {})
+
+        def extract_check(instance):
+            r.check(instance)
+            checks = [c for c in r.get_service_checks() if c['check'] == check_name]
+            return checks[0] if checks else None
+
+        # Presumably gives both slaves time to settle into their link state before polling
+        time.sleep(5)
+
+        # Healthy host
+        check = extract_check({
+            'host': 'localhost',
+            'port': SLAVE_HEALTHY_PORT
+        })
+        assert check, "%s service check not returned" % check_name
+        self.assertEqual(check['status'], AgentCheck.OK, "Value of %s service check should be OK" % check_name)
+
+        # Unhealthy host
+        check = extract_check({
+            'host': 'localhost',
+            'port': SLAVE_UNHEALTHY_PORT
+        })
+        assert check, "%s service check not returned" % check_name
+        self.assertEqual(check['status'], AgentCheck.CRITICAL, "Value of %s service check should be CRITICAL" % check_name)
+
     def _sort_metrics(self, metrics):
         def sort_by(m):
             return m[0], m[1], m[3]