Skip to content

Commit

Permalink
Merge pull request #2432 from DataDog/yann/stripe-asf-mongo-leader-el…
Browse files Browse the repository at this point in the history
…ection-events

[mongo] Improve messaging&tags on MongoDB replset member state events
  • Loading branch information
yannmh committed Apr 21, 2016
2 parents fdfabc0 + f82e476 commit 70e303c
Show file tree
Hide file tree
Showing 2 changed files with 82 additions and 53 deletions.
115 changes: 62 additions & 53 deletions checks.d/mongo.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

# project
from checks import AgentCheck
from util import get_hostname
from urlparse import urlsplit

DEFAULT_TIMEOUT = 30
GAUGE = AgentCheck.gauge
Expand Down Expand Up @@ -46,27 +46,10 @@ class MongoDb(AgentCheck):
"""
SOURCE_TYPE_NAME = 'mongodb'

# Service checks
# Service check
SERVICE_CHECK_NAME = 'mongodb.can_connect'

# Metrics
"""
MongoDB replica set states, as documented at
https://docs.mongodb.org/manual/reference/replica-states/
"""
REPLSET_STATES = {
0: 'startup',
1: 'primary',
2: 'secondary',
3: 'recovering',
5: 'startup2',
6: 'unknown',
7: 'arbiter',
8: 'down',
9: 'rollback',
10: 'removed'
}

"""
Core metrics collected by default.
"""
Expand Down Expand Up @@ -382,6 +365,25 @@ class MongoDb(AgentCheck):
'top': TOP_METRICS,
}

# Replication states
"""
MongoDB replica set states, as documented at
https://docs.mongodb.org/manual/reference/replica-states/
"""
# Maps a numeric replset member state ID to a (short name, description)
# pair. The short name is what ends up in tags and event titles, so every
# short name is kept upper-case for consistency; the description is the
# human-readable text used in event messages.
REPLSET_MEMBER_STATES = {
    0: ('STARTUP', 'Starting Up'),
    1: ('PRIMARY', 'Primary'),
    2: ('SECONDARY', 'Secondary'),
    3: ('RECOVERING', 'Recovering'),
    4: ('FATAL', 'Fatal'),  # MongoDB docs don't list this state
    5: ('STARTUP2', 'Starting up (forking threads)'),
    6: ('UNKNOWN', 'Unknown to this replset member'),
    7: ('ARBITER', 'Arbiter'),
    8: ('DOWN', 'Down'),
    9: ('ROLLBACK', 'Rollback'),
    10: ('REMOVED', 'Removed'),
}

def __init__(self, name, init_config, agentConfig, instances=None):
AgentCheck.__init__(self, name, init_config, agentConfig, instances)

Expand All @@ -394,54 +396,60 @@ def __init__(self, name, init_config, agentConfig, instances=None):
def get_library_versions(self):
return {"pymongo": pymongo.version}

def _report_replica_set_state(self, state, clean_server_name, agentConfig):
def get_state_description(self, state):
    """Return the human-readable description of a replset member state ID.

    IDs that are not listed in REPLSET_MEMBER_STATES produce a message
    that embeds the numeric ID instead.
    """
    known = self.REPLSET_MEMBER_STATES.get(state)
    if known is None:
        return 'Replset state %d is unknown to the Datadog agent' % state
    # Entries are (short name, description) pairs.
    return known[1]

def get_state_name(self, state):
    """Return the short name of a replset member state ID.

    IDs that are not listed in REPLSET_MEMBER_STATES map to 'UNKNOWN'.
    """
    try:
        # Entries are (short name, description) pairs.
        return self.REPLSET_MEMBER_STATES[state][0]
    except KeyError:
        return 'UNKNOWN'

def _report_replica_set_state(self, state, clean_server_name, replSet, agentConfig):
"""
Report the member's replica set state
* Submit a service check.
* Create an event on state change.
"""
if self._last_state_by_server.get(clean_server_name, -1) != state:
self._last_state_by_server[clean_server_name] = state
return self.create_event(state, clean_server_name, agentConfig)

def create_event(self, state, clean_server_name, agentConfig):
last_state = self._last_state_by_server.get(clean_server_name, -1)
self._last_state_by_server[clean_server_name] = state
if last_state != state and last_state != -1:
return self.create_event(last_state, state, clean_server_name, replSet['set'], agentConfig)

def hostname_for_event(self, clean_server_name, agentConfig):
    """Return a reasonable hostname for a replset membership event to mention.

    The host is extracted from the network-location part of
    ``clean_server_name``. Any ``user:password@`` prefix and any ``:port``
    suffix are removed. When the server is addressed as ``localhost``, the
    check's own ``self.hostname`` is returned instead.

    ``agentConfig`` is accepted for interface compatibility and is not used.
    """
    netloc = urlsplit(clean_server_name).netloc
    # Strip credentials first: with "user:*****@host:27017", splitting on
    # ':' alone would return the username rather than the host.
    host_and_port = netloc.rpartition('@')[2]
    hostname = host_and_port.split(':')[0]
    if hostname == 'localhost':
        hostname = self.hostname
    return hostname

def create_event(self, last_state, state, clean_server_name, replset_name, agentConfig):
"""Create an event with a message describing the replication
state of a mongo node"""

def get_state_description(state):
if state == 0:
return 'Starting Up'
elif state == 1:
return 'Primary'
elif state == 2:
return 'Secondary'
elif state == 3:
return 'Recovering'
elif state == 4:
return 'Fatal'
elif state == 5:
return 'Starting up (forking threads)'
elif state == 6:
return 'Unknown'
elif state == 7:
return 'Arbiter'
elif state == 8:
return 'Down'
elif state == 9:
return 'Rollback'

status = get_state_description(state)
hostname = get_hostname(agentConfig)
msg_title = "%s is %s" % (clean_server_name, status)
msg = "MongoDB %s just reported as %s" % (clean_server_name, status)
status = self.get_state_description(state)
short_status = self.get_state_name(state)
last_short_status = self.get_state_name(last_state)
hostname = self.hostname_for_event(clean_server_name, agentConfig)
msg_title = "%s is %s for %s" % (hostname, short_status, replset_name)
msg = "MongoDB %s (%s) just reported as %s (%s) for %s; it was %s before." % (hostname, clean_server_name, status, short_status, replset_name, last_short_status)

self.event({
'timestamp': int(time.time()),
'event_type': 'Mongo',
'api_key': agentConfig.get('api_key', ''),
'msg_title': msg_title,
'msg_text': msg,
'host': hostname
'host': hostname,
'tags': [
'member_status:' + short_status,
'previous_member_status:' + last_short_status,
'replset:' + replset_name,
]
})

def _build_metric_list_to_collect(self, additional_metrics):
Expand Down Expand Up @@ -690,10 +698,11 @@ def total_seconds(td):
data['health'] = current['health']

data['state'] = replSet['myState']
tags.append('replset_state:%s' % self.REPLSET_STATES[data['state']])
tags.append('replset_state:%s' % self.get_state_name(data['state']))
self._report_replica_set_state(
data['state'],
clean_server_name,
replSet,
self.agentConfig)
status['replSet'] = data

Expand Down
20 changes: 20 additions & 0 deletions tests/checks/integration/test_mongo.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,26 @@ def test_metric_normalization(self):
self.assertEquals((RATE, 'mongodb.foobar.intent_exclusiveps'), resolve_metric('foobar.w', metrics_to_collect)) # noqa
self.assertEquals((GAUGE, 'mongodb.foobar.exclusive'), resolve_metric('foobar.W', metrics_to_collect)) # noqa

def test_state_translation(self):
    """
    Check that replset member state IDs resolve to the expected names and descriptions.
    """
    # Initialize check
    config = {
        'instances': [self.MONGODB_CONFIG]
    }
    self.load_check(config)

    # assertEqual is used rather than the deprecated assertEquals alias.
    self.assertEqual('STARTUP2', self.check.get_state_name(5))
    self.assertEqual('PRIMARY', self.check.get_state_name(1))

    self.assertEqual('Starting Up', self.check.get_state_description(0))
    self.assertEqual('Recovering', self.check.get_state_description(3))

    # Unknown states: the name falls back to 'UNKNOWN' and the description
    # must mention the numeric ID that could not be resolved.
    self.assertEqual('UNKNOWN', self.check.get_state_name(500))
    unknown_desc = self.check.get_state_description(500)
    self.assertIn('500', unknown_desc)

@attr(requires='mongo')
class TestMongo(unittest.TestCase):
Expand Down

0 comments on commit 70e303c

Please sign in to comment.