From 8adccfd39e973241e55d115efa7094742c4d6f64 Mon Sep 17 00:00:00 2001
From: Celene
Date: Thu, 10 Apr 2014 18:47:12 +0000
Subject: [PATCH 01/33] fix mysql check duplicate metric

---
 checks.d/mysql.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/checks.d/mysql.py b/checks.d/mysql.py
index 175c14d5d2..56718c24e8 100644
--- a/checks.d/mysql.py
+++ b/checks.d/mysql.py
@@ -18,7 +18,7 @@
     'Innodb_data_reads': ('mysql.innodb.data_reads', RATE),
     'Innodb_data_writes': ('mysql.innodb.data_writes', RATE),
     'Innodb_os_log_fsyncs': ('mysql.innodb.os_log_fsyncs', RATE),
-    'Innodb_data_reads': ('mysql.innodb.buffer_pool_size', RATE),
+    'Innodb_buffer_pool_size': ('mysql.innodb.buffer_pool_size', RATE),
     'Slow_queries': ('mysql.performance.slow_queries', RATE),
     'Questions': ('mysql.performance.questions', RATE),
     'Queries': ('mysql.performance.queries', RATE),

From 724000b13a124134ca2932bac67d9332dd947101 Mon Sep 17 00:00:00 2001
From: Arthur Wang
Date: Thu, 17 Apr 2014 15:18:55 +0000
Subject: [PATCH 02/33] Fix open_file_descriptors metric name

---
 checks.d/process.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/checks.d/process.py b/checks.d/process.py
index b2a225b6e5..718fd87960 100644
--- a/checks.d/process.py
+++ b/checks.d/process.py
@@ -10,7 +10,7 @@ class ProcessCheck(AgentCheck):
         'system.processes.mem.rss',
         'system.processes.mem.vms',
         'system.processes.mem.real',
-        'system.processes.open_file_decorators',
+        'system.processes.open_file_descriptors',
         'system.processes.ioread_count',
         'system.processes.iowrite_count',
         'system.processes.ioread_bytes',

From 0ae96b11717e01de2a897f56bf0927d19413e81b Mon Sep 17 00:00:00 2001
From: Remi Hakim
Date: Thu, 17 Apr 2014 17:03:14 +0100
Subject: [PATCH 03/33] Typo

---
 checks/check_status.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/checks/check_status.py b/checks/check_status.py
index 37ccc89a2f..508a95516a 100644
--- a/checks/check_status.py
+++ b/checks/check_status.py
@@ -346,7 +346,7 @@ def body_lines(self):
         try:
             ntp_offset, ntp_styles = get_ntp_info()
             lines.append('  ' + style('NTP offset', *ntp_styles) + ': ' + style('%s s' % round(ntp_offset, 4), *ntp_styles))
         except Exception, e:
-            lines.append('  NTP offset: Unkwown (%s)' % str(e))
+            lines.append('  NTP offset: Unknown (%s)' % str(e))
         lines.append('  System UTC time: ' + datetime.datetime.utcnow().__str__())
         lines.append('')

From 077ccf5e4ea6d43a02ce2763b882a5ecf5549495 Mon Sep 17 00:00:00 2001
From: Remi Hakim
Date: Mon, 21 Apr 2014 11:22:33 -0400
Subject: [PATCH 04/33] Adding release 5.0.0

---
 CHANGELOG.md | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index fefc31a435..2c93ab4965 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,11 @@
 Changes
 =======
+# 5.0.0 / Unreleased
+
+### Integrations affected
+
+### Changes
+
 # 4.2.1 / 04-09-2014
 
 

From 3f12340296cf8059f653823556ed3f593f96c9e8 Mon Sep 17 00:00:00 2001
From: Arthur
Date: Mon, 21 Apr 2014 11:48:10 -0400
Subject: [PATCH 05/33] Update CHANGELOG.md

---
 CHANGELOG.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 2c93ab4965..e2d85eb536 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,7 +5,7 @@ Changes
 ### Integrations affected
 
 ### Changes
-
+* [BUGFIX] Fix incorrect open file descriptors metric name in process check: See [#904]
 
 # 4.2.1 / 04-09-2014
 

From d4c53661803e732570e42d8d7784cca3786bfe4c Mon Sep 17 00:00:00 2001
From: Arthur
Date: Mon, 21 Apr 2014 11:49:38 -0400
Subject: [PATCH 06/33] Update CHANGELOG.md

---
 CHANGELOG.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index e2d85eb536..da9d98d0d9 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,7 +5,7 @@ Changes
 ### Integrations affected
 
 ### Changes
-* [BUGFIX] Fix incorrect open file descriptors metric name in process check: See [#904]
+* [BUGFIX] Fix incorrect open file descriptors metric name in process check: See [#904](https://github.com/DataDog/dd-agent/pull/904)
 
 # 4.2.1 / 04-09-2014
 

From 75ccee4a770f46667f3cf230148d040b53b5bf0f Mon Sep 17 00:00:00 2001
From: Carlo Cabanilla
Date: Tue, 22 Apr 2014 18:17:52 +0000
Subject: [PATCH 07/33] add ability for AgentCheck instances to collect
 service check data

---
 checks/__init__.py   | 31 +++++++++++++++++++++++++++++++
 tests/test_common.py | 23 +++++++++++++++++++++--
 2 files changed, 52 insertions(+), 2 deletions(-)

diff --git a/checks/__init__.py b/checks/__init__.py
index 74cfa1c038..118bec1e87 100644
--- a/checks/__init__.py
+++ b/checks/__init__.py
@@ -262,6 +262,7 @@ def get_metrics(self, expire=True):
         return metrics
 
 class AgentCheck(object):
+    OK, WARNING, CRITICAL, UNKNOWN, NONE = (0, 1, 2, 3, 4)
 
     def __init__(self, name, init_config, agentConfig, instances=None):
         """
@@ -284,6 +285,7 @@ def __init__(self, name, init_config, agentConfig, instances=None):
         self.aggregator = MetricsAggregator(self.hostname, formatter=agent_formatter,
                                             recent_point_threshold=agentConfig.get('recent_point_threshold', None))
 
         self.events = []
+        self.service_checks = []
         self.instances = instances or []
         self.warnings = []
         self.library_versions = None
@@ -392,6 +394,23 @@ def event(self, event):
             event['api_key'] = self.agentConfig['api_key']
         self.events.append(event)
 
+    def service_check(self, service_name, status, tags=None, timestamp=None):
+        """
+        Save a service check.
+
+        :param service_name: string, name of the service
+        :param status: int, describing the status. 0 for success, 1 for
+            warning, 2 for failure
+        :param tags: list of strings, a list of tags for this run
+        :param timestamp: int, unix timestamp for when the run occurred
+        """
+        self.service_checks.append({
+            'service_name': service_name,
+            'status': status,
+            'tags': tags,
+            'timestamp': int(timestamp or time.time())
+        })
+
     def has_events(self):
         """
         Check whether the check has saved any events
@@ -421,6 +440,18 @@ def get_events(self):
         self.events = []
         return events
 
+    def get_service_checks(self):
+        """
+        Return a list of the service checks saved by the check, if any,
+        and clears them out of the instance's service_checks list
+
+        @return the list of service checks saved by this check
+        @rtype list of service check dicts
+        """
+        service_checks = self.service_checks
+        self.service_checks = []
+        return service_checks
+
     def has_warnings(self):
         """
         Check whether the instance run created any warnings
diff --git a/tests/test_common.py b/tests/test_common.py
index 4ec2499d5a..079a423a59 100644
--- a/tests/test_common.py
+++ b/tests/test_common.py
@@ -2,13 +2,14 @@
 import unittest
 import logging
 logger = logging.getLogger()
-from checks import Check, CheckException, UnknownValue, CheckException, Infinity
+from checks import (Check, AgentCheck,
+    CheckException, UnknownValue, CheckException, Infinity)
 from checks.collector import Collector
 from aggregator import MetricsAggregator
 
 class TestCore(unittest.TestCase):
     "Tests to validate the core check logic"
-    
+
     def setUp(self):
         self.c = Check(logger)
         self.c.gauge("test-metric")
@@ -95,6 +96,24 @@ def test_metadata(self):
         assert "socket-fqdn" in c._get_metadata()
         assert "socket-hostname" in c._get_metadata()
 
+    def test_service_check(self):
+        service_name = 'test.service_check'
+        status = AgentCheck.CRITICAL
+        tags = ['host:test', 'other:thing']
+        timestamp = int(time.time())
+
+        check = AgentCheck('test', {}, {})
+        check.service_check(service_name, status, tags, timestamp)
+        self.assertEquals(len(check.service_checks), 1, check.service_checks)
+        val = check.get_service_checks()
+        self.assertEquals([{
+            'service_name': service_name,
+            'status': status,
+            'tags': tags,
+            'timestamp': timestamp
+        }], val)
+        self.assertEquals(len(check.service_checks), 0, check.service_checks)
+
 class TestAggregator(unittest.TestCase):
     def setUp(self):
         self.aggr = MetricsAggregator('test-aggr')

From b5e24dd53df0e5f488f5c07ce17b20be6f3b4548 Mon Sep 17 00:00:00 2001
From: Carlo Cabanilla
Date: Tue, 22 Apr 2014 18:36:02 +0000
Subject: [PATCH 08/33] update the collector to collect service check data
 from agent checks

---
 checks/check_status.py | 36 ++++++++++++++-------------
 checks/collector.py    | 56 ++++++++++++++++++++++++------------------
 2 files changed, 51 insertions(+), 41 deletions(-)

diff --git a/checks/check_status.py b/checks/check_status.py
index 508a95516a..2f2fb543c5 100644
--- a/checks/check_status.py
+++ b/checks/check_status.py
@@ -188,7 +188,7 @@ def to_dict(self):
     def _not_running_message(cls):
         lines = cls._title_lines() + [
             style("  %s is not running." % cls.NAME, 'red'),
-            style("""  You can get more details in the logs: 
+            style("""  You can get more details in the logs:
     %s""" % logger_info(), 'red'),
             "",
             ""
@@ -264,13 +264,15 @@ def has_warnings(self):
 
 class CheckStatus(object):
 
-    def __init__(self, check_name, instance_statuses, metric_count,
-                 event_count, init_failed_error=None,
-                 init_failed_traceback=None, library_versions=None):
+    def __init__(self, check_name, instance_statuses, metric_count=None,
+                 event_count=None, service_check_count=None,
+                 init_failed_error=None, init_failed_traceback=None,
+                 library_versions=None):
         self.name = check_name
         self.instance_statuses = instance_statuses
-        self.metric_count = metric_count
-        self.event_count = event_count
+        self.metric_count = metric_count or 0
+        self.event_count = event_count or 0
+        self.service_check_count = service_check_count or 0
         self.init_failed_error = init_failed_error
         self.init_failed_traceback = init_failed_traceback
         self.library_versions = library_versions
@@ -363,7 +365,7 @@ def body_lines(self):
             confd_path = config.get_confd_path(osname)
         except config.PathNotFound:
             confd_path = 'Not found'
-        
+
         try:
             checksd_path = config.get_checksd_path(osname)
         except config.PathNotFound:
@@ -442,7 +444,7 @@ def body_lines(self):
                                       s.traceback.split('\n'))
 
             check_lines += [
-                "  - Collected %s metrics & %s events" % (cs.metric_count, cs.event_count),
+                "  - Collected %s metrics, %s events & %s service checks" % (cs.metric_count, cs.event_count, cs.service_check_count),
             ]
 
             if cs.library_versions is not None:
@@ -537,7 +539,7 @@ def to_dict(self):
             status_info['confd_path'] = config.get_confd_path(osname)
         except config.PathNotFound:
             status_info['confd_path'] = 'Not found'
-        
+
         try:
             status_info['checksd_path'] = config.get_checksd_path(osname)
         except config.PathNotFound:
@@ -650,7 +652,7 @@ def get_jmx_status():
       - One generated by jmxfetch that return information about the collection of metrics
         its format is as following:
-        
+
         ###
         timestamp: 1391037347435
         checks:
 
@@ -679,10 +681,10 @@ def get_jmx_status():
 
             if status_age > 60:
                 check_statuses.append(CheckStatus("jmx", [InstanceStatus(
-                                                    0, 
-                                                    STATUS_ERROR, 
+                                                    0,
+                                                    STATUS_ERROR,
                                                     error="JMXfetch didn't return any metrics during the last minute"
-                                                )], 0, 0))
+                                                )]))
             else:
                 for check_name, instances in jmx_checks.get('failed_checks', {}).iteritems():
@@ -693,7 +695,7 @@ def get_jmx_status():
                     instance_name = info.get('instance_name', None)
                     check_data[check_name]['statuses'].append(get_jmx_instance_status(instance_name, status, message, metric_count))
                     check_data[check_name]['metric_count'].append(metric_count)
-            
+
             for check_name, instances in jmx_checks.get('initialized_checks', {}).iteritems():
                 for info in instances:
                     message = info.get('message', None)
@@ -704,18 +706,18 @@ def get_jmx_status():
                     check_data[check_name]['metric_count'].append(metric_count)
 
             for check_name, data in check_data.iteritems():
-                check_status = CheckStatus(check_name, data['statuses'], sum(data['metric_count']), 0)
+                check_status = CheckStatus(check_name, data['statuses'], sum(data['metric_count']))
                 check_statuses.append(check_status)
 
         if os.path.exists(python_status_path):
             python_jmx_stats = yaml.load(file(python_status_path))
             jmx_checks = python_jmx_stats.get('invalid_checks', {})
            for check_name, excep in jmx_checks.iteritems():
-                check_statuses.append(CheckStatus(check_name, [], 0, 0, init_failed_error=excep))
+                check_statuses.append(CheckStatus(check_name, [], init_failed_error=excep))
 
         return check_statuses
 
-    except Exception, e:
+    except Exception:
         log.exception("Couldn't load latest jmx status")
         return []
 
diff --git a/checks/collector.py b/checks/collector.py
index d467354c04..bf55cb6da6 100644
--- a/checks/collector.py
+++ b/checks/collector.py
@@ -43,7 +43,7 @@ def __init__(self, agentConfig, emitters, systemStats):
         # agent config is used during checks, system_stats can be accessed through the config
         self.os = get_os()
         self.plugins = None
-        self.emitters = emitters            
+        self.emitters = emitters
         self.metadata_interval = int(agentConfig.get('metadata_interval', 10 * 60))
         self.metadata_start = time.time()
         socket.setdefaulttimeout(15)
@@ -107,7 +107,7 @@ def stop(self):
         """
         Tell the collector to stop at the next logical point.
         """
-        # This is called when the process is being killed, so 
+        # This is called when the process is being killed, so
         # try to stop the collector as soon as possible.
         # Most importantly, don't try to submit to the emitters
         # because the forwarder is quite possibly already killed
@@ -116,7 +116,7 @@ def stop(self):
         self.continue_running = False
         for check in self.initialized_checks_d:
             check.stop()
-    
+
     def run(self, checksd=None, start_event=True):
         """
        Collect data from each check and submit their data.
@@ -130,6 +130,7 @@ def run(self, checksd=None, start_event=True):
         payload = self._build_payload(start_event=start_event)
         metrics = payload['metrics']
         events = payload['events']
+        service_checks = payload['service_checks']
         if checksd:
             self.initialized_checks_d = checksd['initialized_checks'] # is of type {check_name: check}
             self.init_failed_checks_d = checksd['init_failed_checks'] # is of type {check_name: {error, traceback}}
@@ -156,21 +157,21 @@ def run(self, checksd=None, start_event=True):
 
         load = sys_checks['load'].check(self.agentConfig)
         payload.update(load)
-        
+
         memory = sys_checks['memory'].check(self.agentConfig)
 
         if memory:
             payload.update({
-                'memPhysUsed' : memory.get('physUsed'), 
-                'memPhysPctUsable' : memory.get('physPctUsable'), 
-                'memPhysFree' : memory.get('physFree'), 
-                'memPhysTotal' : memory.get('physTotal'), 
-                'memPhysUsable' : memory.get('physUsable'), 
-                'memSwapUsed' : memory.get('swapUsed'), 
-                'memSwapFree' : memory.get('swapFree'), 
-                'memSwapPctFree' : memory.get('swapPctFree'), 
-                'memSwapTotal' : memory.get('swapTotal'), 
-                'memCached' : memory.get('physCached'), 
+                'memPhysUsed' : memory.get('physUsed'),
+                'memPhysPctUsable' : memory.get('physPctUsable'),
+                'memPhysFree' : memory.get('physFree'),
+                'memPhysTotal' : memory.get('physTotal'),
+                'memPhysUsable' : memory.get('physUsable'),
+                'memSwapUsed' : memory.get('swapUsed'),
+                'memSwapFree' : memory.get('swapFree'),
+                'memSwapPctFree' : memory.get('swapPctFree'),
+                'memSwapTotal' : memory.get('swapTotal'),
+                'memCached' : memory.get('physCached'),
                 'memBuffers': memory.get('physBuffers'),
                 'memShared': memory.get('physShared')
             })
@@ -194,7 +195,7 @@ def run(self, checksd=None, start_event=True):
 
         if gangliaData is not False and gangliaData is not None:
             payload['ganglia'] = gangliaData
-           
+
         # dogstream
         if dogstreamData:
             dogstreamEvents = dogstreamData.get('dogstreamEvents', None)
@@ -211,7 +212,7 @@ def run(self, checksd=None, start_event=True):
         if ddforwarderData:
             payload['datadog'] = ddforwarderData
 
-        # Process the event checks. 
+        # Process the event checks.
         for event_check in self._event_checks:
             event_data = event_check.check(log, self.agentConfig)
             if event_data:
                 events['Event Check'] = event_data
@@ -226,12 +227,12 @@ def run(self, checksd=None, start_event=True):
             if snaps:
                 has_resource = True
                 res_value = { 'snaps': snaps,
-                              'format_version': resources_check.get_format_version() }                            
+                              'format_version': resources_check.get_format_version() }
                 res_format = resources_check.describe_format_if_needed()
                 if res_format is not None:
                     res_value['format_description'] = res_format
                 payload['resources'][resources_check.RESOURCE_KEY] = res_value
-        
+
         if has_resource:
             payload['resources']['meta'] = {
                 'api_key': self.agentConfig['api_key'],
@@ -250,7 +251,7 @@ def run(self, checksd=None, start_event=True):
             if not self.continue_running:
                 return
             log.info("Running check %s" % check.name)
-            instance_statuses = [] 
+            instance_statuses = []
             metric_count = 0
             event_count = 0
             try:
@@ -260,6 +261,7 @@ def run(self, checksd=None, start_event=True):
                 # Collect the metrics and events.
                 current_check_metrics = check.get_metrics()
                 current_check_events = check.get_events()
+                current_check_service_checks = check.get_service_checks()
 
                 # Save them for the payload.
                 metrics.extend(current_check_metrics)
@@ -268,20 +270,24 @@ def run(self, checksd=None, start_event=True):
                     events[check.name] = current_check_events
                 else:
                     events[check.name] += current_check_events
+                if current_check_service_checks:
+                    service_checks.extend(current_check_service_checks)
 
                 # Save the status of the check.
                 metric_count = len(current_check_metrics)
                 event_count = len(current_check_events)
-            except Exception, e:
+                service_check_count = len(current_check_service_checks)
+            except Exception:
                 log.exception("Error running check %s" % check.name)
 
-            check_status = CheckStatus(check.name, instance_statuses, metric_count, event_count, library_versions=check.get_library_info())
+            check_status = CheckStatus(check.name, instance_statuses, metric_count, event_count, service_check_count,
+                                       library_versions=check.get_library_info())
             check_statuses.append(check_status)
 
         for check_name, info in self.init_failed_checks_d.iteritems():
             if not self.continue_running:
                 return
-            check_status = CheckStatus(check_name, None, None, None,
+            check_status = CheckStatus(check_name, None, None, None, None,
                 init_failed_error=info['error'], init_failed_traceback=info['traceback'])
             check_statuses.append(check_status)
 
@@ -290,13 +296,14 @@ def run(self, checksd=None, start_event=True):
         # Store the metrics and events in the payload.
         payload['metrics'] = metrics
         payload['events'] = events
+        payload['service_checks'] = service_checks
         collect_duration = timer.step()
 
         if self.os != 'windows':
-            payload['metrics'].extend(self._agent_metrics.check(payload, self.agentConfig, 
+            payload['metrics'].extend(self._agent_metrics.check(payload, self.agentConfig,
                 collect_duration, self.emit_duration, time.clock() - cpu_clock))
         else:
-            payload['metrics'].extend(self._agent_metrics.check(payload, self.agentConfig, 
+            payload['metrics'].extend(self._agent_metrics.check(payload, self.agentConfig,
                 collect_duration, self.emit_duration))
 
@@ -353,6 +360,7 @@ def _build_payload(self, start_event=True):
             'apiKey': self.agentConfig['api_key'],
             'events': {},
             'metrics': [],
+            'service_checks': [],
             'resources': {},
             'internalHostname' : get_hostname(self.agentConfig),
             'uuid' : get_uuid(),

From ce97e7b1c5933c4e2119afe5ee6870264a41cd3a Mon Sep 17 00:00:00 2001
From: Remi Hakim
Date: Tue, 22 Apr 2014 15:21:39 -0400
Subject: [PATCH 09/33] Upgrade to cassandra 2.0.7 to fix travis tests

---
 .travis.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index 6e0acd4826..e68c58c981 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -31,9 +31,9 @@ install:
   - pip install . --use-mirrors
 before_script:
   - curl -L https://raw.github.com/DataDog/dd-agent/master/tests/haproxy.cfg > /tmp/haproxy.cfg
-  - curl -L http://apache.mirrors.multidist.eu/cassandra/2.0.6/apache-cassandra-2.0.6-bin.tar.gz > /tmp/cassandra2.tar.gz
+  - curl -L http://apache.mirrors.multidist.eu/cassandra/2.0.7/apache-cassandra-2.0.7-bin.tar.gz > /tmp/cassandra2.tar.gz
   - tar -xzvf /tmp/cassandra2.tar.gz -C /tmp
-  - sudo /tmp/apache-cassandra-2.0.6/bin/cassandra
+  - sudo /tmp/apache-cassandra-2.0.7/bin/cassandra
   - sudo service haproxy restart
   - sudo bash -c "curl -L https://raw.github.com/DataDog/dd-agent/master/tests/tomcat_cfg.xml > /etc/tomcat6/server.xml"
  - sudo bash -c "curl -L https://raw.github.com/DataDog/dd-agent/master/tests/tomcat6 >> /etc/default/tomcat6"

From 851dfad9059396c01d3adf297bda7cae68351b83 Mon Sep 17 00:00:00 2001
From: Carlo Cabanilla
Date: Thu, 24 Apr 2014 15:16:22 +0000
Subject: [PATCH 10/33] rename service_name to check_name

Services could have multiple checks
---
 checks/__init__.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/checks/__init__.py b/checks/__init__.py
index 118bec1e87..4520c6a4e4 100644
--- a/checks/__init__.py
+++ b/checks/__init__.py
@@ -394,18 +394,18 @@ def event(self, event):
             event['api_key'] = self.agentConfig['api_key']
         self.events.append(event)
 
-    def service_check(self, service_name, status, tags=None, timestamp=None):
+    def service_check(self, check_name, status, tags=None, timestamp=None):
         """
         Save a service check.
 
-        :param service_name: string, name of the service
+        :param check_name: string, name of the service check
         :param status: int, describing the status. 0 for success, 1 for
             warning, 2 for failure
         :param tags: list of strings, a list of tags for this run
         :param timestamp: int, unix timestamp for when the run occurred
         """
         self.service_checks.append({
-            'service_name': service_name,
+            'check': check_name,
             'status': status,
             'tags': tags,
             'timestamp': int(timestamp or time.time())
         })

From c42f70216c6aa5d05f243e953760713cd2885227 Mon Sep 17 00:00:00 2001
From: Carlo Cabanilla
Date: Thu, 24 Apr 2014 16:18:05 +0000
Subject: [PATCH 11/33] allow for float timestamps in service checks

---
 checks/__init__.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/checks/__init__.py b/checks/__init__.py
index 4520c6a4e4..2310e79df8 100644
--- a/checks/__init__.py
+++ b/checks/__init__.py
@@ -402,13 +402,13 @@ def service_check(self, check_name, status, tags=None, timestamp=None):
         :param status: int, describing the status. 0 for success, 1 for
             warning, 2 for failure
         :param tags: list of strings, a list of tags for this run
-        :param timestamp: int, unix timestamp for when the run occurred
+        :param timestamp: float, unix timestamp for when the run occurred
         """
         self.service_checks.append({
             'check': check_name,
             'status': status,
             'tags': tags,
-            'timestamp': int(timestamp or time.time())
+            'timestamp': float(timestamp or time.time())
         })
 
     def has_events(self):

From babb7379db6cfd3a5ab19fa4c95365290245be80 Mon Sep 17 00:00:00 2001
From: Carlo Cabanilla
Date: Thu, 24 Apr 2014 16:27:44 +0000
Subject: [PATCH 12/33] fix service check test

---
 tests/test_common.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/test_common.py b/tests/test_common.py
index 079a423a59..8736c50643 100644
--- a/tests/test_common.py
+++ b/tests/test_common.py
@@ -100,14 +100,14 @@ def test_service_check(self):
         service_name = 'test.service_check'
         status = AgentCheck.CRITICAL
         tags = ['host:test', 'other:thing']
-        timestamp = int(time.time())
+        timestamp = time.time()
 
         check = AgentCheck('test', {}, {})
         check.service_check(service_name, status, tags, timestamp)
         self.assertEquals(len(check.service_checks), 1, check.service_checks)
         val = check.get_service_checks()
         self.assertEquals([{
-            'service_name': service_name,
+            'check': service_name,
             'status': status,
             'tags': tags,
             'timestamp': timestamp

From a94060cfe939c97e8c053cdf7fbf73be656796b6 Mon Sep 17 00:00:00 2001
From: Carlo Cabanilla
Date: Thu, 24 Apr 2014 16:30:22 +0000
Subject: [PATCH 13/33] add generated ids to service checks

---
 checks/__init__.py   | 6 ++++--
 tests/test_common.py | 4 ++++
 util.py              | 8 ++++++++
 3 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/checks/__init__.py b/checks/__init__.py
index 2310e79df8..8e6304e973 100644
--- a/checks/__init__.py
+++ b/checks/__init__.py
@@ -15,7 +15,7 @@
 import copy
 from pprint import pprint
 
-from util import LaconicFilter, get_os, get_hostname
+from util import LaconicFilter, get_os, get_hostname, get_next_id
 from config import get_confd_path
 from checks import check_status
 
@@ -394,7 +394,8 @@ def event(self, event):
         self.events.append(event)
 
-    def service_check(self, check_name, status, tags=None, timestamp=None):
+    def service_check(self, check_name, status, tags=None, timestamp=None,
+                      check_id=None):
         """
         Save a service check.
@@ -405,6 +406,7 @@ def service_check(self, check_name, status, tags=None, timestamp=None,
         :param timestamp: float, unix timestamp for when the run occurred
         """
         self.service_checks.append({
+            'id': check_id or get_next_id('service_check'),
             'check': check_name,
             'status': status,
             'tags': tags,
diff --git a/tests/test_common.py b/tests/test_common.py
index 8736c50643..b396fd50c5 100644
--- a/tests/test_common.py
+++ b/tests/test_common.py
@@ -106,7 +106,11 @@ def test_service_check(self):
         check.service_check(service_name, status, tags, timestamp)
         self.assertEquals(len(check.service_checks), 1, check.service_checks)
         val = check.get_service_checks()
+        self.assertEquals(len(val), 1)
+        service_check_id = val[0].get('id', None)
+        self.assertNotEquals(service_check_id, None)
         self.assertEquals([{
+            'id': service_check_id,
             'check': service_name,
             'status': status,
             'tags': tags,
diff --git a/util.py b/util.py
index 4d1855d14d..845c84aa75 100644
--- a/util.py
+++ b/util.py
@@ -149,6 +149,14 @@ def cast_metric_val(val):
             raise ValueError
     return val
 
+_IDS = {}
+def get_next_id(name):
+    global _IDS
+    current_id = _IDS.get(name, 0)
+    current_id += 1
+    _IDS[name] = current_id
+    return current_id
+
 def is_valid_hostname(hostname):
     if hostname.lower() in set([
         'localhost',

From 8d9410ba33e67b597a0d13fde1a79555dde05dd9 Mon Sep 17 00:00:00 2001
From: Carlo Cabanilla
Date: Thu, 24 Apr 2014 16:34:37 +0000
Subject: [PATCH 14/33] allow service_check host_name be overridable,
 defaulting to agent host name

---
 checks/__init__.py   | 9 +++++++--
 tests/test_common.py | 4 +++-
 2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/checks/__init__.py b/checks/__init__.py
index 8e6304e973..01b70b4142 100644
--- a/checks/__init__.py
+++ b/checks/__init__.py
@@ -395,7 +395,7 @@ def event(self, event):
         self.events.append(event)
 
     def service_check(self, check_name, status, tags=None, timestamp=None,
-                      check_id=None):
+                      host_name=None, check_id=None):
         """
         Save a service check.
 
@@ -405,10 +405,15 @@ def service_check(self, check_name, status, tags=None, timestamp=None,
         :param tags: list of strings, a list of tags for this run
         :param timestamp: float, unix timestamp for when the run occurred
         """
+        if host_name is None:
+            host_name = self.host_name
+        if check_id is None:
+            check_id = get_next_id('service_check')
         self.service_checks.append({
-            'id': check_id or get_next_id('service_check'),
+            'id': check_id,
             'check': check_name,
             'status': status,
+            'host_name': host_name,
             'tags': tags,
             'timestamp': float(timestamp or time.time())
         })
diff --git a/tests/test_common.py b/tests/test_common.py
index b396fd50c5..a1ce73e392 100644
--- a/tests/test_common.py
+++ b/tests/test_common.py
@@ -100,10 +100,11 @@ def test_service_check(self):
         service_name = 'test.service_check'
         status = AgentCheck.CRITICAL
         tags = ['host:test', 'other:thing']
+        host_name = 'foohost'
         timestamp = time.time()
 
         check = AgentCheck('test', {}, {})
-        check.service_check(service_name, status, tags, timestamp)
+        check.service_check(service_name, status, tags, timestamp, host_name)
         self.assertEquals(len(check.service_checks), 1, check.service_checks)
         val = check.get_service_checks()
         self.assertEquals(len(val), 1)
@@ -113,6 +114,7 @@ def test_service_check(self):
             'id': service_check_id,
             'check': service_name,
             'status': status,
+            'host_name': host_name,
             'tags': tags,
             'timestamp': timestamp
         }], val)

From 6dc99fe4cf4e226b5fb7a3e59360cdf6903919cc Mon Sep 17 00:00:00 2001
From: Carlo Cabanilla
Date: Thu, 24 Apr 2014 17:54:20 +0000
Subject: [PATCH 15/33] update service check docstring

---
 checks/__init__.py | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/checks/__init__.py b/checks/__init__.py
index 01b70b4142..f76bebfed0 100644
--- a/checks/__init__.py
+++ b/checks/__init__.py
@@ -400,10 +400,15 @@ def service_check(self, check_name, status, tags=None, timestamp=None,
         Save a service check.
 
         :param check_name: string, name of the service check
-        :param status: int, describing the status. 0 for success, 1 for
-            warning, 2 for failure
-        :param tags: list of strings, a list of tags for this run
-        :param timestamp: float, unix timestamp for when the run occurred
+        :param status: int, describing the status.
+            0 for success, 1 for warning, 2 for failure
+        :param tags: (optional) list of strings, a list of tags for this run
+        :param timestamp: (optional) float, unix timestamp for when the run occurred
+        :param host_name: (optional) str, host that generated the service
+                          check. Defaults to the host_name of the agent
+        :param check_id: (optional) int, id used for logging and tracing
+                         purposes. Don't need to be unique. If not specified,
+                         one will be generated.
         """
         if host_name is None:
             host_name = self.host_name

From bbb53e36163689bd36dd58be86b7cf7b94bc6ece Mon Sep 17 00:00:00 2001
From: Carlo Cabanilla
Date: Thu, 24 Apr 2014 17:56:40 +0000
Subject: [PATCH 16/33] rename check_id to check_run_id for clarity

---
 checks/__init__.py   | 14 +++++++-------
 tests/test_common.py | 12 ++++++------
 2 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/checks/__init__.py b/checks/__init__.py
index f76bebfed0..ad3f4bba80 100644
--- a/checks/__init__.py
+++ b/checks/__init__.py
@@ -395,7 +395,7 @@ def event(self, event):
         self.events.append(event)
 
     def service_check(self, check_name, status, tags=None, timestamp=None,
-                      host_name=None, check_id=None):
+                      host_name=None, check_run_id=None):
         """
         Save a service check.
@@ -406,16 +406,16 @@ def service_check(self, check_name, status, tags=None, timestamp=None,
         :param timestamp: (optional) float, unix timestamp for when the run occurred
         :param host_name: (optional) str, host that generated the service
                           check. Defaults to the host_name of the agent
-        :param check_id: (optional) int, id used for logging and tracing
-                         purposes. Don't need to be unique. If not specified,
-                         one will be generated.
+        :param check_run_id: (optional) int, id used for logging and tracing
+                             purposes. Don't need to be unique. If not
+                             specified, one will be generated.
         """
         if host_name is None:
             host_name = self.host_name
-        if check_id is None:
-            check_id = get_next_id('service_check')
+        if check_run_id is None:
+            check_run_id = get_next_id('service_check')
         self.service_checks.append({
-            'id': check_id,
+            'id': check_run_id,
             'check': check_name,
             'status': status,
             'host_name': host_name,
diff --git a/tests/test_common.py b/tests/test_common.py
index a1ce73e392..842d889390 100644
--- a/tests/test_common.py
+++ b/tests/test_common.py
@@ -97,22 +97,22 @@ def test_metadata(self):
         assert "socket-hostname" in c._get_metadata()
 
     def test_service_check(self):
-        service_name = 'test.service_check'
+        check_name = 'test.service_check'
         status = AgentCheck.CRITICAL
         tags = ['host:test', 'other:thing']
         host_name = 'foohost'
         timestamp = time.time()
 
         check = AgentCheck('test', {}, {})
-        check.service_check(service_name, status, tags, timestamp, host_name)
+        check.service_check(check_name, status, tags, timestamp, host_name)
         self.assertEquals(len(check.service_checks), 1, check.service_checks)
         val = check.get_service_checks()
         self.assertEquals(len(val), 1)
-        service_check_id = val[0].get('id', None)
-        self.assertNotEquals(service_check_id, None)
+        check_run_id = val[0].get('id', None)
+        self.assertNotEquals(check_run_id, None)
         self.assertEquals([{
-            'id': service_check_id,
-            'check': service_name,
+            'id': check_run_id,
+            'check': check_name,
             'status': status,
             'host_name': host_name,
             'tags': tags,
             'timestamp': timestamp
         }], val)

From 2d2287d30d0caa0081dba96b405f427fc64042e4 Mon Sep 17 00:00:00 2001
From: Remi Hakim
Date: Fri, 25 Apr 2014 13:50:29 -0400
Subject: [PATCH 17/33] Support redis check on Windows

---
 setup.py     | 2 ++
 win32/gui.py | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 5c5991ed71..08b5f23024 100644
--- a/setup.py
+++ b/setup.py
@@ -43,6 +43,7 @@
             'pycurl',
             'MySQLdb',
             'psutil',
+            'redis',
         ])
 
         # Modules to force-include in the exe
@@ -60,6 +61,7 @@
             'MySQLdb',
             'psutil',
             'psycopg2',
+            'redis',
 
             # agent
             'checks.services_checks',
diff --git a/win32/gui.py b/win32/gui.py
index 71ec1d928e..598ed02a6d 100644
--- a/win32/gui.py
+++ b/win32/gui.py
@@ -35,7 +35,7 @@
 EXCLUDED_WINDOWS_CHECKS = [
     'cacti', 'directory', 'gearmand',
     'hdfs', 'kafka_consumer', 'mcache', 'network',
-    'redis', 'postfix', 'process', 'gunicorn', 'zk',
+    'postfix', 'process', 'gunicorn', 'zk',
     ]
 
 MAIN_WINDOW_TITLE = "Datadog Agent Manager"

From be2de5480fceab13d64d5c8281473b57fdde35c8 Mon Sep 17 00:00:00 2001
From: Remi Hakim
Date: Fri, 25 Apr 2014 14:13:12 -0400
Subject: [PATCH 18/33] Fix the way we add redis in the package

---
 setup.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/setup.py b/setup.py
index 08b5f23024..f7ecb92bc0 100644
--- a/setup.py
+++ b/setup.py
@@ -37,7 +37,6 @@
             'pymongo==2.3',
             'psycopg2',
             'python-memcached==1.48',
-            'redis==2.6.2',
             'adodbapi'
             'elementtree',
             'pycurl',

From 937d37c700cad579190756290483a3b7fed3bd28 Mon Sep 17 00:00:00 2001
From: Remi Hakim
Date: Fri, 25 Apr 2014 15:30:24 -0400
Subject: [PATCH 19/33] Bump version to 4.2.2

---
 config.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/config.py b/config.py
index c79e4b3d42..e6a57f8ccd 100644
--- a/config.py
+++ b/config.py
@@ -63,7 +63,7 @@ def get_parsed_args():
 
 
 def get_version():
-    return "4.2.1"
+    return "4.2.2"
 
 def skip_leading_wsp(f):
     "Works on a file, returns a file-like object"

From 272f4ea78f35628f9340a61d4996f005e8e76919 Mon Sep 17 00:00:00 2001
From: Remi Hakim
Date: Fri, 25 Apr 2014 15:53:48 -0400
Subject: [PATCH 20/33] Update changelog for 4.2.2

---
 CHANGELOG.md | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index da9d98d0d9..77daae9558 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,7 +5,17 @@ Changes
 ### Integrations affected
 
 ### Changes
-* [BUGFIX] Fix incorrect open file descriptors metric name in process check: See [#904](https://github.com/DataDog/dd-agent/pull/904)
+* [BUGFIX] Fix incorrect open file descriptors metric name in process check: See [#904]
+
+# 4.2.2 / 04-25-2014
+
+**Windows Only**
+
+### Integrations affected
+* Redis
+
+### Changes
+* [FEATURE] Support Redis check on Windows: See [#917]
 
 # 4.2.1 / 04-09-2014
 
@@ -958,6 +968,8 @@ If you use ganglia, you want this version.
 [#883]: https://github.com/DataDog/dd-agent/issues/883
 [#893]: https://github.com/DataDog/dd-agent/issues/893
 [#894]: https://github.com/DataDog/dd-agent/issues/894
+[#904]: https://github.com/DataDog/dd-agent/issues/904
+[#917]: https://github.com/DataDog/dd-agent/issues/917
 [@CaptTofu]: https://github.com/CaptTofu
 [@brettlangdon]: https://github.com/brettlangdon
 [@charles-dyfis-net]: https://github.com/charles-dyfis-net
 [@steeve]: https://github.com/steeve
 [@stefan-mees]: https://github.com/stefan-mees
 [@tomduckering]: https://github.com/tomduckering
-[@walkeran]: https://github.com/walkeran
+[@walkeran]: https://github.com/walkeran
\ No newline at end of file

From a995fa2aad5137a57877532367c7d55dcbb56fd0 Mon Sep 17 00:00:00 2001
From: graemej
Date: Mon, 28 Apr 2014 12:06:39 -0400
Subject: [PATCH 21/33] basic monitoring support for Apache Mesos

---
 checks.d/mesos.py         | 113 ++++++++++++++++++++++++++++++++++++++
 conf.d/mesos.yaml.example |   7 +++
 2 files changed, 120 insertions(+)
 create mode 100644 checks.d/mesos.py
 create mode 100644 conf.d/mesos.yaml.example

diff --git a/checks.d/mesos.py b/checks.d/mesos.py
new file mode 100644
index 0000000000..ac4bb2c426
--- /dev/null
+++ b/checks.d/mesos.py
@@ -0,0 +1,113 @@
+import time
+import requests
+
+from checks import AgentCheck
+from util import json, headers
+from hashlib import md5
+import urllib2
+
+class Mesos(AgentCheck):
+    def check(self, instance):
+        if 'url' not in instance:
+            self.log.info("Skipping instance, no url found.")
+            return
+
+        # Load values from the instance config
+        url = instance['url']
+        default_timeout = self.init_config.get('default_timeout', 5)
+        timeout = float(instance.get('timeout', default_timeout))
+
+        response = self.master_roles(url, timeout)
+        if response is not None:
+            for role in response['roles']:
+                tags = ['mesos','role:' + role['name']]
+                self.gauge('mesos.role.frameworks', len(role['frameworks']), tags=tags)
+                self.gauge('mesos.role.weight', role['weight'], tags=tags)
+                resources = role['resources']
+                for attr in ['cpus','mem']:
+                    if attr in resources:
+                        self.gauge('mesos.role.' + attr, resources[attr], tags=tags)
+
+        response = self.master_stats(url, timeout)
+        if response is not None:
+            for key in iter(response):
+                self.gauge('mesos.stats.' + key, response[key], tags=['mesos'])
+
+        response = self.master_state(url, timeout)
+        if response is not None:
+            for attr in ['deactivated_slaves','failed_tasks','finished_tasks','killed_tasks','lost_tasks','staged_tasks','started_tasks']:
+                tags = ['mesos']
+                self.gauge('mesos.state.' + attr, response[attr], tags=tags)
+
+            for framework in response['frameworks']:
+                tags = ['mesos','framework:' + framework['id']]
+                resources = framework['resources']
+                for attr in ['cpus','mem']:
+                    if attr in resources:
+                        self.gauge('mesos.state.framework.' + attr, resources[attr], tags=tags)
+
+            for slave in response['slaves']:
+                tags = ['mesos','slave:' + slave['id']]
+                resources = slave['resources']
+                for attr in ['cpus','mem','disk']:
+                    if attr in resources:
+                        self.gauge('mesos.state.slave.' + attr, resources[attr], tags=tags)
+
+    def master_roles(self, url, timeout):
+        return self.get_json(url + "/master/roles.json", timeout)
+
+    def master_stats(self, url, timeout):
+        return self.get_json(url + "/master/stats.json", timeout)
+
+    def master_state(self, url, timeout):
+        return self.get_json(url + "/master/state.json", timeout)
+
+    def get_json(self, url, timeout):
+        # Use a hash of the URL as an aggregation key
+        aggregation_key = md5(url).hexdigest()
+
+        try:
+            response = requests.get(url, timeout=timeout)
+            parsed = response.json()
+            return parsed
+        except requests.exceptions.Timeout as e:
+            # If there's a timeout
+            self.timeout_event(url, timeout, aggregation_key)
+            return None
+
+        if r.status_code != 200:
+            self.status_code_event(url, r, aggregation_key)
+            return None
+
+
+    def timeout_event(self, url, timeout, aggregation_key):
+        self.event({
+            'timestamp': int(time.time()),
+            'event_type': 'http_check',
+            'msg_title': 'URL timeout',
+            'msg_text': '%s timed out after %s seconds.' % (url, timeout),
+            'aggregation_key': aggregation_key
+        })
+
+    def status_code_event(self, url, r, aggregation_key):
+        self.event({
+            'timestamp': int(time.time()),
+            'event_type': 'http_check',
+            'msg_title': 'Invalid response code for %s' % url,
+            'msg_text': '%s returned a status of %s' % (url, r.status_code),
+            'aggregation_key': aggregation_key
+        })
+
+if __name__ == '__main__':
+    check, instances = Mesos.from_yaml('/etc/dd-agent/conf.d/mesos.yaml')
+    for instance in instances:
+        print "\nRunning the check against url: %s" % (instance['url'])
+        check.check(instance)
+        if check.has_events():
+            print 'Events: %s' % (check.get_events())
+
+        i = 0
+        print 'Metrics:\n'
+        for metric in check.get_metrics():
+            print "  %d: %s" % (i, metric)
+            i += 1
\ No newline at end of file
diff --git a/conf.d/mesos.yaml.example b/conf.d/mesos.yaml.example
new file mode 100644
index 0000000000..c4527bc9c6
--- /dev/null
+++ b/conf.d/mesos.yaml.example
@@ -0,0 +1,7 @@
+init_config:
+# time to wait on a Mesos API request
+# default_timeout: 5
+
+instances:
+# url: the API endpoint of your Mesos master
+# - url: "https://server:port"
\ No newline at end of file

From 1a30c74861ae1198c82f9974213d0e7ce313f26f Mon Sep 17 00:00:00 2001
From: Carlo Cabanilla
Date: Mon, 28 Apr 2014 19:00:55 +0000
Subject: [PATCH 22/33] fix hostname vs host_name

---
 checks/__init__.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/checks/__init__.py b/checks/__init__.py
index ad3f4bba80..0476c969ee 100644
--- a/checks/__init__.py
+++ b/checks/__init__.py
@@ -395,7 +395,7 @@ def event(self, event):
         self.events.append(event)
 
     def service_check(self, check_name, status, tags=None, timestamp=None,
-                      host_name=None, check_run_id=None):
+                      hostname=None, check_run_id=None):
         """
         Save a service check.
 
@@ -404,21 +404,21 @@ def service_check(self, check_name, status, tags=None, timestamp=None,
             0 for success, 1 for warning, 2 for failure
         :param tags: (optional) list of strings, a list of tags for this run
         :param timestamp: (optional) float, unix timestamp for when the run occurred
-        :param host_name: (optional) str, host that generated the service
+        :param hostname: (optional) str, host that generated the service
                           check. Defaults to the host_name of the agent
         :param check_run_id: (optional) int, id used for logging and tracing
                              purposes. Don't need to be unique. If not
                              specified, one will be generated.
""" - if host_name is None: - host_name = self.host_name + if hostname is None: + hostname = self.hostname if check_run_id is None: check_run_id = get_next_id('service_check') self.service_checks.append({ 'id': check_run_id, 'check': check_name, 'status': status, - 'host_name': host_name, + 'host_name': hostname, 'tags': tags, 'timestamp': float(timestamp or time.time()) }) From 74028f020100fc5e3ff4e065c4b9dfc08673d02d Mon Sep 17 00:00:00 2001 From: Remi Hakim Date: Mon, 28 Apr 2014 16:30:32 -0400 Subject: [PATCH 23/33] Update jmx.yaml.example --- conf.d/jmx.yaml.example | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/conf.d/jmx.yaml.example b/conf.d/jmx.yaml.example index ff1d56ccc3..2d793a0f6b 100644 --- a/conf.d/jmx.yaml.example +++ b/conf.d/jmx.yaml.example @@ -1,22 +1,22 @@ init_config: instances: -# - host: localhost -# port: 7199 -# name: jmx_instance -# user: username -# password: password -# #java_bin_path: /path/to/java #Optional, should be set if the agent cannot find your java executable -# #trust_store_path: /path/to/trustStore.jks # Optional, should be set if ssl is enabled -# #trust_store_password: password -# tags: +# - host: localhost +# port: 7199 +# name: jmx_instance +# user: username +# password: password +# #java_bin_path: /path/to/java #Optional, should be set if the agent cannot find your java executable +# #trust_store_path: /path/to/trustStore.jks # Optional, should be set if ssl is enabled +# #trust_store_password: password +# tags: # env: stage # newTag: test # # # List of metrics to be collected by the integration # # Read http://docs.datadoghq.com/integrations/java/ to learn how to customize it -# conf: +# conf: # - include: # domain: my_domain # bean: my_bean From 887b49ae81400cd653d3398a85ae4b61e9365f9e Mon Sep 17 00:00:00 2001 From: graemej Date: Tue, 29 Apr 2014 17:07:06 -0400 Subject: [PATCH 24/33] add get_ prefix and raise on missing url --- checks.d/mesos.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/checks.d/mesos.py b/checks.d/mesos.py index ac4bb2c426..7449e629fe 100644 --- a/checks.d/mesos.py +++ b/checks.d/mesos.py @@ -9,7 +9,7 @@ class Mesos(AgentCheck): def check(self, instance): if 'url' not in instance: - self.log.info("Skipping instance, no url found.") + raise Exception('Mesos instance missing "url" value.') return # Load values from the instance config @@ -17,7 +17,7 @@ def check(self, instance): default_timeout = self.init_config.get('default_timeout', 5) timeout = float(instance.get('timeout', default_timeout)) - response = self.master_roles(url, timeout) + response = self.get_master_roles(url, timeout) if response is not None: for role in response['roles']: tags = ['mesos','role:' + role['name']] @@ -28,12 +28,12 @@ def check(self, instance): if attr in resources: self.gauge('mesos.role.' + attr, resources[attr], tags=tags) - response = self.master_stats(url, timeout) + response = self.get_master_stats(url, timeout) if response is not None: for key in iter(response): self.gauge('mesos.stats.' + key, response[key], tags=['mesos']) - response = self.master_state(url, timeout) + response = self.get_master_state(url, timeout) if response is not None: for attr in ['deactivated_slaves','failed_tasks','finished_tasks','killed_tasks','lost_tasks','staged_tasks','started_tasks']: tags = ['mesos'] @@ -53,13 +53,13 @@ def check(self, instance): if attr in resources: self.gauge('mesos.state.slave.' 
+ attr, resources[attr], tags=tags) - def master_roles(self, url, timeout): + def get_master_roles(self, url, timeout): return self.get_json(url + "/master/roles.json", timeout) - def master_stats(self, url, timeout): + def get_master_stats(self, url, timeout): return self.get_json(url + "/master/stats.json", timeout) - def master_state(self, url, timeout): + def get_master_state(self, url, timeout): return self.get_json(url + "/master/state.json", timeout) def get_json(self, url, timeout): From aebea2f12e6e9d374f6ba16e57b7a16d7ed0942d Mon Sep 17 00:00:00 2001 From: graemej Date: Tue, 29 Apr 2014 17:17:07 -0400 Subject: [PATCH 25/33] basic monitoring support for Mesosphere Marathon --- checks.d/marathon.py | 95 ++++++++++++++++++++++++++++++++++++ conf.d/marathon.yaml.example | 7 +++ 2 files changed, 102 insertions(+) create mode 100644 checks.d/marathon.py create mode 100644 conf.d/marathon.yaml.example diff --git a/checks.d/marathon.py b/checks.d/marathon.py new file mode 100644 index 0000000000..b7b531f054 --- /dev/null +++ b/checks.d/marathon.py @@ -0,0 +1,95 @@ +import time +import requests + +from checks import AgentCheck +from util import json, headers +from hashlib import md5 +import urllib2 + +class Marathon(AgentCheck): + def check(self, instance): + if 'url' not in instance: + raise Exception('Marathon instance missing "url" value.') + return + + # Load values from the instance config + url = instance['url'] + default_timeout = self.init_config.get('default_timeout', 5) + timeout = float(instance.get('timeout', default_timeout)) + + response = self.get_v2_apps(url, timeout) + if response is not None: + self.gauge('marathon.apps', len(response['apps']), tags=['marathon']) + for app in response['apps']: + tags = ['marathon', 'app_id:' + app['id'], 'version:' + app['version']] + for attr in ['taskRateLimit','instances','cpus','mem','tasksStaged','tasksRunning']: + self.gauge('marathon.' + attr, app[attr], tags=tags) + versions_reply = self.get_v2_app_versions(url, app['id'], timeout) + if versions_reply is not None: + self.gauge('marathon.versions', len(versions_reply['versions']), tags=tags) + + def get_v2_apps(self, url, timeout): + # Use a hash of the URL as an aggregation key + aggregation_key = md5(url).hexdigest() + + try: + response = requests.get(url + "/v2/apps", timeout=timeout) + apps = response.json() + return apps + except requests.exceptions.Timeout as e: + # If there's a timeout + self.timeout_event(url, timeout, aggregation_key) + return None + + if r.status_code != 200: + self.status_code_event(url, r, aggregation_key) + return None + + def get_v2_app_versions(self, url, app_id, timeout): + # Use a hash of the URL as an aggregation key + aggregation_key = md5(url).hexdigest() + + try: + response = requests.get(url + "/v2/apps/" + app_id + "/versions", timeout=timeout) + apps = response.json() + return apps + except requests.exceptions.Timeout as e: + # If there's a timeout + self.timeout_event(url, timeout, aggregation_key) + return None + + if r.status_code != 200: + self.status_code_event(url, r, aggregation_key) + return None + + def timeout_event(self, url, timeout, aggregation_key): + self.event({ + 'timestamp': int(time.time()), + 'event_type': 'http_check', + 'msg_title': 'URL timeout', + 'msg_text': '%s timed out after %s seconds.' 
% (url, timeout), + 'aggregation_key': aggregation_key + }) + + def status_code_event(self, url, r, aggregation_key): + self.event({ + 'timestamp': int(time.time()), + 'event_type': 'http_check', + 'msg_title': 'Invalid reponse code for %s' % url, + 'msg_text': '%s returned a status of %s' % (url, r.status_code), + 'aggregation_key': aggregation_key + }) + +if __name__ == '__main__': + check, instances = Marathon.from_yaml('/etc/dd-agent/conf.d/marathon.yaml') + for instance in instances: + print "\nRunning the check against url: %s" % (instance['url']) + check.check(instance) + if check.has_events(): + print 'Events: %s' % (check.get_events()) + + i = 0 + print 'Metrics:\n' + for metric in check.get_metrics(): + print " %d: %s" % (i, metric) + i += 1 \ No newline at end of file diff --git a/conf.d/marathon.yaml.example b/conf.d/marathon.yaml.example new file mode 100644 index 0000000000..b230b0b6cd --- /dev/null +++ b/conf.d/marathon.yaml.example @@ -0,0 +1,7 @@ +init_config: +# time to wait on a Marathon API request +# default_timeout: 5 + +instances: +# url: the API endpoint of your Marathon master +# - url: "https://server:port" \ No newline at end of file From 418435534ffdac7838db68087f822adac0facc55 Mon Sep 17 00:00:00 2001 From: graemej Date: Wed, 30 Apr 2014 13:23:06 -0400 Subject: [PATCH 26/33] add instance tags --- checks.d/mesos.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/checks.d/mesos.py b/checks.d/mesos.py index 7449e629fe..19b0eaf6f8 100644 --- a/checks.d/mesos.py +++ b/checks.d/mesos.py @@ -14,13 +14,14 @@ def check(self, instance): # Load values from the instance config url = instance['url'] + instance_tags = instance.get('tags', []) default_timeout = self.init_config.get('default_timeout', 5) timeout = float(instance.get('timeout', default_timeout)) response = self.get_master_roles(url, timeout) if response is not None: for role in response['roles']: - tags = ['mesos','role:' + role['name']] + tags = ['role:' + role['name']] + instance_tags self.gauge('mesos.role.frameworks', len(role['frameworks']), tags=tags) self.gauge('mesos.role.weight', role['weight'], tags=tags) resources = role['resources'] @@ -30,24 +31,25 @@ def check(self, instance): response = self.get_master_stats(url, timeout) if response is not None: + tags = instance_tags for key in iter(response): - self.gauge('mesos.stats.' + key, response[key], tags=['mesos']) + self.gauge('mesos.stats.' + key, response[key], tags=tags) response = self.get_master_state(url, timeout) if response is not None: + tags = instance_tags for attr in ['deactivated_slaves','failed_tasks','finished_tasks','killed_tasks','lost_tasks','staged_tasks','started_tasks']: - tags = ['mesos'] self.gauge('mesos.state.' + attr, response[attr], tags=tags) for framework in response['frameworks']: - tags = ['mesos','framework:' + framework['id']] + tags = ['framework:' + framework['id']] + instance_tags resources = framework['resources'] for attr in ['cpus','mem']: if attr in resources: self.gauge('mesos.state.framework.' 
+ attr, resources[attr], tags=tags) for slave in response['slaves']: - tags = ['mesos','slave:' + slave['id']] + tags = ['mesos','slave:' + slave['id']] + instance_tags resources = slave['resources'] for attr in ['cpus','mem','disk']: if attr in resources: From 87623ac676dc1115b721d89289ccbc905ffa1097 Mon Sep 17 00:00:00 2001 From: graemej Date: Wed, 30 Apr 2014 13:31:50 -0400 Subject: [PATCH 27/33] add instance tags --- checks.d/marathon.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/checks.d/marathon.py b/checks.d/marathon.py index b7b531f054..340e5aa4e1 100644 --- a/checks.d/marathon.py +++ b/checks.d/marathon.py @@ -14,14 +14,15 @@ def check(self, instance): # Load values from the instance config url = instance['url'] + instance_tags = instance.get('tags', []) default_timeout = self.init_config.get('default_timeout', 5) timeout = float(instance.get('timeout', default_timeout)) response = self.get_v2_apps(url, timeout) if response is not None: - self.gauge('marathon.apps', len(response['apps']), tags=['marathon']) + self.gauge('marathon.apps', len(response['apps']), tags=instance_tags) for app in response['apps']: - tags = ['marathon', 'app_id:' + app['id'], 'version:' + app['version']] + tags = ['app_id:' + app['id'], 'version:' + app['version']] + instance_tags for attr in ['taskRateLimit','instances','cpus','mem','tasksStaged','tasksRunning']: self.gauge('marathon.' + attr, app[attr], tags=tags) versions_reply = self.get_v2_app_versions(url, app['id'], timeout) From 7b0dea036dcda5ccdcd3883b6adfbdaad86fd3dd Mon Sep 17 00:00:00 2001 From: Rudy Date: Thu, 1 May 2014 09:40:56 -0400 Subject: [PATCH 28/33] CI| Remove Unused Code making pylint fail --- checks/datadog.py | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/checks/datadog.py b/checks/datadog.py index 46ec5efe90..fa5886a367 100644 --- a/checks/datadog.py +++ b/checks/datadog.py @@ -565,20 +565,6 @@ def check(self, agentConfig, move_end=True): return {} -def testDogStream(): - import logging - - logger = logging.getLogger("ddagent.checks.datadog") - logger.setLevel(logging.DEBUG) - logger.addHandler(logging.StreamHandler()) - dogstream = Dogstream(logger) - - while True: - events = dogstream.check({'api_key':'my_apikey', 'dogstream_log': sys.argv[1]}, move_end=True) - for e in events: - print "Event:", e - time.sleep(5) - def testddForwarder(): import logging From e331e03356545f1dc66d15e45448da7bd9448b76 Mon Sep 17 00:00:00 2001 From: Rudy Date: Thu, 1 May 2014 10:22:31 -0400 Subject: [PATCH 29/33] CI| pylint errors in graphite.py --- graphite.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/graphite.py b/graphite.py index 7066f1cedd..fb77ad4212 100644 --- a/graphite.py +++ b/graphite.py @@ -82,7 +82,8 @@ def _postMetric(self, name, host, device, datapoint): ts = datapoint[0] value = datapoint[1] - self.app.appendMetric("graphite", name, host, device, ts, value) + if self.app is not None: + self.app.appendMetric("graphite", name, host, device, ts, value) def _processMetric(self, metric, datapoint): """Parse the metric name to fetch (host, metric, device) and @@ -114,7 +115,8 @@ def _decode(self, data): self.stream.read_bytes(4, self._on_read_header) def start_graphite_listener(port): - echo_server = GraphiteServer() + from util import get_hostname + echo_server = GraphiteServer(None,get_hostname(None)) echo_server.listen(port) IOLoop.instance().start() From 7e2f50764b0b3e9ac885146611966c34c69b6e12 Mon Sep 17 00:00:00 2001 From: Rudy Date: Mon, 5 May 2014 12:07:57 
-0400 Subject: [PATCH 30/33] Style| Forgot space after comma --- graphite.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/graphite.py b/graphite.py index fb77ad4212..44187e0881 100644 --- a/graphite.py +++ b/graphite.py @@ -116,7 +116,7 @@ def _decode(self, data): def start_graphite_listener(port): from util import get_hostname - echo_server = GraphiteServer(None,get_hostname(None)) + echo_server = GraphiteServer(None, get_hostname(None)) echo_server.listen(port) IOLoop.instance().start() From 924e46eb3bd2b6ffdc060a901c0f168069525c47 Mon Sep 17 00:00:00 2001 From: Arthur Neves Date: Mon, 5 May 2014 12:33:39 -0400 Subject: [PATCH 31/33] Add delete_misses/delete_hits to memcache See https://github.com/memcached/memcached/blob/master/doc/protocol.txt, there is delete_* stats we could use to report to datadog. --- checks.d/mcache.py | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/checks.d/mcache.py b/checks.d/mcache.py index 4732212b22..57231e81c5 100644 --- a/checks.d/mcache.py +++ b/checks.d/mcache.py @@ -9,34 +9,37 @@ # version string Version string of this server # pointer_size 32 Default size of pointers on the host OS # (generally 32 or 64) -# rusage_user 32u:32u Accumulated user time for this process +# rusage_user 32u:32u Accumulated user time for this process # (seconds:microseconds) -# rusage_system 32u:32u Accumulated system time for this process +# rusage_system 32u:32u Accumulated system time for this process # (seconds:microseconds) # curr_items 32u Current number of items stored by the server -# total_items 32u Total number of items stored by this server +# total_items 32u Total number of items stored by this server # ever since it started -# bytes 64u Current number of bytes used by this server +# bytes 64u Current number of bytes used by this server # to store items # curr_connections 32u Number of open connections -# total_connections 32u Total number of connections opened since +# total_connections 32u Total number of connections opened since # the server started running -# connection_structures 32u Number of connection structures allocated +# connection_structures 32u Number of connection structures allocated # by the server # cmd_get 64u Cumulative number of retrieval requests # cmd_set 64u Cumulative number of storage requests -# get_hits 64u Number of keys that have been requested and +# get_hits 64u Number of keys that have been requested and # found present -# get_misses 64u Number of items that have been requested +# get_misses 64u Number of items that have been requested # and not found +# delete_misses 64u Number of deletions reqs for missing keys +# delete_hits 64u Number of deletion reqs resulting in +# an item being removed. # evictions 64u Number of valid items removed from cache # to free memory for new items -# bytes_read 64u Total number of bytes read by this server +# bytes_read 64u Total number of bytes read by this server # from network -# bytes_written 64u Total number of bytes sent by this server to +# bytes_written 64u Total number of bytes sent by this server to # network # limit_maxbytes 32u Number of bytes this server is allowed to -# use for storage. +# use for storage. # threads 32u Number of worker threads requested. 
 #                                  (see doc/threads.txt)
 
 # >>> mc.get_stats()
@@ -77,6 +80,8 @@ class Memcache(AgentCheck):
         "cmd_flush",
         "get_hits",
         "get_misses",
+        "delete_misses",
+        "delete_hits",
         "evictions",
         "bytes_read",
         "bytes_written",

From 33d774f36bcbf507807272bd7497bd693f13ef60 Mon Sep 17 00:00:00 2001
From: Remi Hakim
Date: Mon, 5 May 2014 14:11:08 -0400
Subject: [PATCH 32/33] Fix memcache test since new metrics have been added

---
 tests/test_mcache.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/test_mcache.py b/tests/test_mcache.py
index f68c1a40fd..311976c98e 100644
--- a/tests/test_mcache.py
+++ b/tests/test_mcache.py
@@ -46,8 +46,8 @@ def testMetrics(self):
         # Check that we got metrics from 3 hosts (aka all but the dummy host)
         self.assertEquals(len([t for t in r if t[0] == "memcache.total_items"]), 3, r)
 
-        # Check that we got 21 metrics for a specific host
-        self.assertEquals(len([t for t in r if t[3].get('tags') == ["instance:mythirdtag"]]), 21, r)
+        # Check that we got 23 metrics for a specific host
+        self.assertEquals(len([t for t in r if t[3].get('tags') == ["instance:mythirdtag"]]), 23, r)
 
     def testTagging(self):
         instance = {
@@ -64,7 +64,7 @@ def testTagging(self):
         r = self.c.get_metrics()
 
         # Check the tags
-        self.assertEquals(len([t for t in r if t[3].get('tags') == ["regular_old_tag"]]), 21, r)
+        self.assertEquals(len([t for t in r if t[3].get('tags') == ["regular_old_tag"]]), 23, r)
 
         conf = {
             'memcache_server': 'localhost',
@@ -80,7 +80,7 @@ def testTagging(self):
         r = self.c.get_metrics()
 
         # Check the tags
-        self.assertEquals(len([t for t in r if t[3].get('tags') == ["instance:localhost_11211"]]), 21, r)
+        self.assertEquals(len([t for t in r if t[3].get('tags') == ["instance:localhost_11211"]]), 23, r)
 
     def testDummyHost(self):
         new_conf = self.c.parse_agent_config({"memcache_instance_1": "dummy:11211:myothertag"})

From 05fe34d741e1a52f5427387fd63ed560c867038d Mon Sep 17 00:00:00 2001
From: Remi Hakim
Date: Mon, 5 May 2014 15:32:23 -0400
Subject: [PATCH 33/33] Make docker integration compatible with latest
 versions of docker

---
 checks.d/docker.py | 28 +++++++++++++++++++++++-----
 1 file changed, 23 insertions(+), 5 deletions(-)

diff --git a/checks.d/docker.py b/checks.d/docker.py
index 241d631bdc..22c6c058fa 100644
--- a/checks.d/docker.py
+++ b/checks.d/docker.py
@@ -13,7 +13,7 @@
 LXC_METRICS = [
     {
         "cgroup": "memory",
-        "file": "lxc/%s/memory.stat",
+        "file": "%s/%s/memory.stat",
         "metrics": {
             "active_anon": ("docker.mem.active_anon", "gauge"),
             "active_file": ("docker.mem.active_file", "gauge"),
@@ -47,7 +47,7 @@
     },
     {
         "cgroup": "cpuacct",
-        "file": "lxc/%s/cpuacct.stat",
+        "file": "%s/%s/cpuacct.stat",
         "metrics": {
             "user": ("docker.cpu.user", "gauge"),
             "system": ("docker.cpu.system", "gauge"),
@@ -103,12 +103,30 @@ def unix_open(self, req):
 class Docker(AgentCheck):
     def __init__(self, *args, **kwargs):
         super(Docker, self).__init__(*args, **kwargs)
-        urllib2.install_opener(urllib2.build_opener(UnixSocketHandler()))
         self._mounpoints = {}
+        self.cgroup_path_prefix = None # Depending on the version
         for metric in LXC_METRICS:
             self._mounpoints[metric["cgroup"]] = self._find_cgroup(metric["cgroup"])
+        self._path_prefix = None
+
+    @property
+    def path_prefix(self):
+        if self._path_prefix is None:
+            metric = LXC_METRICS[0]
+            mountpoint = self._mounpoints[metric["cgroup"]]
+            stat_file_lxc = os.path.join(mountpoint, "lxc")
+            stat_file_docker = os.path.join(mountpoint, "docker")
+
+            if os.path.exists(stat_file_lxc):
+                self._path_prefix = "lxc"
+            elif os.path.exists(stat_file_docker):
+                self._path_prefix = "docker"
+            else:
+                raise Exception("Cannot find Docker cgroup file. If you are using Docker 0.9 or 0.10, it is a known bug in Docker fixed in Docker 0.10.1")
+        return self._path_prefix
 
     def check(self, instance):
+        urllib2.install_opener(urllib2.build_opener(UnixSocketHandler())) # We need to reinstall the opener every time as it gets uninstalled
         tags = instance.get("tags") or []
         containers = self._get_containers(instance)
         if not containers:
@@ -143,7 +161,7 @@ def check(self, instance):
                 getattr(self, metric_type)(dd_key, int(container[key]), tags=container_tags)
             for metric in LXC_METRICS:
                 mountpoint = self._mounpoints[metric["cgroup"]]
-                stat_file = os.path.join(mountpoint, metric["file"] % container["Id"])
+                stat_file = os.path.join(mountpoint, metric["file"] % (self.path_prefix, container["Id"]))
                 stats = self._parse_cgroup_file(stat_file)
                 for key, (dd_key, metric_type) in metric["metrics"].items():
                     if key in stats:
@@ -213,7 +231,7 @@ def _parse_cgroup_file(self, file_):
         try:
             fp = open(file_)
         except IOError:
-            raise IOError("Can't open %s. If you are using Docker 0.9.0 or higher, the Datadog agent is not yet compatible with these versions. Please get in touch with Datadog Support for more information" % file_)
+            raise IOError("Can't open %s. If you are using Docker 0.9 or 0.10, it is a known bug in Docker fixed in Docker 0.10.1" % file_)
             return dict(map(lambda x: x.split(), fp.read().splitlines()))
         finally:
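
Taken together, patches 07, 10-16 and 22 leave AgentCheck.service_check with the
signature service_check(check_name, status, tags=None, timestamp=None,
hostname=None, check_run_id=None), statuses exposed as
AgentCheck.OK/WARNING/CRITICAL/UNKNOWN/NONE, and saved results drained through
get_service_checks(). A minimal sketch of a custom check using the new API
follows; it assumes the patched checks package above is importable, and the
PortCheck class, check name, TCP probe and tag are illustrative only, not part
of the patches.

    import socket

    from checks import AgentCheck

    class PortCheck(AgentCheck):
        # Hypothetical check: reports OK or CRITICAL based on a TCP connect.
        def check(self, instance):
            host = instance.get('host', 'localhost')
            port = int(instance.get('port', 80))
            try:
                sock = socket.create_connection((host, port), timeout=5)
                sock.close()
                status = AgentCheck.OK
            except socket.error:
                status = AgentCheck.CRITICAL
            # Only check_name and status are required; tags, timestamp,
            # hostname and check_run_id are optional (patches 13-16, 22).
            self.service_check('port.can_connect', status, tags=['port:%s' % port])

    check = PortCheck('port', {}, {})
    check.check({'host': 'localhost', 'port': 22})
    print check.get_service_checks()  # the saved list is cleared by this call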
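Patches 26 and 27 make the Mesos and Marathon checks merge a per-instance tags
list into every metric they emit, but the shipped yaml examples are not updated
to show the new key. A configuration exercising it might look like the sketch
below; the URL, timeout and tag values are placeholders, and only the url,
timeout and tags keys come from the patched code.

    init_config:
      default_timeout: 5

    instances:
      - url: "http://localhost:5050"
        timeout: 8
        tags:
          - cluster:staging
          - owner:platform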