Skip to content

Commit

Permalink
Merge with master branch to keep this one on track
Browse files Browse the repository at this point in the history
  • Loading branch information
Remi Hakim committed May 5, 2014
2 parents 603e547 + 1effb39 commit e2b7257
Show file tree
Hide file tree
Showing 21 changed files with 439 additions and 98 deletions.
4 changes: 2 additions & 2 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -36,9 +36,9 @@ before_script:
- psql -U postgres -c "GRANT ALL PRIVILEGES ON DATABASE datadog_test to datadog"
- psql -U datadog -c "CREATE TABLE Persons (PersonID int, LastName varchar(255), FirstName varchar(255), Address varchar(255), City varchar(255))" datadog_test
- curl -L https://raw.github.com/DataDog/dd-agent/master/tests/haproxy.cfg > /tmp/haproxy.cfg
- curl -L http://apache.mirrors.multidist.eu/cassandra/2.0.6/apache-cassandra-2.0.6-bin.tar.gz > /tmp/cassandra2.tar.gz
- curl -L http://apache.mirrors.multidist.eu/cassandra/2.0.7/apache-cassandra-2.0.7-bin.tar.gz > /tmp/cassandra2.tar.gz
- tar -xzvf /tmp/cassandra2.tar.gz -C /tmp
- sudo /tmp/apache-cassandra-2.0.6/bin/cassandra
- sudo /tmp/apache-cassandra-2.0.7/bin/cassandra
- sudo service haproxy restart
- sudo bash -c "curl -L https://raw.github.com/DataDog/dd-agent/master/tests/tomcat_cfg.xml > /etc/tomcat6/server.xml"
- sudo bash -c "curl -L https://raw.github.com/DataDog/dd-agent/master/tests/tomcat6 >> /etc/default/tomcat6"
Expand Down
20 changes: 19 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,21 @@
Changes
=======
# 5.0.0 / Unreleased

### Integrations affected

### Changes
* [BUGFIX] Fix incorrect open file descriptors metric name in process check: See [#904]

# 4.2.2 / 04-25-2014

**Windows Only**

### Integrations affected
* Redis

### Changes
* [FEATURE] Support Redis check on Windows: See [#917]

# 4.2.1 / 04-09-2014

Expand Down Expand Up @@ -952,6 +968,8 @@ If you use ganglia, you want this version.
[#883]: https://github.com/DataDog/dd-agent/issues/883
[#893]: https://github.com/DataDog/dd-agent/issues/893
[#894]: https://github.com/DataDog/dd-agent/issues/894
[#904]: https://github.com/DataDog/dd-agent/issues/904
[#917]: https://github.com/DataDog/dd-agent/issues/917
[@CaptTofu]: https://github.com/CaptTofu
[@brettlangdon]: https://github.com/brettlangdon
[@charles-dyfis-net]: https://github.com/charles-dyfis-net
Expand All @@ -976,4 +994,4 @@ If you use ganglia, you want this version.
[@steeve]: https://github.com/steeve
[@stefan-mees]: https://github.com/stefan-mees
[@tomduckering]: https://github.com/tomduckering
[@walkeran]: https://github.com/walkeran
[@walkeran]: https://github.com/walkeran
28 changes: 23 additions & 5 deletions checks.d/docker.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
LXC_METRICS = [
{
"cgroup": "memory",
"file": "lxc/%s/memory.stat",
"file": "%s/%s/memory.stat",
"metrics": {
"active_anon": ("docker.mem.active_anon", "gauge"),
"active_file": ("docker.mem.active_file", "gauge"),
Expand Down Expand Up @@ -47,7 +47,7 @@
},
{
"cgroup": "cpuacct",
"file": "lxc/%s/cpuacct.stat",
"file": "%s/%s/cpuacct.stat",
"metrics": {
"user": ("docker.cpu.user", "gauge"),
"system": ("docker.cpu.system", "gauge"),
Expand Down Expand Up @@ -103,12 +103,30 @@ def unix_open(self, req):
class Docker(AgentCheck):
def __init__(self, *args, **kwargs):
super(Docker, self).__init__(*args, **kwargs)
urllib2.install_opener(urllib2.build_opener(UnixSocketHandler()))
self._mounpoints = {}
self.cgroup_path_prefix = None # Depending on the version
for metric in LXC_METRICS:
self._mounpoints[metric["cgroup"]] = self._find_cgroup(metric["cgroup"])
self._path_prefix = None

@property
def path_prefix(self):
    # Lazily resolve (and cache) whether per-container cgroup stat files
    # live under an "lxc/" or a "docker/" subdirectory of the cgroup
    # mountpoint — the layout changed between Docker versions, so we probe
    # the filesystem once using the first tracked cgroup's mountpoint.
    if self._path_prefix is None:
        metric = LXC_METRICS[0]
        mountpoint = self._mounpoints[metric["cgroup"]]
        stat_file_lxc = os.path.join(mountpoint, "lxc")
        stat_file_docker = os.path.join(mountpoint, "docker")

        if os.path.exists(stat_file_lxc):
            self._path_prefix = "lxc"
        elif os.path.exists(stat_file_docker):
            self._path_prefix = "docker"
        else:
            # Neither layout found: known Docker 0.9/0.10 cgroup bug.
            raise Exception("Cannot find Docker cgroup file. If you are using Docker 0.9 or 0.10, it is a known bug in Docker fixed in Docker 0.10.1")
    return self._path_prefix

def check(self, instance):
urllib2.install_opener(urllib2.build_opener(UnixSocketHandler())) # We need to reinstall the opener every time as it gets uninstalled
tags = instance.get("tags") or []
containers = self._get_containers(instance)
if not containers:
Expand Down Expand Up @@ -143,7 +161,7 @@ def check(self, instance):
getattr(self, metric_type)(dd_key, int(container[key]), tags=container_tags)
for metric in LXC_METRICS:
mountpoint = self._mounpoints[metric["cgroup"]]
stat_file = os.path.join(mountpoint, metric["file"] % container["Id"])
stat_file = os.path.join(mountpoint, metric["file"] % (self.path_prefix, container["Id"]))
stats = self._parse_cgroup_file(stat_file)
for key, (dd_key, metric_type) in metric["metrics"].items():
if key in stats:
Expand Down Expand Up @@ -213,7 +231,7 @@ def _parse_cgroup_file(self, file_):
try:
fp = open(file_)
except IOError:
raise IOError("Can't open %s. If you are using Docker 0.9.0 or higher, the Datadog agent is not yet compatible with these versions. Please get in touch with Datadog Support for more information" % file_)
raise IOError("Can't open %s. If you are using Docker 0.9 or 0.10, it is a known bug in Docker fixed in Docker 0.10.1" % file_)
return dict(map(lambda x: x.split(), fp.read().splitlines()))

finally:
Expand Down
96 changes: 96 additions & 0 deletions checks.d/marathon.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
import time
import requests

from checks import AgentCheck
from util import json, headers
from hashlib import md5
import urllib2

class Marathon(AgentCheck):
    """Agent check that collects app-level metrics from a Marathon master.

    Queries the Marathon v2 REST API (`/v2/apps` and
    `/v2/apps/<id>/versions`) and emits gauges for app counts, per-app
    resource settings and task states, and version counts. HTTP timeouts
    and non-200 responses are reported as Datadog events instead of
    raising.
    """

    def check(self, instance):
        """Run the check for a single configured instance.

        Raises:
            Exception: if the instance config has no "url" entry.
        """
        if 'url' not in instance:
            raise Exception('Marathon instance missing "url" value.')

        # Load values from the instance config
        url = instance['url']
        instance_tags = instance.get('tags', [])
        default_timeout = self.init_config.get('default_timeout', 5)
        timeout = float(instance.get('timeout', default_timeout))

        response = self.get_v2_apps(url, timeout)
        if response is not None:
            self.gauge('marathon.apps', len(response['apps']), tags=instance_tags)
            for app in response['apps']:
                tags = ['app_id:' + app['id'], 'version:' + app['version']] + instance_tags
                for attr in ['taskRateLimit', 'instances', 'cpus', 'mem', 'tasksStaged', 'tasksRunning']:
                    self.gauge('marathon.' + attr, app[attr], tags=tags)
                versions_reply = self.get_v2_app_versions(url, app['id'], timeout)
                if versions_reply is not None:
                    self.gauge('marathon.versions', len(versions_reply['versions']), tags=tags)

    def get_v2_apps(self, url, timeout):
        """GET <url>/v2/apps and return the parsed JSON, or None on failure."""
        # Use a hash of the URL as an aggregation key
        aggregation_key = md5(url).hexdigest()

        try:
            response = requests.get(url + "/v2/apps", timeout=timeout)
        except requests.exceptions.Timeout:
            # If there's a timeout, surface it as an event rather than crashing
            self.timeout_event(url, timeout, aggregation_key)
            return None

        # BUGFIX: this check was previously placed after an unconditional
        # return (and referenced an undefined name), so it never ran.
        if response.status_code != 200:
            self.status_code_event(url, response, aggregation_key)
            return None

        return response.json()

    def get_v2_app_versions(self, url, app_id, timeout):
        """GET <url>/v2/apps/<app_id>/versions; return parsed JSON or None."""
        # Use a hash of the URL as an aggregation key
        aggregation_key = md5(url).hexdigest()

        try:
            response = requests.get(url + "/v2/apps/" + app_id + "/versions", timeout=timeout)
        except requests.exceptions.Timeout:
            # If there's a timeout, surface it as an event rather than crashing
            self.timeout_event(url, timeout, aggregation_key)
            return None

        # BUGFIX: previously unreachable and referencing an undefined name.
        if response.status_code != 200:
            self.status_code_event(url, response, aggregation_key)
            return None

        return response.json()

    def timeout_event(self, url, timeout, aggregation_key):
        """Emit a Datadog event recording that `url` timed out."""
        self.event({
            'timestamp': int(time.time()),
            'event_type': 'http_check',
            'msg_title': 'URL timeout',
            'msg_text': '%s timed out after %s seconds.' % (url, timeout),
            'aggregation_key': aggregation_key
        })

    def status_code_event(self, url, r, aggregation_key):
        """Emit a Datadog event recording a non-200 response from `url`."""
        self.event({
            'timestamp': int(time.time()),
            'event_type': 'http_check',
            # BUGFIX: corrected "reponse" typo in the event title.
            'msg_title': 'Invalid response code for %s' % url,
            'msg_text': '%s returned a status of %s' % (url, r.status_code),
            'aggregation_key': aggregation_key
        })

if __name__ == '__main__':
check, instances = Marathon.from_yaml('/etc/dd-agent/conf.d/marathon.yaml')
for instance in instances:
print "\nRunning the check against url: %s" % (instance['url'])
check.check(instance)
if check.has_events():
print 'Events: %s' % (check.get_events())

i = 0
print 'Metrics:\n'
for metric in check.get_metrics():
print " %d: %s" % (i, metric)
i += 1
27 changes: 16 additions & 11 deletions checks.d/mcache.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,34 +9,37 @@
# version string Version string of this server
# pointer_size 32 Default size of pointers on the host OS
# (generally 32 or 64)
# rusage_user 32u:32u Accumulated user time for this process
# rusage_user 32u:32u Accumulated user time for this process
# (seconds:microseconds)
# rusage_system 32u:32u Accumulated system time for this process
# rusage_system 32u:32u Accumulated system time for this process
# (seconds:microseconds)
# curr_items 32u Current number of items stored by the server
# total_items 32u Total number of items stored by this server
# total_items 32u Total number of items stored by this server
# ever since it started
# bytes 64u Current number of bytes used by this server
# bytes 64u Current number of bytes used by this server
# to store items
# curr_connections 32u Number of open connections
# total_connections 32u Total number of connections opened since
# total_connections 32u Total number of connections opened since
# the server started running
# connection_structures 32u Number of connection structures allocated
# connection_structures 32u Number of connection structures allocated
# by the server
# cmd_get 64u Cumulative number of retrieval requests
# cmd_set 64u Cumulative number of storage requests
# get_hits 64u Number of keys that have been requested and
# get_hits 64u Number of keys that have been requested and
# found present
# get_misses 64u Number of items that have been requested
# get_misses 64u Number of items that have been requested
# and not found
# delete_misses 64u Number of deletions reqs for missing keys
# delete_hits 64u Number of deletion reqs resulting in
# an item being removed.
# evictions 64u Number of valid items removed from cache
# to free memory for new items
# bytes_read 64u Total number of bytes read by this server
# bytes_read 64u Total number of bytes read by this server
# from network
# bytes_written 64u Total number of bytes sent by this server to
# bytes_written 64u Total number of bytes sent by this server to
# network
# limit_maxbytes 32u Number of bytes this server is allowed to
# use for storage.
# use for storage.
# threads 32u Number of worker threads requested.
# (see doc/threads.txt)
# >>> mc.get_stats()
Expand Down Expand Up @@ -77,6 +80,8 @@ class Memcache(AgentCheck):
"cmd_flush",
"get_hits",
"get_misses",
"delete_misses",
"delete_hits",
"evictions",
"bytes_read",
"bytes_written",
Expand Down
115 changes: 115 additions & 0 deletions checks.d/mesos.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
import time
import requests

from checks import AgentCheck
from util import json, headers
from hashlib import md5
import urllib2

class Mesos(AgentCheck):
    """Agent check that collects cluster metrics from a Mesos master.

    Queries the master's JSON endpoints (`/master/roles.json`,
    `/master/stats.json`, `/master/state.json`) and emits gauges for
    roles, raw master stats, framework resources and slave resources.
    HTTP timeouts and non-200 responses are reported as Datadog events
    instead of raising.
    """

    def check(self, instance):
        """Run the check for a single configured instance.

        Raises:
            Exception: if the instance config has no "url" entry.
        """
        if 'url' not in instance:
            raise Exception('Mesos instance missing "url" value.')

        # Load values from the instance config
        url = instance['url']
        instance_tags = instance.get('tags', [])
        default_timeout = self.init_config.get('default_timeout', 5)
        timeout = float(instance.get('timeout', default_timeout))

        response = self.get_master_roles(url, timeout)
        if response is not None:
            for role in response['roles']:
                tags = ['role:' + role['name']] + instance_tags
                self.gauge('mesos.role.frameworks', len(role['frameworks']), tags=tags)
                self.gauge('mesos.role.weight', role['weight'], tags=tags)
                resources = role['resources']
                for attr in ['cpus', 'mem']:
                    if attr in resources:
                        self.gauge('mesos.role.' + attr, resources[attr], tags=tags)

        response = self.get_master_stats(url, timeout)
        if response is not None:
            tags = instance_tags
            for key in iter(response):
                self.gauge('mesos.stats.' + key, response[key], tags=tags)

        response = self.get_master_state(url, timeout)
        if response is not None:
            tags = instance_tags
            for attr in ['deactivated_slaves', 'failed_tasks', 'finished_tasks', 'killed_tasks', 'lost_tasks', 'staged_tasks', 'started_tasks']:
                self.gauge('mesos.state.' + attr, response[attr], tags=tags)

            for framework in response['frameworks']:
                tags = ['framework:' + framework['id']] + instance_tags
                resources = framework['resources']
                for attr in ['cpus', 'mem']:
                    if attr in resources:
                        self.gauge('mesos.state.framework.' + attr, resources[attr], tags=tags)

            for slave in response['slaves']:
                tags = ['mesos', 'slave:' + slave['id']] + instance_tags
                resources = slave['resources']
                for attr in ['cpus', 'mem', 'disk']:
                    if attr in resources:
                        self.gauge('mesos.state.slave.' + attr, resources[attr], tags=tags)

    def get_master_roles(self, url, timeout):
        """Return parsed /master/roles.json, or None on failure."""
        return self.get_json(url + "/master/roles.json", timeout)

    def get_master_stats(self, url, timeout):
        """Return parsed /master/stats.json, or None on failure."""
        return self.get_json(url + "/master/stats.json", timeout)

    def get_master_state(self, url, timeout):
        """Return parsed /master/state.json, or None on failure."""
        return self.get_json(url + "/master/state.json", timeout)

    def get_json(self, url, timeout):
        """GET `url` and return the parsed JSON body, or None on failure."""
        # Use a hash of the URL as an aggregation key
        aggregation_key = md5(url).hexdigest()

        try:
            response = requests.get(url, timeout=timeout)
        except requests.exceptions.Timeout:
            # If there's a timeout, surface it as an event rather than crashing
            self.timeout_event(url, timeout, aggregation_key)
            return None

        # BUGFIX: this check was previously placed after an unconditional
        # return (and referenced an undefined name `r`), so it never ran.
        if response.status_code != 200:
            self.status_code_event(url, response, aggregation_key)
            return None

        return response.json()

    def timeout_event(self, url, timeout, aggregation_key):
        """Emit a Datadog event recording that `url` timed out."""
        self.event({
            'timestamp': int(time.time()),
            'event_type': 'http_check',
            'msg_title': 'URL timeout',
            'msg_text': '%s timed out after %s seconds.' % (url, timeout),
            'aggregation_key': aggregation_key
        })

    def status_code_event(self, url, r, aggregation_key):
        """Emit a Datadog event recording a non-200 response from `url`."""
        self.event({
            'timestamp': int(time.time()),
            'event_type': 'http_check',
            # BUGFIX: corrected "reponse" typo in the event title.
            'msg_title': 'Invalid response code for %s' % url,
            'msg_text': '%s returned a status of %s' % (url, r.status_code),
            'aggregation_key': aggregation_key
        })

if __name__ == '__main__':
check, instances = Mesos.from_yaml('/etc/dd-agent/conf.d/mesos.yaml')
for instance in instances:
print "\nRunning the check against url: %s" % (instance['url'])
check.check(instance)
if check.has_events():
print 'Events: %s' % (check.get_events())

i = 0
print 'Metrics:\n'
for metric in check.get_metrics():
print " %d: %s" % (i, metric)
i += 1
2 changes: 1 addition & 1 deletion checks.d/mysql.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
'Innodb_data_reads': ('mysql.innodb.data_reads', RATE),
'Innodb_data_writes': ('mysql.innodb.data_writes', RATE),
'Innodb_os_log_fsyncs': ('mysql.innodb.os_log_fsyncs', RATE),
'Innodb_data_reads': ('mysql.innodb.buffer_pool_size', RATE),
'Innodb_buffer_pool_size': ('mysql.innodb.buffer_pool_size', RATE),
'Slow_queries': ('mysql.performance.slow_queries', RATE),
'Questions': ('mysql.performance.questions', RATE),
'Queries': ('mysql.performance.queries', RATE),
Expand Down
Loading

0 comments on commit e2b7257

Please sign in to comment.