From f9a1dfcaea273126e8ae8fa4d10c67225aaf80b8 Mon Sep 17 00:00:00 2001 From: Haissam Kaj <haissam.kaj@datadoghq.com> Date: Thu, 23 Jul 2015 16:30:06 -0400 Subject: [PATCH 1/3] Add ECS tags for docker check, add docker-py as a req --- checks.d/docker.py | 41 ++++++++++++++++++++++++++++++++++++++ conf.d/docker.yaml.example | 5 +++++ requirements.txt | 3 +++ util.py | 2 +- utils/platform.py | 12 ++++++++++- 5 files changed, 61 insertions(+), 2 deletions(-) diff --git a/checks.d/docker.py b/checks.d/docker.py index 49b2748e53..89319a520f 100644 --- a/checks.d/docker.py +++ b/checks.d/docker.py @@ -3,6 +3,7 @@ import httplib import os import re +import requests import socket import time import urllib @@ -13,6 +14,7 @@ from checks import AgentCheck from config import _is_affirmative from util import json +from utils.platform import Platform EVENT_TYPE = SOURCE_TYPE_NAME = 'docker' @@ -175,8 +177,16 @@ def _get_and_count_containers(self, instance): running_containers_ids = set([container['Id'] for container in running_containers]) + # Dict of container ids and a list of their Amazon ECS task tags + if Platform.is_ecs_instance() and instance.get('ecs_tags', True): + ecs_tags = self._get_ecs_tags(instance, running_containers) + else: + ecs_tags = None + for container in all_containers: container_tags = list(tags) + if ecs_tags: + container_tags += ecs_tags.get(container['Id'], []) for key in DOCKER_TAGS: tag = self._make_tag(key, container[key], instance) if tag: @@ -229,11 +239,18 @@ def _report_containers_metrics(self, containers, instance): collect_uncommon_metrics = _is_affirmative(instance.get("collect_all_metrics", False)) tags = instance.get("tags", []) + if Platform.is_ecs_instance() and instance.get('ecs_tags', True): + ecs_tags = self._get_ecs_tags(instance, containers) + else: + ecs_tags = None + # Pre-compile regex to include/exclude containers use_filters = self._prepare_filters(instance) for container in containers: container_tags = list(tags) + if ecs_tags: + container_tags += ecs_tags.get(container['Id'], []) for name in container["Names"]: container_tags.append(self._make_tag("name", name.lstrip("/"), instance)) for key in DOCKER_TAGS: @@ -277,6 +294,26 @@ def _new_tags_conversion(self, tag): # Prefix tags to avoid conflict with AWS tags return NEW_TAGS_MAP.get(tag, tag) + def _get_ecs_tags(self, instance, containers): + ecs_id = None + for co in containers: + if '/ecs-agent' in co.get('Names', []): + ecs_id = co.get('Id') + if ecs_id is None: + return [] + ecs_config = self._inspect_container(instance, ecs_id) + net_conf = ecs_config['NetworkSettings'].get('Ports', {}) + net_conf = net_conf.get(net_conf.keys()[0], []) + container_tags = {} + if net_conf: + net_conf = net_conf[0] if isinstance(net_conf, list) else net_conf + ip, port = ecs_config.get('NetworkSettings', {})['IPAddress'], net_conf.get('HostPort') + tasks = requests.get('http://%s:%s/v1/tasks' % (ip, port)).json() + for task in tasks.get('Tasks', []): + for container in task.get('Containers', []): + tags = ['task_name:%s' % task['Family'], 'task_version:%s' % task['Version']] + container_tags[container['DockerId']] = tags + return container_tags # Events @@ -354,6 +391,10 @@ def _get_containers(self, instance, with_size=False, get_all=False): """Gets the list of running/all containers in Docker.""" return self._get_json("%(url)s/containers/json" % instance, params={'size': with_size, 'all': get_all}) + def _inspect_container(self, instance, container_id): + """Get the list of running/all containers in Docker.""" + 
return self._get_json("%s/containers/%s/json" % (instance['url'], container_id))
+
     def _get_images(self, instance, with_size=True, get_all=False):
         """Gets the list of images in Docker."""
         return self._get_json("%(url)s/images/json" % instance, params={'all': get_all})
diff --git a/conf.d/docker.yaml.example b/conf.d/docker.yaml.example
index d869225de8..85c6860cc4 100644
--- a/conf.d/docker.yaml.example
+++ b/conf.d/docker.yaml.example
@@ -27,6 +27,11 @@ instances:
     # Example:
     # tags: ["extra_tag", "env:example"]
 
+    # If the agent is running in an Amazon ECS task, container metrics are tagged with the ECS task name and version.
+    # Default: true
+    #
+    # ecs_tags: false
+
     # Exclude containers based on their tags
     # An excluded container will ne longer report performance metrics or events. However,
     # we still count the number of running and stopped of all containers.
diff --git a/requirements.txt b/requirements.txt
index 96440ef786..6fbec9d7e4 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -57,3 +57,6 @@ pyvmomi==5.5.0.2014.1.1
 
 # checks.d/hdfs.py
 snakebite==1.3.11
+
+# utils/platform.py
+docker-py==1.3.1
diff --git a/util.py b/util.py
index 4a2aae25b0..8ed4d3ba9a 100644
--- a/util.py
+++ b/util.py
@@ -208,7 +208,7 @@ def _get_hostname_unix():
             hostname = unix_hostname
 
     # if we have an ec2 default hostname, see if there's an instance-id available
-    if hostname is not None and True in [hostname.lower().startswith(p) for p in [u'ip-', u'domu']]:
+    if (Platform.is_ecs_instance()) or (hostname is not None and True in [hostname.lower().startswith(p) for p in [u'ip-', u'domu']]):
         instanceid = EC2.get_instance_id(config)
         if instanceid:
             hostname = instanceid
diff --git a/utils/platform.py b/utils/platform.py
index d12921cf5b..5809e6005c 100644
--- a/utils/platform.py
+++ b/utils/platform.py
@@ -1,5 +1,5 @@
 import sys
-
+import docker
 
 class Platform(object):
     """
@@ -53,3 +53,13 @@ def is_win32(name=None):
     @staticmethod
     def is_windows(name=None):
         return Platform.is_win32(name)
+
+    @staticmethod
+    def is_ecs_instance():
+        """Return True if the agent is running in an ECS instance, False otherwise."""
+        cl = docker.Client(version='auto')
+        containers = cl.containers()
+        for co in containers:
+            if '/ecs-agent' in co.get('Names', ''):
+                return True
+        return False
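For context on the ECS wiring this patch introduces: `is_ecs_instance()` looks for a running ecs-agent container, and `_get_ecs_tags()` then queries the agent's local introspection API to map each container to its ECS task. A minimal standalone sketch of that second step — the `/v1/tasks` path and the response shape are taken from the patch itself; the hard-coded address is an illustrative assumption:

```python
import requests

def get_ecs_task_tags(introspection_url):
    """Map container Docker IDs to ECS task tags via the agent's /v1/tasks API."""
    tags_by_container = {}
    tasks = requests.get(introspection_url, timeout=5).json()
    for task in tasks.get('Tasks', []):
        task_tags = ['task_name:%s' % task['Family'],
                     'task_version:%s' % task['Version']]
        for container in task.get('Containers', []):
            tags_by_container[container['DockerId']] = task_tags
    return tags_by_container

# 51678 is the ecs-agent's usual introspection port (an assumption, not taken
# from this patch; the check discovers the actual HostPort via docker inspect).
print(get_ecs_task_tags('http://localhost:51678/v1/tasks'))
```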
From 96c0305c13fdc8dfa004e551466bb1638873f71e Mon Sep 17 00:00:00 2001
From: Remi Hakim <remi@datadoghq.com>
Date: Wed, 26 Aug 2015 14:27:23 -0400
Subject: [PATCH 2/3] [docker] New docker check

This is a new version of the docker check; it's called docker_daemon (the
name can be changed). The previous check, "docker", is now deprecated and
won't receive further support.

In terms of features, this adds:
- Support for TLS connections to the daemon
- New metrics:
    - Network metrics
    - Memory limits
    - Container size (rootfs)
    - Image size
- Support for labels (converted into tags). Off by default, uses a list of
  labels that should be converted.
- Support for ECS tags: task name and task version

Backward-incompatible changes:
- The docker.disk.size metric is renamed to docker.container.size_rw
- The old optional metrics
  (https://github.com/DataDog/dd-agent/blob/5.4.x/checks.d/docker.py#L29-L38)
  are not collected anymore
- Old tags are not supported anymore (e.g. `name` instead of `container_name`)

Fixes #1299 #1648 #1739 #1742 #1896
---
 Rakefile                                     |   1 +
 checks.d/docker.py                           |  63 +-
 checks.d/docker_daemon.py                    | 648 ++++++++++++++++++
 ci/common.rb                                 |   2 +-
 ci/docker_daemon.rb                          |  42 ++
 circle.yml                                   |  20 -
 conf.d/docker.yaml.example                   |   7 +-
 conf.d/docker_daemon.yaml.example            | 115 ++++
 tests/checks/common.py                       |   4 +-
 .../checks/integration/test_docker_daemon.py | 469 +++++++++++++
 tests/checks/mock/test_docker.py             |  77 ---
 utils/dockerutil.py                          |  94 +++
 utils/platform.py                            |  27 +-
 13 files changed, 1404 insertions(+), 165 deletions(-)
 create mode 100644 checks.d/docker_daemon.py
 create mode 100644 ci/docker_daemon.rb
 delete mode 100644 circle.yml
 create mode 100644 conf.d/docker_daemon.yaml.example
 create mode 100644 tests/checks/integration/test_docker_daemon.py
 delete mode 100644 tests/checks/mock/test_docker.py
 create mode 100644 utils/dockerutil.py

diff --git a/Rakefile b/Rakefile
index 920ab835fd..30b3ee8004 100755
--- a/Rakefile
+++ b/Rakefile
@@ -39,6 +39,7 @@ require './ci/tomcat'
 require './ci/varnish'
 require './ci/windows'
 require './ci/zookeeper'
+require './ci/docker_daemon'
 
 CLOBBER.include '**/*.pyc'
diff --git a/checks.d/docker.py b/checks.d/docker.py
index 89319a520f..1386f56536 100644
--- a/checks.d/docker.py
+++ b/checks.d/docker.py
@@ -1,20 +1,18 @@
 # stdlib
-from collections import defaultdict
+import urllib2
+import urllib
 import httplib
+import socket
 import os
 import re
-import requests
-import socket
 import time
-import urllib
-import urllib2
 from urlparse import urlsplit
+from util import json
+from collections import defaultdict
 
 # project
 from checks import AgentCheck
 from config import _is_affirmative
-from util import json
-from utils.platform import Platform
 
 EVENT_TYPE = SOURCE_TYPE_NAME = 'docker'
 
@@ -133,6 +131,8 @@ def __init__(self, name, init_config, agentConfig, instances=None):
 
     def check(self, instance):
         # Report image metrics
+        self.warning('The "docker" check is deprecated and will be removed'
+                     ' in a future version of the agent. 
Please use the "docker_daemon" one instead') if _is_affirmative(instance.get('collect_images_stats', True)): self._count_images(instance) @@ -177,16 +177,8 @@ def _get_and_count_containers(self, instance): running_containers_ids = set([container['Id'] for container in running_containers]) - # Dict of container ids and a list of their Amazon ECS task tags - if Platform.is_ecs_instance() and instance.get('ecs_tags', True): - ecs_tags = self._get_ecs_tags(instance, running_containers) - else: - ecs_tags = None - for container in all_containers: container_tags = list(tags) - if ecs_tags: - container_tags += ecs_tags.get(container['Id'], []) for key in DOCKER_TAGS: tag = self._make_tag(key, container[key], instance) if tag: @@ -239,24 +231,14 @@ def _report_containers_metrics(self, containers, instance): collect_uncommon_metrics = _is_affirmative(instance.get("collect_all_metrics", False)) tags = instance.get("tags", []) - if Platform.is_ecs_instance() and instance.get('ecs_tags', True): - ecs_tags = self._get_ecs_tags(instance, containers) - else: - ecs_tags = None - # Pre-compile regex to include/exclude containers use_filters = self._prepare_filters(instance) for container in containers: container_tags = list(tags) - if ecs_tags: - container_tags += ecs_tags.get(container['Id'], []) for name in container["Names"]: container_tags.append(self._make_tag("name", name.lstrip("/"), instance)) for key in DOCKER_TAGS: - if key == 'Image' and ':' in container[key]: - tag = self._make_tag('image_repository', container[key].split(':')[0], instance) - container_tags.append(tag) tag = self._make_tag(key, container[key], instance) if tag: container_tags.append(tag) @@ -294,26 +276,6 @@ def _new_tags_conversion(self, tag): # Prefix tags to avoid conflict with AWS tags return NEW_TAGS_MAP.get(tag, tag) - def _get_ecs_tags(self, instance, containers): - ecs_id = None - for co in containers: - if '/ecs-agent' in co.get('Names', []): - ecs_id = co.get('Id') - if ecs_id is None: - return [] - ecs_config = self._inspect_container(instance, ecs_id) - net_conf = ecs_config['NetworkSettings'].get('Ports', {}) - net_conf = net_conf.get(net_conf.keys()[0], []) - container_tags = {} - if net_conf: - net_conf = net_conf[0] if isinstance(net_conf, list) else net_conf - ip, port = ecs_config.get('NetworkSettings', {})['IPAddress'], net_conf.get('HostPort') - tasks = requests.get('http://%s:%s/v1/tasks' % (ip, port)).json() - for task in tasks.get('Tasks', []): - for container in task.get('Containers', []): - tags = ['task_name:%s' % task['Family'], 'task_version:%s' % task['Version']] - container_tags[container['DockerId']] = tags - return container_tags # Events @@ -391,10 +353,6 @@ def _get_containers(self, instance, with_size=False, get_all=False): """Gets the list of running/all containers in Docker.""" return self._get_json("%(url)s/containers/json" % instance, params={'size': with_size, 'all': get_all}) - def _inspect_container(self, instance, container_id): - """Get the list of running/all containers in Docker.""" - return self._get_json("%s/containers/%s/json" % (instance['url'], container_id)) - def _get_images(self, instance, with_size=True, get_all=False): """Gets the list of images in Docker.""" return self._get_json("%(url)s/images/json" % instance, params={'all': get_all}) @@ -449,14 +407,13 @@ def _get_json(self, uri, params=None, multi=False): # Cgroups - def _find_cgroup_filename_pattern(self, container_id): + def _find_cgroup_filename_pattern(self): if self._mountpoints: # We try with different cgroups 
so that it works even if only one is properly working for mountpoint in self._mountpoints.values(): stat_file_path_lxc = os.path.join(mountpoint, "lxc") stat_file_path_docker = os.path.join(mountpoint, "docker") stat_file_path_coreos = os.path.join(mountpoint, "system.slice") - stat_file_path_kubernetes = os.path.join(mountpoint, container_id) if os.path.exists(stat_file_path_lxc): return os.path.join('%(mountpoint)s/lxc/%(id)s/%(file)s') @@ -464,15 +421,13 @@ def _find_cgroup_filename_pattern(self, container_id): return os.path.join('%(mountpoint)s/docker/%(id)s/%(file)s') elif os.path.exists(stat_file_path_coreos): return os.path.join('%(mountpoint)s/system.slice/docker-%(id)s.scope/%(file)s') - elif os.path.exists(stat_file_path_kubernetes): - return os.path.join('%(mountpoint)s/%(id)s/%(file)s') raise Exception("Cannot find Docker cgroup directory. Be sure your system is supported.") def _get_cgroup_file(self, cgroup, container_id, filename): # This can't be initialized at startup because cgroups may not be mounted yet if not self._cgroup_filename_pattern: - self._cgroup_filename_pattern = self._find_cgroup_filename_pattern(container_id) + self._cgroup_filename_pattern = self._find_cgroup_filename_pattern() return self._cgroup_filename_pattern % (dict( mountpoint=self._mountpoints[cgroup], diff --git a/checks.d/docker_daemon.py b/checks.d/docker_daemon.py new file mode 100644 index 0000000000..42536446d6 --- /dev/null +++ b/checks.d/docker_daemon.py @@ -0,0 +1,648 @@ +# stdlib +import os +import re +import requests +import time +import socket +import urllib2 +from collections import defaultdict, Counter, deque + +# project +from checks import AgentCheck +from config import _is_affirmative +from utils.dockerutil import find_cgroup, find_cgroup_filename_pattern, get_client, MountException, set_docker_settings +from utils.platform import Platform + + +EVENT_TYPE = 'docker' +SERVICE_CHECK_NAME = 'docker.service_up' +SIZE_REFRESH_RATE = 5 # Collect container sizes every 5 iterations of the check +MAX_CGROUP_LISTING_RETRIES = 3 + +GAUGE = AgentCheck.gauge +RATE = AgentCheck.rate + +CGROUP_METRICS = [ + { + "cgroup": "memory", + "file": "memory.stat", + "metrics": { + "cache": ("docker.mem.cache", GAUGE), + "rss": ("docker.mem.rss", GAUGE), + "swap": ("docker.mem.swap", GAUGE), + }, + "to_compute": { + # We only get these metrics if they are properly set, i.e. 
they are a "reasonable" value + "docker.mem.limit": (["hierarchical_memory_limit"], lambda x: float(x) if float(x) < 2 ** 60 else None, GAUGE), + "docker.mem.sw_limit": (["hierarchical_memsw_limit"], lambda x: float(x) if float(x) < 2 ** 60 else None, GAUGE), + "docker.mem.in_use": (["rss", "hierarchical_memory_limit"], lambda x,y: float(x)/float(y) if float(y) < 2 ** 60 else None, GAUGE), + "docker.mem.sw_in_use": (["swap", "rss", "hierarchical_memsw_limit"], lambda x,y,z: float(x + y)/float(z) if float(z) < 2 ** 60 else None, GAUGE) + + } + }, + { + "cgroup": "cpuacct", + "file": "cpuacct.stat", + "metrics": { + "user": ("docker.cpu.user", RATE), + "system": ("docker.cpu.system", RATE), + }, + }, + { + "cgroup": "blkio", + "file": 'blkio.throttle.io_service_bytes', + "metrics": { + "io_read": ("docker.io.read_bytes", RATE), + "io_write": ("docker.io.write_bytes", RATE), + }, + }, +] + +DEFAULT_CONTAINER_TAGS = [ + "docker_image", + "image_name", + "image_tag", +] + +DEFAULT_PERFORMANCE_TAGS = [ + "container_name", + "docker_image", + "image_name", + "image_tag", +] + +DEFAULT_IMAGE_TAGS = [ + 'image_name', + 'image_tag' +] + + +def image_tag_extractor(c, key): + if "Image" in c: + split = c["Image"].split(":", 1) + if len(split) <= key: + return None + else: + return [split[key]] + if "RepoTags" in c: + splits = [el.split(":", 1) for el in c["RepoTags"]] + tags = [] + for split in splits: + if len(split) > key: + tags.append(split[key]) + if len(tags) > 0: + return list(set(tags)) + return None + + +TAG_EXTRACTORS = { + "docker_image": lambda c: [c["Image"]], + "image_name": lambda c: image_tag_extractor(c, 0), + "image_tag": lambda c: image_tag_extractor(c, 1), + "container_command": lambda c: [c["Command"]], + "container_name": lambda c: [str(c['Names'][0]).lstrip("/")] if c.get("Names") else [c['Id'][:11]], +} + +CONTAINER = "container" +PERFORMANCE = "performance" +FILTERED = "filtered" +IMAGE = "image" + + +def get_mountpoints(docker_root): + mountpoints = {} + for metric in CGROUP_METRICS: + mountpoints[metric["cgroup"]] = find_cgroup(metric["cgroup"], docker_root) + return mountpoints + +def get_filters(include, exclude): + # The reasoning is to check exclude first, so we can skip if there is no exclude + if not exclude: + return + + filtered_tag_names = [] + exclude_patterns = [] + include_patterns = [] + + # Compile regex + for rule in exclude: + exclude_patterns.append(re.compile(rule)) + filtered_tag_names.append(rule.split(':')[0]) + for rule in include: + include_patterns.append(re.compile(rule)) + filtered_tag_names.append(rule.split(':')[0]) + + return set(exclude_patterns), set(include_patterns), set(filtered_tag_names) + + +class DockerDaemon(AgentCheck): + """Collect metrics and events from Docker API and cgroups.""" + + def __init__(self, name, init_config, agentConfig, instances=None): + if instances is not None and len(instances) > 1: + raise Exception("Docker check only supports one configured instance.") + AgentCheck.__init__(self, name, init_config, + agentConfig, instances=instances) + + self.init_success = False + self.init() + + + def init(self): + try: + # We configure the check with the right cgroup settings for this host + # Just needs to be done once + instance = self.instances[0] + set_docker_settings(self.init_config, instance) + + self.client = get_client() + self._docker_root = self.init_config.get('docker_root', '/') + self._mountpoints = get_mountpoints(self._docker_root) + self.cgroup_listing_retries = 0 + self._latest_size_query = 0 + 
self._filtered_containers = set() + + # At first run we'll just collect the events from the latest 60 secs + self._last_event_collection_ts = int(time.time()) - 60 + + # Set tagging options + self.custom_tags = instance.get("tags", []) + self.collect_labels_as_tags = instance.get("collect_labels_as_tags", []) + self.tag_names = { + CONTAINER: instance.get("container_tags", DEFAULT_CONTAINER_TAGS), + PERFORMANCE: instance.get("performance_tags", DEFAULT_PERFORMANCE_TAGS), + IMAGE: instance.get('image_tags', DEFAULT_IMAGE_TAGS) + + } + + # Set filtering settings + if not instance.get("exclude"): + self._filtering_enabled = False + if instance.get("include"): + self.log.warning("You must specify an exclude section to enable filtering") + else: + self._filtering_enabled = True + include = instance.get("include", []) + exclude = instance.get("exclude", []) + self._exclude_patterns, self._include_patterns, _filtered_tag_names = get_filters(include, exclude) + self.tag_names[FILTERED] = _filtered_tag_names + + + # Other options + self.collect_image_stats = _is_affirmative(instance.get('collect_images_stats', True)) + self.collect_container_size = _is_affirmative(instance.get('collect_container_size', False)) + self.collect_events = _is_affirmative(instance.get('collect_events', True)) + self.collect_image_size = _is_affirmative(instance.get('collect_image_size', False)) + self.collect_ecs_tags = _is_affirmative(instance.get('ecs_tags', True)) and Platform.is_ecs_instance() + except Exception, e: + self.log.critical(e) + self.warning("Initialization failed. Will retry at next iteration") + else: + self.init_success = True + + def check(self, instance): + """Run the Docker check for one instance.""" + + if not self.init_success: + # Initialization can fail if cgroups are not ready. So we retry if needed + # https://github.com/DataDog/dd-agent/issues/1896 + self.init() + + # Report image metrics + if self.collect_image_stats: + self._count_and_weigh_images() + + if self.collect_ecs_tags: + self.refresh_ecs_tags() + + # Get the list of containers and the index of their names + containers_by_id = self._get_and_count_containers() + containers_by_id = self._crawl_container_pids(containers_by_id) + + # Report performance container metrics (cpu, mem, net, io) + self._report_performance_metrics(containers_by_id) + + if self.collect_container_size: + self._report_container_size(containers_by_id) + + # Send events from Docker API + if self.collect_events: + self._process_events(containers_by_id) + + def _count_and_weigh_images(self): + try: + tags = self._get_tags() + active_images = self.client.images(all=False) + active_images_len = len(active_images) + all_images_len = len(self.client.images(quiet=True, all=True)) + self.gauge("docker.images.available", active_images_len, tags=tags) + self.gauge("docker.images.intermediate", (all_images_len - active_images_len), tags=tags) + + if self.collect_image_size: + self._report_image_size(active_images) + + except Exception, e: + # It's not an important metric, keep going if it fails + self.warning("Failed to count Docker images. 
Exception: {0}".format(e)) + + def _get_and_count_containers(self): + """List all the containers from the API, filter and count them.""" + + # Querying the size of containers is slow, we don't do it at each run + must_query_size = self.collect_container_size and self._latest_size_query == 0 + self._latest_size_query = (self._latest_size_query + 1) % SIZE_REFRESH_RATE + + running_containers_count = Counter() + all_containers_count = Counter() + + try: + containers = self.client.containers(all=True, size=must_query_size) + except Exception, e: + message = "Unable to list Docker containers: {0}".format(e) + self.service_check(SERVICE_CHECK_NAME, AgentCheck.CRITICAL, + message=message) + raise Exception(message) + + else: + self.service_check(SERVICE_CHECK_NAME, AgentCheck.OK) + + # Filter containers according to the exclude/include rules + self._filter_containers(containers) + + containers_by_id = {} + + for container in containers: + container_name = str(container['Names'][0]).strip('/') + + container_status_tags = self._get_tags(container, CONTAINER) + + all_containers_count[tuple(sorted(container_status_tags))] += 1 + if self._is_container_running(container): + running_containers_count[tuple(sorted(container_status_tags))] += 1 + + # Check if the container is included/excluded via its tags + if self._is_container_excluded(container): + self.log.debug("Container {0} is excluded".format(container_name)) + continue + + containers_by_id[container['Id']] = container + + for tags, count in running_containers_count.iteritems(): + self.gauge("docker.containers.running", count, tags=list(tags)) + + for tags, count in all_containers_count.iteritems(): + stopped_count = count - running_containers_count[tags] + self.gauge("docker.containers.stopped", stopped_count, tags=list(tags)) + + return containers_by_id + + def _is_container_running(self, container): + """Tell if a container is running, according to its status. + + There is no "nice" API field to figure it out. We just look at the "Status" field, knowing how it is generated. + See: https://github.com/docker/docker/blob/v1.6.2/daemon/state.go#L35 + """ + return container["Status"].startswith("Up") or container["Status"].startswith("Restarting") + + def _get_tags(self, entity=None, tag_type=None): + """Generate the tags for a given entity (container or image) according to a list of tag names.""" + # Start with custom tags + tags = list(self.custom_tags) + if entity is not None: + + # Get labels as tags + labels = entity.get("Labels") + if labels is not None: + for k in self.collect_labels_as_tags: + if k in labels: + v = labels[k] + if not v: + tags.append(k) + else: + tags.append("%s:%s" % (k,v)) + + # Get entity specific tags + if tag_type is not None: + tag_names = self.tag_names[tag_type] + for tag_name in tag_names: + tag_value = self._extract_tag_value(entity, tag_name) + if tag_value is not None: + for t in tag_value: + tags.append('%s:%s' % (tag_name, str(t).strip())) + + # Add ECS tags + if self.collect_ecs_tags: + entity_id = entity.get("Id") + if entity_id in self.ecs_tags: + ecs_tags = self.ecs_tags[entity_id] + tags.extend(ecs_tags) + + return tags + + def _extract_tag_value(self, entity, tag_name): + """Extra tag information from the API result (containers or images). + Cache extracted tags inside the entity object. 
+ """ + if tag_name not in TAG_EXTRACTORS: + self.warning("{0} isn't a supported tag".format(tag_name)) + return + + # Check for already extracted tags + if "_tag_values" not in entity: + entity["_tag_values"] = {} + + if tag_name not in entity["_tag_values"]: + entity["_tag_values"][tag_name] = TAG_EXTRACTORS[tag_name](entity) + + return entity["_tag_values"][tag_name] + + def refresh_ecs_tags(self): + ecs_config = self.client.inspect_container('ecs-agent') + net_conf = ecs_config['NetworkSettings'].get('Ports', {}) + net_conf = net_conf.get(net_conf.keys()[0], []) + ecs_tags = {} + if net_conf: + net_conf = net_conf[0] if isinstance(net_conf, list) else net_conf + ip, port = net_conf.get('HostIp'), net_conf.get('HostPort') + tasks = requests.get('http://%s:%s' % (ip, port)).json() + for task in tasks.get('Tasks', []): + for container in task.get('Containers', []): + tags = ['task_name:%s' % task['Family'], 'task_version:%s' % task['Version']] + ecs_tags[container['DockerId']] = tags + + self.ecs_tags = ecs_tags + + def _filter_containers(self, containers): + if not self._filtering_enabled: + return + + self._filtered_containers = set() + for container in containers: + container_tags = self._get_tags(container, FILTERED) + if self._are_tags_filtered(container_tags): + container_name = TAG_EXTRACTORS["container_name"](container)[0] + self._filtered_containers.add(container_name) + self.log.debug("Container {0} is filtered".format(container["Names"][0])) + + + def _are_tags_filtered(self, tags): + if self._tags_match_patterns(tags, self._exclude_patterns): + if self._tags_match_patterns(tags, self._include_patterns): + return False + return True + return False + + def _tags_match_patterns(self, tags, filters): + for rule in filters: + for tag in tags: + if re.match(rule, tag): + return True + return False + + def _is_container_excluded(self, container): + """Check if a container is excluded according to the filter rules. + + Requires _filter_containers to run first. 
+ """ + container_name = TAG_EXTRACTORS["container_name"](container)[0] + return container_name in self._filtered_containers + + def _report_container_size(self, containers_by_id): + container_list_with_size = None + for container in containers_by_id.itervalues(): + if self._is_container_excluded(container): + continue + + tags = self._get_tags(container, PERFORMANCE) + + if "SizeRw" in container: + self.gauge('docker.container.size_rw', container['SizeRw'], + tags=tags) + if "SizeRootFs" in container: + self.gauge( + 'docker.container.size_rootfs', container['SizeRootFs'], + tags=tags) + + def _report_image_size(self, images): + for image in images: + tags = self._get_tags(image, IMAGE) + if 'VirtualSize' in image: + self.gauge('docker.image.virtual_size', image['VirtualSize'], tags=tags) + if 'Size' in image: + self.gauge('docker.image.size', image['Size'], tags=tags) + + # Performance metrics + + def _report_performance_metrics(self, containers_by_id): + for container in containers_by_id.itervalues(): + if self._is_container_excluded(container) or not self._is_container_running(container): + continue + + tags = self._get_tags(container, PERFORMANCE) + self._report_cgroup_metrics(container, tags) + self._report_net_metrics(container, tags) + + def _report_cgroup_metrics(self, container, tags): + try: + for cgroup in CGROUP_METRICS: + stat_file = self._get_cgroup_file(cgroup["cgroup"], container['Id'], cgroup['file']) + stats = self._parse_cgroup_file(stat_file) + if stats: + for key, (dd_key, metric_func) in cgroup['metrics'].iteritems(): + if key in stats: + metric_func(self, dd_key, int(stats[key]), tags=tags) + + # Computed metrics + for mname, (key_list, fct, metric_func) in cgroup.get('to_compute', {}).iteritems(): + values = [stats[key] for key in key_list if key in stats] + if len(values) != len(key_list): + self.log.debug("Couldn't compute {0}, some keys were missing.".format(mname)) + continue + value = fct(*values) + if value is not None: + metric_func(self, mname, value, tags=tags) + + + + except MountException as ex: + if self.cgroup_listing_retries > MAX_CGROUP_LISTING_RETRIES: + raise ex + else: + self.warning("Couldn't find the cgroup files. Skipping the CGROUP_METRICS for now." + "Will retry {0} times before failing.".format(MAX_CGROUP_LISTING_RETRIES - self.cgroup_listing_retries)) + self.cgroup_listing_retries += 1 + else: + self.cgroup_listing_retries = 0 + + def _report_net_metrics(self, container, tags): + """Find container network metrics by looking at /proc/$PID/net/dev of the container process.""" + proc_net_file = os.path.join(container['_proc_root'], 'net/dev') + try: + with open(proc_net_file, 'r') as fp: + lines = fp.readlines() + """Two first lines are headers: + Inter-| Receive | Transmit + face |bytes packets errs drop fifo frame compressed multicast|bytes packets errs drop fifo colls carrier compressed + """ + for l in lines[2:]: + cols = l.split(':', 1) + interface_name = str(cols[0]).strip() + if interface_name == 'eth0': + x = cols[1].split() + self.rate("docker.net.bytes_rcvd", long(x[0]), tags) + self.rate("docker.net.bytes_sent", long(x[8]), tags) + break + except Exception, e: + # It is possible that the container got stopped between the API call and now + self.warning("Failed to report IO metrics from file {0}. 
Exception: {1}".format(proc_net_file, e)) + + def _process_events(self, containers_by_id): + try: + api_events = self._get_events() + aggregated_events = self._pre_aggregate_events(api_events, containers_by_id) + events = self._format_events(aggregated_events, containers_by_id) + except (socket.timeout, urllib2.URLError): + self.warning('Timeout when collecting events. Events will be missing.') + return + except Exception, e: + self.warning("Unexpected exception when collecting events: {0}. " + "Events will be missing".format(e)) + return + + for ev in events: + self.log.debug("Creating event: %s" % ev['msg_title']) + self.event(ev) + + def _get_events(self): + """Get the list of events.""" + now = int(time.time()) + events = [] + event_generator = self.client.events(since=self._last_event_collection_ts, + until=now, decode=True) + for event in event_generator: + if event != '': + events.append(event) + self._last_event_collection_ts = now + return events + + def _pre_aggregate_events(self, api_events, containers_by_id): + # Aggregate events, one per image. Put newer events first. + events = defaultdict(deque) + for event in api_events: + # Skip events related to filtered containers + container = containers_by_id.get(event['id']) + if container is not None and self._is_container_excluded(container): + self.log.debug("Excluded event: container {0} status changed to {1}".format( + event['id'], event['status'])) + continue + # Known bug: from may be missing + if 'from' in event: + events[event['from']].appendleft(event) + return events + + def _format_events(self, aggregated_events, containers_by_id): + events = [] + for image_name, event_group in aggregated_events.iteritems(): + max_timestamp = 0 + status = defaultdict(int) + status_change = [] + for event in event_group: + max_timestamp = max(max_timestamp, int(event['time'])) + status[event['status']] += 1 + container_name = event['id'][:11] + if event['id'] in containers_by_id: + container_name = str(containers_by_id[event['id']]['Names'][0]).strip('/') + + status_change.append([container_name, event['status']]) + + status_text = ", ".join(["%d %s" % (count, st) for st, count in status.iteritems()]) + msg_title = "%s %s on %s" % (image_name, status_text, self.hostname) + msg_body = ( + "%%%\n" + "{image_name} {status} on {hostname}\n" + "```\n{status_changes}\n```\n" + "%%%" + ).format( + image_name=image_name, + status=status_text, + hostname=self.hostname, + status_changes="\n".join( + ["%s \t%s" % (change[1].upper(), change[0]) for change in status_change]) + ) + events.append({ + 'timestamp': max_timestamp, + 'host': self.hostname, + 'event_type': EVENT_TYPE, + 'msg_title': msg_title, + 'msg_text': msg_body, + 'source_type_name': EVENT_TYPE, + 'event_object': 'docker:%s' % image_name, + }) + + return events + + # Cgroups + + def _get_cgroup_file(self, cgroup, container_id, filename): + """Find a specific cgroup file, containing metrics to extract.""" + params = { + "mountpoint": self._mountpoints[cgroup], + "id": container_id, + "file": filename, + } + + return find_cgroup_filename_pattern(self._mountpoints, container_id) % (params) + + def _parse_cgroup_file(self, stat_file): + """Parse a cgroup pseudo file for key/values.""" + self.log.debug("Opening cgroup file: %s" % stat_file) + try: + with open(stat_file, 'r') as fp: + if 'blkio' in stat_file: + return self._parse_blkio_metrics(fp.read().splitlines()) + else: + return dict(map(lambda x: x.split(' ', 1), fp.read().splitlines())) + except IOError: + # It is possible that the 
container got stopped between the API call and now + self.log.info("Can't open %s. Metrics for this container are skipped." % stat_file) + + def _parse_blkio_metrics(self, stats): + """Parse the blkio metrics.""" + metrics = { + 'io_read': 0, + 'io_write': 0, + } + for line in stats: + if 'Read' in line: + metrics['io_read'] += int(line.split()[2]) + if 'Write' in line: + metrics['io_write'] += int(line.split()[2]) + return metrics + + # proc files + def _crawl_container_pids(self, container_dict): + """Crawl `/proc` to find container PIDs and add them to `containers_by_id`.""" + proc_dir = os.path.join(self._docker_root, 'proc') + for folder in os.listdir(proc_dir): + try: + int(folder) + except ValueError: + continue + try: + path = os.path.join(proc_dir, folder, 'cgroup') + with open(path, 'r') as f: + content = [line.strip().split(':') for line in f.readlines()] + except Exception, e: + self.warning("Cannot read %s : %s" % (path, str(e))) + continue + + try: + content = dict((line[1], line[2]) for line in content) + if 'docker/' in content.get('cpuacct'): + container_id = content['cpuacct'].split('docker/')[1] + container_dict[container_id]['_pid'] = folder + container_dict[container_id]['_proc_root'] = os.path.join(proc_dir, folder) + except Exception, e: + self.warning("Cannot parse %s content: %s" % (path, str(e))) + continue + return container_dict diff --git a/ci/common.rb b/ci/common.rb index fd96740308..937ea40e8c 100644 --- a/ci/common.rb +++ b/ci/common.rb @@ -199,7 +199,7 @@ def self.for(smth, max_timeout = DEFAULT_TIMEOUT) # separate dir we symlink stuff in the rootdir path = %(PATH="#{ENV['INTEGRATIONS_DIR']}/bin:#{ENV['PATH']}" ) end - sh %(#{path}nosetests -v -A "#{nose}" #{tests_directory}) + sh %(#{path}nosetests -s -v -A "#{nose}" #{tests_directory}) t.reenable end task execute: [:before_install, :install, :before_script, :script] diff --git a/ci/docker_daemon.rb b/ci/docker_daemon.rb new file mode 100644 index 0000000000..6a23d88133 --- /dev/null +++ b/ci/docker_daemon.rb @@ -0,0 +1,42 @@ +require './ci/common' + +namespace :ci do + namespace :docker_daemon do |flavor| + task before_install: ['ci:common:before_install'] + + task before_script: ['ci:common:before_script'] + + task script: ['ci:common:script'] do + this_provides = [ + 'docker_daemon' + ] + Rake::Task['ci:common:run_tests'].invoke(this_provides) + end + + task before_cache: ['ci:common:before_cache'] + + task cache: ['ci:common:cache'] + + task cleanup: ['ci:common:cleanup'] + + task :execute do + exception = nil + begin + %w(before_install before_script + script before_cache cache).each do |t| + Rake::Task["#{flavor.scope.path}:#{t}"].invoke + end + rescue => e + exception = e + puts "Failed task: #{e.class} #{e.message}".red + end + if ENV['SKIP_CLEANUP'] + puts 'Skipping cleanup, disposable environments are great'.yellow + else + puts 'Cleaning up' + Rake::Task["#{flavor.scope.path}:cleanup"].invoke + end + fail exception if exception + end + end +end diff --git a/circle.yml b/circle.yml deleted file mode 100644 index 505259584b..0000000000 --- a/circle.yml +++ /dev/null @@ -1,20 +0,0 @@ -machine: - services: - - docker - -general: - branches: - only: - - remh/dockerv2 - -dependencies: - override: - - pip install -r requirements.txt - - pip install -r requirements-opt.txt - - pip install -r requirements-test.txt - - bundle install - - -test: - override: - - "rake ci:run[docker_daemon]" diff --git a/conf.d/docker.yaml.example b/conf.d/docker.yaml.example index 85c6860cc4..c711ca1e17 100644 --- 
a/conf.d/docker.yaml.example
+++ b/conf.d/docker.yaml.example
@@ -27,11 +27,6 @@ instances:
     # Example:
     # tags: ["extra_tag", "env:example"]
 
-    # If the agent is running in an Amazon ECS task, container metrics are tagged with the ECS task name and version.
-    # Default: true
-    #
-    # ecs_tags: false
-
     # Exclude containers based on their tags
     # An excluded container will ne longer report performance metrics or events. However,
     # we still count the number of running and stopped of all containers.
@@ -72,6 +67,6 @@ instances:
 
     # Collect images stats
-    # Number of available active images and intermediate images as gauges. Default: true.
+    # Number of available active images and intermediate images as gauges. Default: false.
     # collect_images_stats: false
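The new example file below documents `collect_labels_as_tags`, which the new check implements in `_get_tags()`: each selected container label becomes a `key:value` tag, or a bare `key` tag when the label value is empty. A minimal sketch of that mapping, with a made-up container dict standing in for the Docker API result:

```python
def labels_to_tags(container, collect_labels_as_tags):
    """Convert selected Docker labels into 'key:value' tags (bare key if empty)."""
    tags = []
    labels = container.get("Labels") or {}
    for k in collect_labels_as_tags:
        if k in labels:
            v = labels[k]
            tags.append("%s:%s" % (k, v) if v else k)
    return tags

# Sample container as returned by the Docker API (illustrative values):
container = {"Labels": {"com.docker.compose.service": "web", "debug": ""}}
print(labels_to_tags(container, ["com.docker.compose.service", "debug"]))
# ['com.docker.compose.service:web', 'debug']
```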
diff --git a/conf.d/docker_daemon.yaml.example b/conf.d/docker_daemon.yaml.example
new file mode 100644
index 0000000000..9a7ae261c1
--- /dev/null
+++ b/conf.d/docker_daemon.yaml.example
@@ -0,0 +1,115 @@
+init_config:
+  # Change the root directory to look at to get cgroup statistics. Useful when running inside a
+  # container with host directories mounted on a different folder. Default: /.
+  # Example for the docker-dd-agent container:
+  # docker_root: /host
+
+  # Timeout in seconds for the connection to the docker daemon
+  # Default: 5 seconds
+  #
+  # timeout: 10
+
+  # The version of the API the client will use. Specify 'auto' to use the API version provided by the server.
+  # api_version: auto
+
+  # Use TLS encryption while communicating with the Docker API
+  #
+  # tls: False
+  # tls_client_cert: /path/to/client-cert.pem
+  # tls_client_key: /path/to/client-key.pem
+  # tls_cacert: /path/to/ca.pem
+  # tls_verify: True
+
+instances:
+  - ## Daemon and system configuration
+    ##
+
+    # URL of the Docker daemon socket to reach the Docker API. HTTP/HTTPS also works.
+    # Warning: if that's a non-local daemon, we won't be able to collect performance metrics.
+    #
+    url: "unix://var/run/docker.sock"
+
+    ## Data collection
+    ##
+
+    # Create events whenever a container status changes.
+    # Defaults to true.
+    #
+    # collect_events: false
+
+    # Collect disk usage per container with docker.container.size_rw and
+    # docker.container.size_rootfs metrics.
+    # Warning: This might take time for the Docker daemon to generate;
+    # ensure that `docker ps -a -q` runs fast before enabling it.
+    # Defaults to false.
+    #
+    # collect_container_size: false
+
+    # Collect images stats
+    # Number of available active images and intermediate images as gauges.
+    # Defaults to true.
+    #
+    collect_images_stats: false
+
+    # Collect disk usage per image with docker.image.size and docker.image.virtual_size metrics.
+    # The check gets this size with the `docker images` command.
+    # Requires collect_images_stats to be enabled.
+    # Defaults to false.
+    #
+    # collect_image_size: false
+
+    # Exclude containers based on their tags
+    # An excluded container will be completely ignored. The rule is a regex on the tags.
+    #
+    # How it works: exclude first.
+    # If a tag matches an exclude rule, it won't be included unless it also matches an include rule.
+    # Examples:
+    # exclude all, except ubuntu and debian.
+    # exclude: [".*"]
+    # include: ["docker_image:ubuntu", "docker_image:debian"]
+    #
+    # include all, except ubuntu and debian.
+    # exclude: ["docker_image:ubuntu", "docker_image:debian"]
+    # include: []
+    #
+    # Default: include all containers
+
+    ## Tagging
+    ##
+
+    # You can add extra tags to your Docker metrics with the tags list option.
+    # Example: ["extra_tag", "env:testing"]
+    #
+    # tags: []
+
+    # If the agent is running in an Amazon ECS task, container metrics are tagged with the ECS task name and version.
+    # Default: true
+    #
+    # ecs_tags: false
+
+    # Custom metrics tagging
+    # Define which Docker tags to apply on metrics.
+    # Since it impacts the aggregation, modify it carefully (only if you really need it).
+    #
+    # Tags for performance metrics.
+    # Available:
+    #   - image_name: Name of the image (example: "nginx")
+    #   - image_tag: Tag of the image (example: "latest")
+    #   - docker_image: LEGACY. The full image name:tag string (example: "nginx:latest")
+    #   - container_name: Name of the container (example: "boring_euclid")
+    #   - container_command: Command run by the container (example: "echo 1")
+    #
+    # performance_tags: ["container_name", "image_name", "image_tag", "docker_image"]
+
+    # Tags for containers count metrics.
+    # Available: ["image_name", "image_tag", "docker_image", "container_command"]
+    #
+    # container_tags: ["image_name", "image_tag", "docker_image"]
+
+    # List of container label names that should be collected and sent as tags.
+    # Defaults to None.
+    # Example:
+    # collect_labels_as_tags: ["com.docker.compose.service", "com.docker.compose.project"]
diff --git a/tests/checks/common.py b/tests/checks/common.py
index ee4c158e48..88539f1206 100644
--- a/tests/checks/common.py
+++ b/tests/checks/common.py
@@ -68,8 +68,10 @@ def load_check(name, config, agentConfig):
     # init the check class
     try:
         return check_class(name, init_config=init_config, agentConfig=agentConfig, instances=instances)
-    except Exception as e:
+    except TypeError as e:
         raise Exception("Check is using old API, {0}".format(e))
+    except Exception:
+        raise
 
 
 def kill_subprocess(process_obj):
diff --git a/tests/checks/integration/test_docker_daemon.py b/tests/checks/integration/test_docker_daemon.py
new file mode 100644
index 0000000000..3bcf81c2f9
--- /dev/null
+++ b/tests/checks/integration/test_docker_daemon.py
@@ -0,0 +1,469 @@
+# stdlib
+import logging
+
+# project
+from tests.checks.common import AgentCheckTest
+from utils.dockerutil import get_client, set_docker_settings, get_docker_settings, reset_docker_settings
+
+# 3rd party
+from nose.plugins.attrib import attr
+
+log = logging.getLogger('tests')
+
+CONTAINERS_TO_RUN = [
+    "nginx",
+    "redis:latest",
+]
+
+
+@attr(requires='docker_daemon')
+class TestCheckDockerDaemon(AgentCheckTest):
+    CHECK_NAME = 'docker_daemon'
+
+    def setUp(self):
+        self.docker_client = get_client()
+        for c in CONTAINERS_TO_RUN:
+            images = [i["RepoTags"][0] for i in self.docker_client.images(c.split(":")[0]) if i["RepoTags"][0].startswith(c)]
+            if len(images) == 0:
+                for line in self.docker_client.pull(c, stream=True):
+                    print line
+
+        self.containers = []
+        for c in CONTAINERS_TO_RUN:
+            name = "test-new-{0}".format(c.replace(":", "-"))
+            host_config = None
+            labels = None
+            if c == "nginx":
+                host_config = {"Memory": 137438953472}
+                labels = {"label1": "nginx", "foo": "bar"}
+
+            cont = self.docker_client.create_container(
+                c, detach=True, name=name, host_config=host_config, labels=labels)
+            self.containers.append(cont)
+
+        for c in self.containers:
+            log.info("Starting container: {0}".format(c))
+            self.docker_client.start(c)
+
+    def tearDown(self):
+        for c in self.containers:
+            log.info("Stopping container: {0}".format(c))
+            self.docker_client.remove_container(c, force=True)
+
+    def test_basic_config_single(self):
+        expected_metrics = [
+            ('docker.containers.running', ['docker_image:nginx', 'image_name:nginx']),
+            
('docker.containers.running', ['docker_image:redis:latest', 'image_name:redis', 'image_tag:latest']), + ('docker.containers.stopped', ['docker_image:redis:latest', 'image_name:redis', 'image_tag:latest']), + ('docker.containers.stopped', ['docker_image:nginx', 'image_name:nginx']), + ('docker.image.size', ['image_name:nginx', 'image_tag:1.7', 'image_tag:1.7.12']), + ('docker.image.size', ['image_name:nginx', 'image_tag:1.9.1']), + ('docker.image.size', ['image_name:redis', 'image_tag:latest']), + ('docker.image.size', ['image_name:nginx', 'image_tag:1', 'image_tag:1.9', 'image_tag:1.9.4', 'image_tag:latest']), + ('docker.image.size', ['image_name:nginx', 'image_tag:1.9.0']), + ('docker.image.size', ['image_name:nginx', 'image_tag:1.7.11']), + ('docker.image.size', ['image_name:nginx', 'image_tag:1.9.2']), + ('docker.image.size', ['image_name:nginx', 'image_tag:1.9.3']), + ('docker.image.virtual_size', ['image_name:nginx', 'image_tag:1.9.1']), + ('docker.image.virtual_size', ['image_name:nginx', 'image_tag:1.7', 'image_tag:1.7.12']), + ('docker.image.virtual_size', ['image_name:nginx', 'image_tag:1.9.0']), + ('docker.image.virtual_size', ['image_name:nginx', 'image_tag:1.7.11']), + ('docker.image.virtual_size', ['image_name:nginx', 'image_tag:1', 'image_tag:1.9', 'image_tag:1.9.4', 'image_tag:latest']), + ('docker.image.virtual_size', ['image_name:nginx', 'image_tag:1.9.2']), + ('docker.image.virtual_size', ['image_name:nginx', 'image_tag:1.9.3']), + ('docker.image.virtual_size', ['image_name:redis', 'image_tag:latest']), + ('docker.images.available', None), + ('docker.images.intermediate', None), + ('docker.mem.cache', ['container_name:test-new-nginx', 'docker_image:nginx', 'image_name:nginx']), + ('docker.mem.cache', ['container_name:test-new-redis-latest', 'docker_image:redis:latest', 'image_name:redis', 'image_tag:latest']), + ('docker.mem.rss', ['container_name:test-new-nginx', 'docker_image:nginx', 'image_name:nginx']), + ('docker.mem.rss', ['container_name:test-new-redis-latest', 'docker_image:redis:latest', 'image_name:redis', 'image_tag:latest']), + ('docker.mem.limit' ,['container_name:test-new-nginx', 'docker_image:nginx', 'image_name:nginx']), + ('docker.mem.in_use' ,['container_name:test-new-nginx', 'docker_image:nginx', 'image_name:nginx']), + ] + + + config = { + "init_config": {}, + "instances": [{ + "url": "unix://var/run/docker.sock", + "collect_image_size": True, + }, + ], + } + + self.run_check(config, force_reload=True) + for mname, tags in expected_metrics: + self.assertMetric(mname, tags=tags, count=1, at_least=1) + + + def test_basic_config_twice(self): + expected_metrics = [ + ('docker.containers.running', ['docker_image:nginx', 'image_name:nginx']), + ('docker.containers.running', ['docker_image:redis:latest', 'image_name:redis', 'image_tag:latest']), + ('docker.containers.stopped', ['docker_image:redis:latest', 'image_name:redis', 'image_tag:latest']), + ('docker.containers.stopped', ['docker_image:nginx', 'image_name:nginx']), + ('docker.images.available', None), + ('docker.images.intermediate', None), + ('docker.cpu.system', ['container_name:test-new-nginx', 'docker_image:nginx', 'image_name:nginx']), + ('docker.cpu.system', ['container_name:test-new-redis-latest', 'docker_image:redis:latest', 'image_name:redis', 'image_tag:latest']), + ('docker.cpu.user', ['container_name:test-new-nginx', 'docker_image:nginx', 'image_name:nginx']), + ('docker.cpu.user', ['container_name:test-new-redis-latest', 'docker_image:redis:latest', 'image_name:redis', 
'image_tag:latest']), + ('docker.io.read_bytes', ['container_name:test-new-nginx', 'docker_image:nginx', 'image_name:nginx']), + ('docker.io.read_bytes', ['container_name:test-new-redis-latest', 'docker_image:redis:latest', 'image_name:redis', 'image_tag:latest']), + ('docker.io.write_bytes', ['container_name:test-new-nginx', 'docker_image:nginx', 'image_name:nginx']), + ('docker.io.write_bytes', ['container_name:test-new-redis-latest', 'docker_image:redis:latest', 'image_name:redis', 'image_tag:latest']), + ('docker.mem.cache', ['container_name:test-new-nginx', 'docker_image:nginx', 'image_name:nginx']), + ('docker.mem.cache', ['container_name:test-new-redis-latest', 'docker_image:redis:latest', 'image_name:redis', 'image_tag:latest']), + ('docker.mem.rss', ['container_name:test-new-nginx', 'docker_image:nginx', 'image_name:nginx']), + ('docker.mem.rss', ['container_name:test-new-redis-latest', 'docker_image:redis:latest', 'image_name:redis', 'image_tag:latest']), + ('docker.mem.limit' ,['container_name:test-new-nginx', 'docker_image:nginx', 'image_name:nginx']), + ('docker.mem.in_use' ,['container_name:test-new-nginx', 'docker_image:nginx', 'image_name:nginx']), + ('docker.net.bytes_rcvd', ['container_name:test-new-redis-latest', 'docker_image:redis:latest', 'image_name:redis', 'image_tag:latest']), + ('docker.net.bytes_rcvd', ['container_name:test-new-nginx', 'docker_image:nginx', 'image_name:nginx']), + ('docker.net.bytes_sent', ['container_name:test-new-redis-latest', 'docker_image:redis:latest', 'image_name:redis', 'image_tag:latest']), + ('docker.net.bytes_sent', ['container_name:test-new-nginx', 'docker_image:nginx', 'image_name:nginx']) + + ] + + custom_tags = ["extra_tag", "env:testing"] + config = { + "init_config": {}, + "instances": [{ + "url": "unix://var/run/docker.sock", + "tags": custom_tags, + }, + ], + } + + self.run_check_twice(config, force_reload=True) + for mname, tags in expected_metrics: + expected_tags = list(custom_tags) + if tags is not None: + expected_tags += tags + self.assertMetric(mname, tags=expected_tags, count=1, at_least=1) + + def test_exclude_filter(self): + expected_metrics = [ + ('docker.containers.running', ['docker_image:nginx', 'image_name:nginx']), + ('docker.containers.running', ['docker_image:redis:latest', 'image_name:redis', 'image_tag:latest']), + ('docker.containers.stopped', ['docker_image:nginx', 'image_name:nginx']), + ('docker.containers.stopped', ['docker_image:redis:latest', 'image_name:redis', 'image_tag:latest']), + ('docker.cpu.system', ['container_name:test-new-redis-latest', 'docker_image:redis:latest', 'image_name:redis', 'image_tag:latest']), + ('docker.cpu.user', ['container_name:test-new-redis-latest', 'docker_image:redis:latest', 'image_name:redis', 'image_tag:latest']), + ('docker.image.size', ['image_name:nginx', 'image_tag:1.7', 'image_tag:1.7.12']), + ('docker.image.size', ['image_name:nginx', 'image_tag:1.9.1']), + ('docker.image.size', ['image_name:redis', 'image_tag:latest']), + ('docker.image.size', ['image_name:nginx', 'image_tag:1', 'image_tag:latest', 'image_tag:1.9', 'image_tag:1.9.4']), + ('docker.image.size', ['image_name:nginx', 'image_tag:1.9.0']), + ('docker.image.size', ['image_name:buildpack-deps', 'image_tag:precise']), + ('docker.image.size', ['image_name:nginx', 'image_tag:1.7.11']), + ('docker.image.size', ['image_name:nginx', 'image_tag:1.9.2']), + ('docker.image.size', ['image_name:nginx', 'image_tag:1.9.3']), + ('docker.image.virtual_size', ['image_name:nginx', 'image_tag:1.9.1']), + 
('docker.image.virtual_size', ['image_name:nginx', 'image_tag:1.7', 'image_tag:1.7.12']), + ('docker.image.virtual_size', ['image_name:nginx', 'image_tag:1.9.0']), + ('docker.image.virtual_size', ['image_name:nginx', 'image_tag:1.7.11']), + ('docker.image.virtual_size', ['image_name:buildpack-deps', 'image_tag:precise']), + ('docker.image.virtual_size', ['image_name:nginx', 'image_tag:1', 'image_tag:latest', 'image_tag:1.9', 'image_tag:1.9.4']), + ('docker.image.virtual_size', ['image_name:nginx', 'image_tag:1.9.2']), + ('docker.image.virtual_size', ['image_name:nginx', 'image_tag:1.9.3']), + ('docker.image.virtual_size', ['image_name:redis', 'image_tag:latest']), + ('docker.images.available', None), + ('docker.images.intermediate', None), + ('docker.io.read_bytes', ['container_name:test-new-redis-latest', 'docker_image:redis:latest', 'image_name:redis', 'image_tag:latest']), + ('docker.io.write_bytes', ['container_name:test-new-redis-latest', 'docker_image:redis:latest', 'image_name:redis', 'image_tag:latest']), + ('docker.mem.cache', ['container_name:test-new-redis-latest', 'docker_image:redis:latest', 'image_name:redis', 'image_tag:latest']), + ('docker.mem.rss', ['container_name:test-new-redis-latest', 'docker_image:redis:latest', 'image_name:redis', 'image_tag:latest']), + ('docker.net.bytes_rcvd', ['container_name:test-new-redis-latest', 'docker_image:redis:latest', 'image_name:redis', 'image_tag:latest']), + ('docker.net.bytes_sent', ['container_name:test-new-redis-latest', 'docker_image:redis:latest', 'image_name:redis', 'image_tag:latest']) + ] + config = { + "init_config": {}, + "instances": [{ + "url": "unix://var/run/docker.sock", + "exclude": ["docker_image:nginx"], + "collect_image_size": True, + }, + ], + } + + self.run_check_twice(config, force_reload=True) + + for mname, tags in expected_metrics: + self.assertMetric(mname, tags=tags, count=1, at_least=1) + + perf_metrics = [ + "docker.cpu.system", + "docker.cpu.user", + "docker.io.read_bytes", + "docker.io.write_bytes", + "docker.mem.cache", + "docker.mem.rss", + "docker.net.bytes_rcvd", + "docker.net.bytes_sent" + ] + + nginx_tags = ['container_name:test-new-nginx', 'docker_image:nginx', 'image_name:nginx'] + for mname in perf_metrics: + self.assertMetric(mname, tags=nginx_tags, count=0) + + def test_include_filter(self): + expected_metrics = [ + ('docker.containers.running', ['docker_image:nginx', 'image_name:nginx']), + ('docker.containers.running', ['docker_image:redis:latest', 'image_name:redis', 'image_tag:latest']), + ('docker.containers.stopped', ['docker_image:nginx', 'image_name:nginx']), + ('docker.containers.stopped', ['docker_image:redis:latest', 'image_name:redis', 'image_tag:latest']), + ('docker.cpu.system', ['container_name:test-new-redis-latest', 'docker_image:redis:latest', 'image_name:redis', 'image_tag:latest']), + ('docker.cpu.user', ['container_name:test-new-redis-latest', 'docker_image:redis:latest', 'image_name:redis', 'image_tag:latest']), + ('docker.image.size', ['image_name:nginx', 'image_tag:1.7', 'image_tag:1.7.12']), + ('docker.image.size', ['image_name:nginx', 'image_tag:1.9.1']), + ('docker.image.size', ['image_name:redis', 'image_tag:latest']), + ('docker.image.size', ['image_name:nginx', 'image_tag:1', 'image_tag:latest', 'image_tag:1.9', 'image_tag:1.9.4']), + ('docker.image.size', ['image_name:nginx', 'image_tag:1.9.0']), + ('docker.image.size', ['image_name:buildpack-deps', 'image_tag:precise']), + ('docker.image.size', ['image_name:nginx', 'image_tag:1.7.11']), + ('docker.image.size', 
['image_name:nginx', 'image_tag:1.9.2']), + ('docker.image.size', ['image_name:nginx', 'image_tag:1.9.3']), + ('docker.image.virtual_size', ['image_name:nginx', 'image_tag:1.9.1']), + ('docker.image.virtual_size', ['image_name:nginx', 'image_tag:1.7', 'image_tag:1.7.12']), + ('docker.image.virtual_size', ['image_name:nginx', 'image_tag:1.9.0']), + ('docker.image.virtual_size', ['image_name:nginx', 'image_tag:1.7.11']), + ('docker.image.virtual_size', ['image_name:buildpack-deps', 'image_tag:precise']), + ('docker.image.virtual_size', ['image_name:nginx', 'image_tag:1', 'image_tag:latest', 'image_tag:1.9', 'image_tag:1.9.4']), + ('docker.image.virtual_size', ['image_name:nginx', 'image_tag:1.9.2']), + ('docker.image.virtual_size', ['image_name:nginx', 'image_tag:1.9.3']), + ('docker.image.virtual_size', ['image_name:redis', 'image_tag:latest']), + ('docker.images.available', None), + ('docker.images.intermediate', None), + ('docker.io.read_bytes', ['container_name:test-new-redis-latest', 'docker_image:redis:latest', 'image_name:redis', 'image_tag:latest']), + ('docker.io.write_bytes', ['container_name:test-new-redis-latest', 'docker_image:redis:latest', 'image_name:redis', 'image_tag:latest']), + ('docker.mem.cache', ['container_name:test-new-redis-latest', 'docker_image:redis:latest', 'image_name:redis', 'image_tag:latest']), + ('docker.mem.rss', ['container_name:test-new-redis-latest', 'docker_image:redis:latest', 'image_name:redis', 'image_tag:latest']), + ('docker.net.bytes_rcvd', ['container_name:test-new-redis-latest', 'docker_image:redis:latest', 'image_name:redis', 'image_tag:latest']), + ('docker.net.bytes_sent', ['container_name:test-new-redis-latest', 'docker_image:redis:latest', 'image_name:redis', 'image_tag:latest']) + ] + config = { + "init_config": {}, + "instances": [{ + "url": "unix://var/run/docker.sock", + "include": ["image_name:redis"], + "exclude": [".*"], + "collect_image_size": True, + }, + ], + } + + self.run_check_twice(config, force_reload=True) + + for mname, tags in expected_metrics: + self.assertMetric(mname, tags=tags, count=1, at_least=1) + + perf_metrics = [ + "docker.cpu.system", + "docker.cpu.user", + "docker.io.read_bytes", + "docker.io.write_bytes", + "docker.mem.cache", + "docker.mem.rss", + "docker.net.bytes_rcvd", + "docker.net.bytes_sent" + ] + + nginx_tags = ['container_name:test-new-nginx', 'docker_image:nginx', 'image_name:nginx'] + for m in perf_metrics: + self.assertMetric(mname, tags=nginx_tags, count=0) + + def test_tags_options(self): + expected_metrics = [ + ('docker.containers.running', ["container_command:nginx -g 'daemon off;'"]), + ('docker.containers.running', ['container_command:/entrypoint.sh redis-server']), + ('docker.containers.stopped', ["container_command:nginx -g 'daemon off;'"]), + ('docker.containers.stopped', ['container_command:/entrypoint.sh redis-server']), + ('docker.cpu.system', ["container_command:nginx -g 'daemon off;'"]), + ('docker.cpu.system', ['container_command:/entrypoint.sh redis-server']), + ('docker.cpu.user', ['container_command:/entrypoint.sh redis-server']), + ('docker.cpu.user', ["container_command:nginx -g 'daemon off;'"]), + ('docker.image.size', ['image_name:<none>', 'image_tag:<none>']), + ('docker.image.size', ['image_name:ubuntu', 'image_tag:14.04']), + ('docker.image.size', ['image_name:ruby', 'image_tag:2.2']), + ('docker.image.size', ['image_name:nginx', 'image_tag:1.7', 'image_tag:1.7.12']), + ('docker.image.size', ['image_name:nginx', 'image_tag:1.9.1']), + ('docker.image.size', 
['image_name:redis', 'image_tag:latest']), + ('docker.image.size', ['image_name:nginx', 'image_tag:1', 'image_tag:1.9.4', 'image_tag:1.9', 'image_tag:latest']), + ('docker.image.size', ['image_name:nginx', 'image_tag:1.9.0']), + ('docker.image.size', ['image_name:nginx', 'image_tag:1.7.11']), + ('docker.image.size', ['image_name:nginx', 'image_tag:1.9.2']), + ('docker.image.size', ['image_name:nginx', 'image_tag:1.9.3']), + ('docker.image.virtual_size', ['image_name:nginx', 'image_tag:1.9.1']), + ('docker.image.virtual_size', ['image_name:nginx', 'image_tag:1.7', 'image_tag:1.7.12']), + ('docker.image.virtual_size', ['image_name:nginx', 'image_tag:1.9.0']), + ('docker.image.virtual_size', ['image_name:nginx', 'image_tag:1.7.11']), + ('docker.image.virtual_size', ['image_name:nginx', 'image_tag:1', 'image_tag:1.9.4', 'image_tag:1.9', 'image_tag:latest']), + ('docker.image.virtual_size', ['image_name:nginx', 'image_tag:1.9.2']), + ('docker.image.virtual_size', ['image_name:nginx', 'image_tag:1.9.3']), + ('docker.image.virtual_size', ['image_name:redis', 'image_tag:latest']), + ('docker.images.available', None), + ('docker.images.intermediate', None), + ('docker.io.read_bytes', ["container_command:nginx -g 'daemon off;'"]), + ('docker.io.read_bytes', ['container_command:/entrypoint.sh redis-server']), + ('docker.io.write_bytes', ['container_command:/entrypoint.sh redis-server']), + ('docker.io.write_bytes', ["container_command:nginx -g 'daemon off;'"]), + ('docker.mem.cache', ["container_command:nginx -g 'daemon off;'"]), + ('docker.mem.cache', ['container_command:/entrypoint.sh redis-server']), + ('docker.mem.rss', ['container_command:/entrypoint.sh redis-server']), + ('docker.mem.rss', ["container_command:nginx -g 'daemon off;'"]), + ('docker.mem.limit', ["container_command:nginx -g 'daemon off;'"]), + ('docker.mem.in_use', ["container_command:nginx -g 'daemon off;'"]), + ('docker.net.bytes_rcvd', ['container_command:/entrypoint.sh redis-server']), + ('docker.net.bytes_rcvd', ["container_command:nginx -g 'daemon off;'"]), + ('docker.net.bytes_sent', ["container_command:nginx -g 'daemon off;'"]), + ('docker.net.bytes_sent', ['container_command:/entrypoint.sh redis-server']) + ] + config = { + "init_config": {}, + "instances": [{ + "url": "unix://var/run/docker.sock", + "performance_tags": ["container_command"], + "container_tags": ["container_command"], + "collect_image_size": True, + }, + ], + } + + self.run_check_twice(config, force_reload=True) + for mname, tags in expected_metrics: + self.assertMetric(mname, tags=tags, count=1, at_least=1) + + def test_set_docker_settings(self): + self.assertEqual(get_docker_settings()["version"], "auto") + cur_loc = __file__ + init_config = { + "api_version": "foobar", + "timeout": "42", + "tls_client_cert": cur_loc, + "tls_client_key": cur_loc, + "tls_cacert": cur_loc, + "tls": True + } + + instance = { + "url": "https://foo.bar:42", + } + + set_docker_settings(init_config, instance) + client = get_client() + self.assertEqual(client.verify, cur_loc) + self.assertEqual(client.cert, (cur_loc, cur_loc)) + reset_docker_settings() + + def test_labels_collection(self): + expected_metrics = [ + ('docker.containers.running', ['docker_image:nginx', 'image_name:nginx', 'label1:nginx']), + ('docker.containers.running', ['docker_image:redis:latest', 'image_name:redis', 'image_tag:latest']), + ('docker.containers.stopped', ['docker_image:redis:latest', 'image_name:redis', 'image_tag:latest']), + ('docker.containers.stopped', ['docker_image:nginx',
'image_name:nginx', 'label1:nginx']), + ('docker.image.size', ['image_name:nginx', 'image_tag:1.7', 'image_tag:1.7.12']), + ('docker.image.size', ['image_name:nginx', 'image_tag:1.9.1']), + ('docker.image.size', ['image_name:redis', 'image_tag:latest']), + ('docker.image.size', ['image_name:nginx', 'image_tag:1', 'image_tag:1.9', 'image_tag:1.9.4', 'image_tag:latest']), + ('docker.image.size', ['image_name:nginx', 'image_tag:1.9.0']), + ('docker.image.size', ['image_name:nginx', 'image_tag:1.7.11']), + ('docker.image.size', ['image_name:nginx', 'image_tag:1.9.2']), + ('docker.image.size', ['image_name:nginx', 'image_tag:1.9.3']), + ('docker.image.virtual_size', ['image_name:nginx', 'image_tag:1.9.1']), + ('docker.image.virtual_size', ['image_name:nginx', 'image_tag:1.7', 'image_tag:1.7.12']), + ('docker.image.virtual_size', ['image_name:nginx', 'image_tag:1.9.0']), + ('docker.image.virtual_size', ['image_name:nginx', 'image_tag:1.7.11']), + ('docker.image.virtual_size', ['image_name:nginx', 'image_tag:1', 'image_tag:1.9', 'image_tag:1.9.4', 'image_tag:latest']), + ('docker.image.virtual_size', ['image_name:nginx', 'image_tag:1.9.2']), + ('docker.image.virtual_size', ['image_name:nginx', 'image_tag:1.9.3']), + ('docker.image.virtual_size', ['image_name:redis', 'image_tag:latest']), + ('docker.images.available', None), + ('docker.images.intermediate', None), + ('docker.mem.cache', ['container_name:test-new-nginx', 'docker_image:nginx', 'image_name:nginx', 'label1:nginx']), + ('docker.mem.cache', ['container_name:test-new-redis-latest', 'docker_image:redis:latest', 'image_name:redis', 'image_tag:latest']), + ('docker.mem.rss', ['container_name:test-new-nginx', 'docker_image:nginx', 'image_name:nginx', 'label1:nginx']), + ('docker.mem.rss', ['container_name:test-new-redis-latest', 'docker_image:redis:latest', 'image_name:redis', 'image_tag:latest']), + ('docker.mem.limit', ['container_name:test-new-nginx', 'docker_image:nginx', 'image_name:nginx', 'label1:nginx']), + ('docker.mem.in_use', ['container_name:test-new-nginx', 'docker_image:nginx', 'image_name:nginx', 'label1:nginx']), + ] + + config = { + "init_config": {}, + "instances": [{ + "url": "unix://var/run/docker.sock", + "collect_labels_as_tags": ["label1"], + "collect_image_size": True, + }, + ], + } + + self.run_check(config, force_reload=True) + for mname, tags in expected_metrics: + self.assertMetric(mname, tags=tags, count=1, at_least=1) + + def test_events(self): + config = { + "init_config": {}, + "instances": [{ + "url": "unix://var/run/docker.sock", + }, + ], + } + + self.run_check(config, force_reload=True) + self.assertEqual(len(self.events), 2) + + def test_container_size(self): + expected_metrics = [ + ('docker.containers.running', ['docker_image:nginx', 'image_name:nginx']), + ('docker.containers.running', ['docker_image:redis:latest', 'image_name:redis', 'image_tag:latest']), + ('docker.containers.stopped', ['docker_image:redis:latest', 'image_name:redis', 'image_tag:latest']), + ('docker.containers.stopped', ['docker_image:nginx', 'image_name:nginx']), + ('docker.image.size', ['image_name:nginx', 'image_tag:1.7', 'image_tag:1.7.12']), + ('docker.image.size', ['image_name:nginx', 'image_tag:1.9.1']), + ('docker.image.size', ['image_name:redis', 'image_tag:latest']), + ('docker.image.size', ['image_name:nginx', 'image_tag:1', 'image_tag:1.9', 'image_tag:1.9.4', 'image_tag:latest']), + ('docker.image.size', ['image_name:nginx', 'image_tag:1.9.0']), + ('docker.image.size', ['image_name:nginx', 'image_tag:1.7.11']), +
('docker.image.size', ['image_name:nginx', 'image_tag:1.9.2']), + ('docker.image.size', ['image_name:nginx', 'image_tag:1.9.3']), + ('docker.image.virtual_size', ['image_name:nginx', 'image_tag:1.9.1']), + ('docker.image.virtual_size', ['image_name:nginx', 'image_tag:1.7', 'image_tag:1.7.12']), + ('docker.image.virtual_size', ['image_name:nginx', 'image_tag:1.9.0']), + ('docker.image.virtual_size', ['image_name:nginx', 'image_tag:1.7.11']), + ('docker.image.virtual_size', ['image_name:nginx', 'image_tag:1', 'image_tag:1.9', 'image_tag:1.9.4', 'image_tag:latest']), + ('docker.image.virtual_size', ['image_name:nginx', 'image_tag:1.9.2']), + ('docker.image.virtual_size', ['image_name:nginx', 'image_tag:1.9.3']), + ('docker.image.virtual_size', ['image_name:redis', 'image_tag:latest']), + ('docker.images.available', None), + ('docker.images.intermediate', None), + ('docker.mem.cache', ['container_name:test-new-nginx', 'docker_image:nginx', 'image_name:nginx']), + ('docker.mem.cache', ['container_name:test-new-redis-latest', 'docker_image:redis:latest', 'image_name:redis', 'image_tag:latest']), + ('docker.mem.rss', ['container_name:test-new-nginx', 'docker_image:nginx', 'image_name:nginx']), + ('docker.mem.rss', ['container_name:test-new-redis-latest', 'docker_image:redis:latest', 'image_name:redis', 'image_tag:latest']), + ('docker.mem.limit', ['container_name:test-new-nginx', 'docker_image:nginx', 'image_name:nginx']), + ('docker.mem.in_use', ['container_name:test-new-nginx', 'docker_image:nginx', 'image_name:nginx']), + # Container size metrics + ("docker.container.size_rootfs", ['container_name:test-new-nginx', 'docker_image:nginx', 'image_name:nginx']), + ("docker.container.size_rootfs", ['container_name:test-new-redis-latest', 'docker_image:redis:latest', 'image_name:redis', 'image_tag:latest']), + ("docker.container.size_rw", ['container_name:test-new-nginx', 'docker_image:nginx', 'image_name:nginx']), + ] + + config = { + "init_config": {}, + "instances": [{ + "url": "unix://var/run/docker.sock", + "collect_container_size": True, + "collect_image_size": True, + }, + ], + } + + self.run_check(config, force_reload=True) + for mname, tags in expected_metrics: + self.assertMetric(mname, tags=tags, count=1, at_least=1) diff --git a/tests/checks/mock/test_docker.py b/tests/checks/mock/test_docker.py deleted file mode 100644 index 14cff8d014..0000000000 --- a/tests/checks/mock/test_docker.py +++ /dev/null @@ -1,77 +0,0 @@ -import unittest - -from mock import patch - -from tests.checks.common import get_check_class - -def _mocked_find_cgroup(*args, **kwargs): - return - -class DockerCheckTest(unittest.TestCase): - def test_tag_exclude_all(self): - """ exclude all, except ubuntu and debian.
""" - instance = { - 'include': [ - 'docker_image:ubuntu', - 'docker_image:debian', - ], - 'exclude': ['.*'], - } - - klass = get_check_class('docker') - # NO-OP but loads the check - with patch.object(klass, '_find_cgroup', _mocked_find_cgroup): - check = klass('docker', {}, {}) - - check._prepare_filters(instance) - self.assertEquals(len(instance['exclude_patterns']), 1) - self.assertEquals(len(instance['include_patterns']), 2) - - truth_table_exclusion = { - 'some_tag': True, - 'debian:ubuntu': True, - 'docker_image:centos': True, - 'docker_image:ubuntu': False, - 'docker_image:debian': False, - } - - for tag, val in truth_table_exclusion.iteritems(): - self.assertEquals( - check._is_container_excluded(instance, [tag]), - val, - "{0} expected {1} but is not".format(tag, val) - ) - - def test_tag_include_all(self): - """ exclude all, except ubuntu and debian. """ - instance = { - 'include': [], - 'exclude': [ - 'docker_image:ubuntu', - 'docker_image:debian', - ], - } - - klass = get_check_class('docker') - # NO-OP but loads the check - with patch.object(klass, '_find_cgroup', _mocked_find_cgroup): - check = klass('docker', {}, {}) - - check._prepare_filters(instance) - self.assertEquals(len(instance['exclude_patterns']), 2) - self.assertEquals(len(instance['include_patterns']), 0) - - truth_table_exclusion = { - 'some_tag': False, - 'debian:ubuntu': False, - 'docker_image:centos': False, - 'docker_image:ubuntu': True, - 'docker_image:debian': True, - } - - for tag, val in truth_table_exclusion.iteritems(): - self.assertEquals( - check._is_container_excluded(instance, [tag]), - val, - "{0} expected {1} but is not".format(tag, val) - ) diff --git a/utils/dockerutil.py b/utils/dockerutil.py new file mode 100644 index 0000000000..3702c6061e --- /dev/null +++ b/utils/dockerutil.py @@ -0,0 +1,94 @@ +# stdlib +import os + +# 3rd party +from docker import Client +from docker import tls + +class MountException(Exception): + pass + +# Default docker client settings +DEFAULT_TIMEOUT = 5 +DEFAULT_VERSION = 'auto' + +_docker_client_settings = {"version": DEFAULT_VERSION} + +def get_docker_settings(): + global _docker_client_settings + return _docker_client_settings + +def reset_docker_settings(): + global _docker_client_settings + _docker_client_settings = {"version": DEFAULT_VERSION} + +def set_docker_settings(init_config, instance): + global _docker_client_settings + _docker_client_settings = { + "version": init_config.get('api_version', DEFAULT_VERSION), + "base_url": instance.get("url"), + "timeout": int(init_config.get('timeout', DEFAULT_TIMEOUT)), + } + + if init_config.get('tls', False): + client_cert_path = init_config.get('tls_client_cert') + client_key_path = init_config.get('tls_client_key') + cacert = init_config.get('tls_cacert') + verify = init_config.get('tls_verify') + + client_cert = None + if client_cert_path is not None and client_key_path is not None: + client_cert = (client_cert_path, client_key_path) + + verify = verify if verify is not None else cacert + tls_config = tls.TLSConfig(client_cert=client_cert, verify=verify) + _docker_client_settings["tls"] = tls_config + +def get_client(): + return Client(**_docker_client_settings) + +def find_cgroup(hierarchy, docker_root): + """Find the mount point for a specified cgroup hierarchy. + + Works with old style and new style mounts. 
+ """ + with open(os.path.join(docker_root, "/proc/mounts"), 'r') as fp: + mounts = map(lambda x: x.split(), fp.read().splitlines()) + cgroup_mounts = filter(lambda x: x[2] == "cgroup", mounts) + if len(cgroup_mounts) == 0: + raise Exception( + "Can't find mounted cgroups. If you run the Agent inside a container," + " please refer to the documentation.") + # Old cgroup style + if len(cgroup_mounts) == 1: + return os.path.join(docker_root, cgroup_mounts[0][1]) + + candidate = None + for _, mountpoint, _, opts, _, _ in cgroup_mounts: + if hierarchy in opts: + if mountpoint.startswith("/host/"): + return os.path.join(docker_root, mountpoint) + candidate = mountpoint + + if candidate is not None: + return os.path.join(docker_root, candidate) + raise Exception("Can't find mounted %s cgroups." % hierarchy) + +def find_cgroup_filename_pattern(mountpoints, container_id): + # We try with different cgroups so that it works even if only one is properly working + for mountpoint in mountpoints.itervalues(): + stat_file_path_lxc = os.path.join(mountpoint, "lxc") + stat_file_path_docker = os.path.join(mountpoint, "docker") + stat_file_path_coreos = os.path.join(mountpoint, "system.slice") + stat_file_path_kubernetes = os.path.join(mountpoint, container_id) + + if os.path.exists(stat_file_path_lxc): + return os.path.join('%(mountpoint)s/lxc/%(id)s/%(file)s') + elif os.path.exists(stat_file_path_docker): + return os.path.join('%(mountpoint)s/docker/%(id)s/%(file)s') + elif os.path.exists(stat_file_path_coreos): + return os.path.join('%(mountpoint)s/system.slice/docker-%(id)s.scope/%(file)s') + elif os.path.exists(stat_file_path_kubernetes): + return os.path.join('%(mountpoint)s/%(id)s/%(file)s') + + raise MountException("Cannot find Docker cgroup directory. Be sure your system is supported.") diff --git a/utils/platform.py b/utils/platform.py index 5809e6005c..d1573ce7a2 100644 --- a/utils/platform.py +++ b/utils/platform.py @@ -1,5 +1,10 @@ +# stdlib import sys -import docker + +# project +from utils.dockerutil import get_client + +_is_ecs = None class Platform(object): """ @@ -57,9 +62,19 @@ def is_windows(name=None): @staticmethod def is_ecs_instance(): """Return True if the agent is running in an ECS instance, False otherwise.""" - cl = docker.Client(version='auto') - containers = cl.containers() - for co in containers: - if '/ecs-agent' in co.get('Names', ''): - return True + global _is_ecs + if _is_ecs is not None: + return _is_ecs + + try: + client = get_client() + containers = client.containers() + for co in containers: + if '/ecs-agent' in co.get('Names', ''): + _is_ecs = True + return True + except Exception, e: + pass + + _is_ecs = False return False From ba18323199b624e510de677594dd203f182ee610 Mon Sep 17 00:00:00 2001 From: Remi Hakim <remi@datadoghq.com> Date: Tue, 15 Sep 2015 17:44:48 +0000 Subject: [PATCH 3/3] [docker_daemon] Handle case where /proc is not properly mounted --- checks.d/docker_daemon.py | 34 ++++++++++++++++++++++++++-------- 1 file changed, 26 insertions(+), 8 deletions(-) diff --git a/checks.d/docker_daemon.py b/checks.d/docker_daemon.py index 42536446d6..fd7864e06b 100644 --- a/checks.d/docker_daemon.py +++ b/checks.d/docker_daemon.py @@ -161,6 +161,7 @@ def init(self): self.cgroup_listing_retries = 0 self._latest_size_query = 0 self._filtered_containers = set() + self._disable_net_metrics = False # At first run we'll just collect the events from the latest 60 secs self._last_event_collection_ts = int(time.time()) - 60 @@ -476,6 +477,14 @@ def _report_cgroup_metrics(self, 
container, tags): def _report_net_metrics(self, container, tags): """Find container network metrics by looking at /proc/$PID/net/dev of the container process.""" + if self._disable_net_metrics: + self.log.debug("Network metrics are disabled. Skipping") + return + + if "_proc_root" not in container: + self.warning("Couldn't find pid directory for container: {0}".format(container)) + return + proc_net_file = os.path.join(container['_proc_root'], 'net/dev') try: with open(proc_net_file, 'r') as fp: @@ -622,14 +631,23 @@ def _parse_blkio_metrics(self, stats): # proc files def _crawl_container_pids(self, container_dict): """Crawl `/proc` to find container PIDs and add them to `containers_by_id`.""" - proc_dir = os.path.join(self._docker_root, 'proc') - for folder in os.listdir(proc_dir): - try: - int(folder) - except ValueError: - continue + proc_path = os.path.join(self._docker_root, 'proc') + pid_dirs = [_dir for _dir in os.listdir(proc_path) if _dir.isdigit()] + + if len(pid_dirs) == 0: + self.warning("Unable to find any pid directory in {0}. " + "If you are running the agent in a container, make sure to " + 'share the volume properly: "/proc:/host/proc:ro". ' + "Network metrics will be missing".format(proc_path)) + self._disable_net_metrics = True + return container_dict + + self._disable_net_metrics = False + + for folder in pid_dirs: try: - path = os.path.join(proc_dir, folder, 'cgroup') + path = os.path.join(proc_path, folder, 'cgroup') with open(path, 'r') as f: content = [line.strip().split(':') for line in f.readlines()] except Exception, e: @@ -641,7 +659,7 @@ def _crawl_container_pids(self, container_dict): if 'docker/' in content.get('cpuacct'): container_id = content['cpuacct'].split('docker/')[1] container_dict[container_id]['_pid'] = folder - container_dict[container_id]['_proc_root'] = os.path.join(proc_dir, folder) + container_dict[container_id]['_proc_root'] = os.path.join(proc_path, folder) except Exception, e: self.warning("Cannot parse %s content: %s" % (path, str(e))) continue
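For reference, here is a minimal standalone sketch of the /proc crawl that the last hunk implements: list the numeric entries under a proc root, read each process's cgroup file, and map Docker container ids to their pid directory. The function name crawl_container_pids, the '/host/proc' default, and the IOError-only error handling are illustrative assumptions rather than code from this patch, and the simple 'docker/' split mirrors the hunk rather than covering every cgroup layout.

import os

def crawl_container_pids(proc_path='/host/proc'):  # illustrative name and default, not part of the patch
    """Map Docker container ids to the pid and proc directory of a process."""
    containers = {}
    # Only numeric entries of the proc root are process directories.
    pid_dirs = [d for d in os.listdir(proc_path) if d.isdigit()]
    for folder in pid_dirs:
        cgroup_file = os.path.join(proc_path, folder, 'cgroup')
        try:
            with open(cgroup_file, 'r') as f:
                # Each cgroup v1 line looks like "3:cpuacct:/docker/<container_id>"
                lines = [line.strip().split(':', 2) for line in f.readlines()]
        except IOError:
            continue  # the process may have exited since listdir()
        for parts in lines:
            if len(parts) != 3:
                continue
            _, subsystem, cgroup_path = parts
            if 'cpuacct' in subsystem and 'docker/' in cgroup_path:
                containers[cgroup_path.split('docker/')[1]] = {
                    '_pid': folder,
                    '_proc_root': os.path.join(proc_path, folder),
                }
    return containers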