From 8d15b4a8f14423a193d5e768955373d01fb4f71d Mon Sep 17 00:00:00 2001 From: Dorian Zaccaria Date: Thu, 9 Apr 2015 17:08:08 -0400 Subject: [PATCH 1/3] [mesos] Improve mesos integration * Add a check for mesos masters, only the leader will report metrics * Add a check for mesos slave, slaves will report metrics from the selected tasks only if the task is running on the node * Add mocked tests for mesos integration --- checks.d/mesos_master.py | 241 ++++++++++++++++++++++++ checks.d/mesos_slave.py | 196 ++++++++++++++++++++ conf.d/mesos_master.yaml.example | 5 + conf.d/mesos_slave.yaml.example | 7 + tests/test_mesos_master.py | 309 +++++++++++++++++++++++++++++++ tests/test_mesos_slave.py | 218 ++++++++++++++++++++++ 6 files changed, 976 insertions(+) create mode 100644 checks.d/mesos_master.py create mode 100644 checks.d/mesos_slave.py create mode 100644 conf.d/mesos_master.yaml.example create mode 100644 conf.d/mesos_slave.yaml.example create mode 100644 tests/test_mesos_master.py create mode 100644 tests/test_mesos_slave.py diff --git a/checks.d/mesos_master.py b/checks.d/mesos_master.py new file mode 100644 index 0000000000..e6a954834a --- /dev/null +++ b/checks.d/mesos_master.py @@ -0,0 +1,241 @@ +"""Mesos Master check + +Collects metrics from mesos master node, only the leader is sending metrics. 
+""" +# stdlib +from hashlib import md5 +import time + +# project +from checks import AgentCheck + +# 3rd party +import requests + + +class MesosMaster(AgentCheck): + GAUGE = AgentCheck.gauge + RATE = AgentCheck.rate + SERVICE_CHECK_NAME = "mesos_master.can_connect" + SERVICE_CHECK_NEEDED = True + + + FRAMEWORK_METRICS = { + 'cpus' : ('mesos.framework.cpu', GAUGE), + 'mem' : ('mesos.framework.mem', GAUGE), + 'disk' : ('mesos.framework.disk', GAUGE), + } + + ROLE_RESOURCES_METRICS = { + 'cpus' : ('mesos.role.cpu', GAUGE), + 'mem' : ('mesos.role.mem', GAUGE), + 'disk' : ('mesos.role.disk', GAUGE), + } + + # These metrics are aggregated only on the elected master + CLUSTER_TASKS_METRICS = { + 'staged_tasks' : ('mesos.cluster.staged_tasks', GAUGE), + 'started_tasks' : ('mesos.cluster.started_tasks', GAUGE), + 'finished_tasks' : ('mesos.cluster.finished_tasks', GAUGE), + 'killed_tasks' : ('mesos.cluster.killed_tasks', GAUGE), + 'failed_tasks' : ('mesos.cluster.failed_tasks', GAUGE), + 'lost_tasks' : ('mesos.cluster.lost_tasks', GAUGE), + 'active_tasks_gauge' : ('mesos.cluster.active_tasks_gauge', GAUGE), + } + + # These metrics are aggregated only on the elected master + CLUSTER_SLAVES_METRICS = { + 'master/slave_registrations' : ('mesos.cluster.slave_registrations', GAUGE), + 'master/slave_removals' : ('mesos.cluster.slave_removals', GAUGE), + 'master/slave_reregistrations' : ('mesos.cluster.slave_reregistrations', GAUGE), + 'master/slave_shutdowns_canceled' : ('mesos.cluster.slave_shutdowns_canceled', GAUGE), + 'master/slave_shutdowns_scheduled' : ('mesos.cluster.slave_shutdowns_scheduled', GAUGE), + 'master/slaves_active' : ('mesos.cluster.slaves_active', GAUGE), + 'master/slaves_connected' : ('mesos.cluster.slaves_connected', GAUGE), + 'master/slaves_disconnected' : ('mesos.cluster.slaves_disconnected', GAUGE), + 'master/slaves_inactive' : ('mesos.cluster.slaves_inactive', GAUGE), + 'master/recovery_slave_removals' : ('mesos.cluster.recovery_slave_removals', GAUGE), 
+ } + + # These metrics are aggregated only on the elected master + CLUSTER_RESOURCES_METRICS = { + 'master/cpus_percent' : ('mesos.cluster.cpus_percent', GAUGE), + 'master/cpus_total' : ('mesos.cluster.cpus_total', GAUGE), + 'master/cpus_used' : ('mesos.cluster.cpus_used', GAUGE), + 'master/disk_percent' : ('mesos.cluster.disk_percent', GAUGE), + 'master/disk_total' : ('mesos.cluster.disk_total', GAUGE), + 'master/disk_used' : ('mesos.cluster.disk_used', GAUGE), + 'master/mem_percent' : ('mesos.cluster.mem_percent', GAUGE), + 'master/mem_total' : ('mesos.cluster.mem_total', GAUGE), + 'master/mem_used' : ('mesos.cluster.mem_used', GAUGE), + } + + # These metrics are aggregated only on the elected master + CLUSTER_REGISTRAR_METRICS = { + 'registrar/queued_operations' : ('mesos.registrar.queued_operations', GAUGE), + 'registrar/registry_size_bytes' : ('mesos.registrar.registry_size_bytes', GAUGE), + 'registrar/state_fetch_ms' : ('mesos.registrar.state_fetch_ms', GAUGE), + 'registrar/state_store_ms' : ('mesos.registrar.state_store_ms', GAUGE), + 'registrar/state_store_ms/count' : ('mesos.registrar.state_store_ms.count', GAUGE), + 'registrar/state_store_ms/max' : ('mesos.registrar.state_store_ms.max', GAUGE), + 'registrar/state_store_ms/min' : ('mesos.registrar.state_store_ms.min', GAUGE), + 'registrar/state_store_ms/p50' : ('mesos.registrar.state_store_ms.p50', GAUGE), + 'registrar/state_store_ms/p90' : ('mesos.registrar.state_store_ms.p90', GAUGE), + 'registrar/state_store_ms/p95' : ('mesos.registrar.state_store_ms.p95', GAUGE), + 'registrar/state_store_ms/p99' : ('mesos.registrar.state_store_ms.p99', GAUGE), + 'registrar/state_store_ms/p999' : ('mesos.registrar.state_store_ms.p999', GAUGE), + 'registrar/state_store_ms/p9999' : ('mesos.registrar.state_store_ms.p9999', GAUGE), + } + + # These metrics are aggregated only on the elected master + CLUSTER_FRAMEWORK_METRICS = { + 'master/frameworks_active' : ('mesos.cluster.frameworks_active', GAUGE), + 
'master/frameworks_connected' : ('mesos.cluster.frameworks_connected', GAUGE), + 'master/frameworks_disconnected' : ('mesos.cluster.frameworks_disconnected', GAUGE), + 'master/frameworks_inactive' : ('mesos.cluster.frameworks_inactive', GAUGE), + } + + # These metrics are aggregated on all nodes in the cluster + SYSTEM_METRICS = { + 'system/cpus_total' : ('mesos.stats.system.cpus_total', GAUGE), + 'system/load_15min' : ('mesos.stats.system.load_15min', RATE), + 'system/load_1min' : ('mesos.stats.system.load_1min', RATE), + 'system/load_5min' : ('mesos.stats.system.load_5min', RATE), + 'system/mem_free_bytes' : ('mesos.stats.system.mem_free_bytes', GAUGE), + 'system/mem_total_bytes' : ('mesos.stats.system.mem_total_bytes', GAUGE), + 'master/elected' : ('mesos.stats.elected', GAUGE), + 'master/uptime_secs' : ('mesos.stats.uptime_secs', GAUGE), + } + + # These metrics are aggregated only on the elected master + STATS_METRICS = { + 'active_schedulers' : ('mesos.cluster.active_schedulers', GAUGE), + 'total_schedulers' : ('mesos.cluster.total_schedulers', GAUGE), + 'outstanding_offers' : ('mesos.cluster.outstanding_offers', GAUGE), + 'master/dropped_messages' : ('mesos.cluster.dropped_messages', GAUGE), + 'master/outstanding_offers' : ('mesos.cluster.outstanding_offers', GAUGE), + 'master/event_queue_dispatches' : ('mesos.cluster.event_queue_dispatches', GAUGE), + 'master/event_queue_http_requests' : ('mesos.cluster.event_queue_http_requests', GAUGE), + 'master/event_queue_messages' : ('mesos.cluster.event_queue_messages', GAUGE), + 'master/invalid_framework_to_executor_messages' : ('mesos.cluster.invalid_framework_to_executor_messages', GAUGE), + 'master/invalid_status_update_acknowledgements' : ('mesos.cluster.invalid_status_update_acknowledgements', GAUGE), + 'master/invalid_status_updates' : ('mesos.cluster.invalid_status_updates', GAUGE), + 'master/valid_framework_to_executor_messages' : ('mesos.cluster.valid_framework_to_executor_messages', GAUGE), + 
'master/valid_status_update_acknowledgements' : ('mesos.cluster.valid_status_update_acknowledgements', GAUGE), + 'master/valid_status_updates' : ('mesos.cluster.valid_status_updates', GAUGE), + } + + def _timeout_event(self, url, timeout, aggregation_key): + self.event({ + 'timestamp': int(time.time()), + 'event_type': 'http_check', + 'msg_title': 'URL timeout', + 'msg_text': '%s timed out after %s seconds.' % (url, timeout), + 'aggregation_key': aggregation_key + }) + + def _status_code_event(self, url, r, aggregation_key): + self.event({ + 'timestamp': int(time.time()), + 'event_type': 'http_check', + 'msg_title': 'Invalid reponse code for %s' % url, + 'msg_text': '%s returned a status of %s' % (url, r.status_code), + 'aggregation_key': aggregation_key + }) + + def _get_json(self, url, timeout): + # Use a hash of the URL as an aggregation key + aggregation_key = md5(url).hexdigest() + tags = ["url:%s" % url] + msg = None + status = None + try: + r = requests.get(url, timeout=timeout) + if r.status_code != 200: + self._status_code_event(url, r, aggregation_key) + status = AgentCheck.CRITICAL + msg = "Got %s when hitting %s" % (r.status_code, url) + else: + status = AgentCheck.OK + msg = "Mesos master instance detected at %s " % url + except requests.exceptions.Timeout as e: + # If there's a timeout + self._timeout_event(url, timeout, aggregation_key) + msg = "%s seconds timeout when hitting %s" % (timeout, url) + status = AgentCheck.CRITICAL + except Exception as e: + msg = e.message + status = AgentCheck.CRITICAL + finally: + if self.SERVICE_CHECK_NEEDED: + self.service_check(self.SERVICE_CHECK_NAME, status, tags=tags, + message=msg) + self.SERVICE_CHECK_NEEDED = False + if status is AgentCheck.CRITICAL: + self.warning(msg) + return None + + return r.json() + + def _get_master_state(self, url, timeout): + return self._get_json(url + '/state.json', timeout) + + def _get_master_stats(self, url, timeout): + return self._get_json(url + '/stats.json', timeout) + + def 
_get_master_roles(self, url, timeout): + return self._get_json(url + '/roles.json', timeout) + + def _check_leadership(self, url, timeout): + json = self._get_master_state(url, timeout) + + if json is not None and json['leader'] == json['pid']: + self.leader = True + else: + self.leader = False + return json + + def check(self, instance): + if 'url' not in instance: + raise Exception('Mesos instance missing "url" value.') + + url = instance['url'] + instance_tags = instance.get('tags', []) + default_timeout = self.init_config.get('default_timeout', 5) + timeout = float(instance.get('timeout', default_timeout)) + + json = self._check_leadership(url, timeout) + if json: + tags = ['mesos_cluster:' + json['cluster'], 'mesos_pid:' + json['pid'], 'mesos_id:' + json['id'], 'mesos_node:master'] + instance_tags + + if self.leader: + self.GAUGE('mesos.cluster.total_frameworks', len(json['frameworks']), tags=tags) + + for framework in json['frameworks']: + framework_tags = ['framework:' + framework['id'], 'framework_name:' + framework['name']] + tags + self.GAUGE('mesos.framework.total_tasks', len(framework['tasks']), tags=framework_tags) + resources = framework['used_resources'] + [v[1](self, v[0], resources[k], tags=framework_tags) for k, v in self.FRAMEWORK_METRICS.iteritems()] + + json = self._get_master_roles(url, timeout) + if json is not None: + for role in json['roles']: + role_tags = ['mesos_role:' + role['name']] + tags + self.GAUGE('mesos.role.frameworks', len(role['frameworks']), tags=role_tags) + self.GAUGE('mesos.role.weight', role['weight'], tags=role_tags) + [v[1](self, v[0], role['resources'][k], tags=role_tags) for k, v in self.ROLE_RESOURCES_METRICS.iteritems()] + + json = self._get_master_stats(url, timeout) + if json is not None: + if self.leader: + metrics = {} + for d in (self.CLUSTER_TASKS_METRICS, self.CLUSTER_SLAVES_METRICS, + self.CLUSTER_RESOURCES_METRICS, self.CLUSTER_REGISTRAR_METRICS, + self.CLUSTER_FRAMEWORK_METRICS, self.SYSTEM_METRICS, 
self.STATS_METRICS): + metrics.update(d) + else: + metrics = self.SYSTEM_METRICS + [v[1](self, v[0], json[k], tags=tags) for k, v in metrics.iteritems()] + + + self.SERVICE_CHECK_NEEDED = True diff --git a/checks.d/mesos_slave.py b/checks.d/mesos_slave.py new file mode 100644 index 0000000000..b4e83e24e5 --- /dev/null +++ b/checks.d/mesos_slave.py @@ -0,0 +1,196 @@ +"""Mesos Slave check + +Collects metrics from mesos slave node. +""" +# stdlib +from hashlib import md5 +import time + +# project +from checks import AgentCheck + +# 3rd party +import requests + + +class MesosSlave(AgentCheck): + GAUGE = AgentCheck.gauge + RATE = AgentCheck.rate + SERVICE_CHECK_NAME = "mesos_slave.can_connect" + SERVICE_CHECK_NEEDED = True + + TASK_STATUS = { + 'TASK_STARTING' : AgentCheck.OK, + 'TASK_RUNNING' : AgentCheck.OK, + 'TASK_FINISHED' : AgentCheck.OK, + 'TASK_FAILED' : AgentCheck.CRITICAL, + 'TASK_KILLED' : AgentCheck.WARNING, + 'TASK_LOST' : AgentCheck.CRITICAL, + 'TASK_STAGING' : AgentCheck.OK, + 'TASK_ERROR' : AgentCheck.CRITICAL, + } + + TASK_METRICS = { + 'cpus' : ('mesos.state.task.cpu', GAUGE), + 'mem' : ('mesos.state.task.mem', GAUGE), + 'disk' : ('mesos.state.task.disk', GAUGE), + } + + SLAVE_TASKS_METRICS = { + 'failed_tasks' : ('mesos.slave.failed_tasks', GAUGE), + 'finished_tasks' : ('mesos.slave.finished_tasks', GAUGE), + 'killed_tasks' : ('mesos.slave.killed_tasks', GAUGE), + 'lost_tasks' : ('mesos.slave.lost_tasks', GAUGE), + 'staged_tasks' : ('mesos.slave.staged_tasks', GAUGE), + 'started_tasks' : ('mesos.slave.started_tasks', GAUGE), + 'launched_tasks_gauge' : ('mesos.slave.launched_tasks_gauge', GAUGE), + 'queued_tasks_gauge' : ('mesos.slave.queued_tasks_gauge', GAUGE), + } + + SYSTEM_METRICS = { + 'system/cpus_total' : ('mesos.stats.system.cpus_total', GAUGE), + 'system/load_15min' : ('mesos.stats.system.load_15min', RATE), + 'system/load_1min' : ('mesos.stats.system.load_1min', RATE), + 'system/load_5min' : ('mesos.stats.system.load_5min', RATE), + 
'system/mem_free_bytes' : ('mesos.stats.system.mem_free_bytes', GAUGE), + 'system/mem_total_bytes' : ('mesos.stats.system.mem_total_bytes', GAUGE), + 'slave/registered' : ('mesos.stats.registered', GAUGE), + 'slave/uptime_secs' : ('mesos.stats.uptime_secs', GAUGE), + } + + SLAVE_RESOURCE_METRICS = { + 'slave/cpus_percent' : ('mesos.slave.cpus_percent', GAUGE), + 'slave/cpus_total' : ('mesos.slave.cpus_total', GAUGE), + 'slave/cpus_used' : ('mesos.slave.cpus_used', GAUGE), + 'slave/disk_percent' : ('mesos.slave.disk_percent', GAUGE), + 'slave/disk_total' : ('mesos.slave.disk_total', GAUGE), + 'slave/disk_used' : ('mesos.slave.disk_used', GAUGE), + 'slave/mem_percent' : ('mesos.slave.mem_percent', GAUGE), + 'slave/mem_total' : ('mesos.slave.mem_total', GAUGE), + 'slave/mem_used' : ('mesos.slave.mem_used', GAUGE), + } + + SLAVE_EXECUTORS_METRICS = { + 'slave/executors_registering' : ('mesos.slave.executors_registering', GAUGE), + 'slave/executors_running' : ('mesos.slave.executors_running', GAUGE), + 'slave/executors_terminated' : ('mesos.slave.executors_terminated', GAUGE), + 'slave/executors_terminating' : ('mesos.slave.executors_terminating', GAUGE), + } + + STATS_METRICS = { + 'total_frameworks' : ('mesos.slave.total_frameworks', GAUGE), + 'slave/frameworks_active' : ('mesos.slave.frameworks_active', GAUGE), + 'slave/invalid_framework_messages' : ('mesos.slave.invalid_framework_messages', GAUGE), + 'slave/invalid_status_updates' : ('mesos.slave.invalid_status_updates', GAUGE), + 'slave/recovery_errors' : ('mesos.slave.recovery_errors', GAUGE), + 'slave/valid_framework_messages' : ('mesos.slave.valid_framework_messages', GAUGE), + 'slave/valid_status_updates' : ('mesos.slave.valid_status_updates', GAUGE), + } + + cluster_name = None + + def _timeout_event(self, url, timeout, aggregation_key): + self.event({ + 'timestamp': int(time.time()), + 'event_type': 'http_check', + 'msg_title': 'URL timeout', + 'msg_text': '%s timed out after %s seconds.' 
% (url, timeout), + 'aggregation_key': aggregation_key + }) + + def _status_code_event(self, url, r, aggregation_key): + self.event({ + 'timestamp': int(time.time()), + 'event_type': 'http_check', + 'msg_title': 'Invalid reponse code for %s' % url, + 'msg_text': '%s returned a status of %s' % (url, r.status_code), + 'aggregation_key': aggregation_key + }) + + def _get_json(self, url, timeout): + # Use a hash of the URL as an aggregation key + aggregation_key = md5(url).hexdigest() + tags = ["url:%s" % url] + msg = None + status = None + try: + r = requests.get(url, timeout=timeout) + if r.status_code != 200: + self._status_code_event(url, r, aggregation_key) + status = AgentCheck.CRITICAL + msg = "Got %s when hitting %s" % (r.status_code, url) + else: + status = AgentCheck.OK + msg = "Mesos master instance detected at %s " % url + except requests.exceptions.Timeout as e: + # If there's a timeout + self._timeout_event(url, timeout, aggregation_key) + msg = "%s seconds timeout when hitting %s" % (timeout, url) + status = AgentCheck.CRITICAL + except Exception as e: + msg = e.message + status = AgentCheck.CRITICAL + finally: + if self.SERVICE_CHECK_NEEDED: + self.service_check(self.SERVICE_CHECK_NAME, status, tags=tags, message=msg) + self.SERVICE_CHECK_NEEDED = False + if status is AgentCheck.CRITICAL: + self.warning(msg) + return None + + return r.json() + + def _get_state(self, url, timeout): + return self._get_json(url + '/state.json', timeout) + + def _get_stats(self, url, timeout): + return self._get_json(url + '/stats.json', timeout) + + def _get_constant_attributes(self, url, timeout): + json = None + if self.cluster_name is None: + json = self._get_state(url, timeout) + if json is not None: + master_state = self._get_state('http://' + json['master_hostname'] + ':5050', timeout) + if master_state is not None: + self.cluster_name = master_state['cluster'] + + return json + + def check(self, instance): + if 'url' not in instance: + raise Exception('Mesos 
instance missing "url" value.') + + url = instance['url'] + instance_tags = instance.get('tags', []) + tasks = instance.get('tasks', []) + default_timeout = self.init_config.get('default_timeout', 5) + timeout = float(instance.get('timeout', default_timeout)) + + json = self._get_constant_attributes(url, timeout) + tags = None + + if json is None: + json = self._get_state(url, timeout) + if json: + tags = ['mesos_cluster:' + self.cluster_name, 'mesos_id:' + json['id'], 'mesos_pid:' + json['pid'], 'mesos_node:slave'] + instance_tags + + for task in tasks: + for framework in json['frameworks']: + for executor in framework['executors']: + for t in executor['tasks']: + if task.lower() in t['name'].lower() and t['slave_id'] == json['id']: + task_tags = ['framework_id:' + t['framework_id'], 'executor_id:' + t['executor_id'], 'task_name:' + t['name']] + tags + self.service_check(t['name'] + '.ok', self.TASK_STATUS[t['state']], tags=task_tags) + [v[1](self, v[0], t['resources'][k], tags=task_tags) for k, v in self.TASK_METRICS.iteritems()] + + json = self._get_stats(url, timeout) + if json: + tags = tags if tags else instance_tags + metrics = {} + for d in (self.SLAVE_TASKS_METRICS, self.SYSTEM_METRICS, self.SLAVE_RESOURCE_METRICS, + self.SLAVE_EXECUTORS_METRICS, self.STATS_METRICS): + metrics.update(d) + [v[1](self, v[0], json[k], tags=tags) for k, v in metrics.iteritems()] + + self.SERVICE_CHECK_NEEDED = True diff --git a/conf.d/mesos_master.yaml.example b/conf.d/mesos_master.yaml.example new file mode 100644 index 0000000000..46b68d94cc --- /dev/null +++ b/conf.d/mesos_master.yaml.example @@ -0,0 +1,5 @@ +init_config: + default_timeout: 10 + +instances: + - url: "http://localhost:5050" diff --git a/conf.d/mesos_slave.yaml.example b/conf.d/mesos_slave.yaml.example new file mode 100644 index 0000000000..2d154c4997 --- /dev/null +++ b/conf.d/mesos_slave.yaml.example @@ -0,0 +1,7 @@ +init_config: + default_timeout: 10 + +instances: + - url: "http://localhost:5051" + # 
tasks: + # - "hello" diff --git a/tests/test_mesos_master.py b/tests/test_mesos_master.py new file mode 100644 index 0000000000..177556b95c --- /dev/null +++ b/tests/test_mesos_master.py @@ -0,0 +1,309 @@ +from tests.common import AgentCheckTest, get_check_class + +from nose.plugins.attrib import attr +from mock import patch +from checks import AgentCheck +import time + +state = { + "version": "0.22.0", + "unregistered_frameworks": [], + "started_tasks": 0, + "start_time": 1428951954.34111, + "staged_tasks": 0, + "slaves": [ + { + "resources": { + "ports": "[31000-32000]", + "mem": 244, + "disk": 35164, + "cpus": 1 + }, + "reregistered_time": 1428951983.53731, + "registered_time": 1428951983.53725, + "pid": "slave(1)@127.0.0.1:5051", + "id": "20150410-134224-16777343-5050-1778-S0", + "hostname": "localhost", + "attributes": {}, + "active": 'true' + } + ], + "pid": "master@127.0.0.1:5050", + "orphan_tasks": [], + "lost_tasks": 0, + "log_dir": "/var/log/mesos", + "leader": "master@127.0.0.1:5050", + "killed_tasks": 0, + "elected_time": 1428951954.3774, + "deactivated_slaves": 0, + "completed_frameworks": [], + "cluster": "datadog-test", + "build_user": "root", + "build_time": 1427376927, + "build_date": "2015-03-26 13:35:27", + "activated_slaves": 1, + "failed_tasks": 0, + "finished_tasks": 0, + "flags": { + "zk_session_timeout": "10secs", + "zk": "zk://localhost:2181/mesos", + "work_dir": "/var/lib/mesos", + "webui_dir": "/usr/share/mesos/webui", + "version": "false", + "user_sorter": "drf", + "slave_reregister_timeout": "10mins", + "root_submissions": "true", + "registry_strict": "false", + "registry_store_timeout": "5secs", + "registry_fetch_timeout": "1mins", + "registry": "replicated_log", + "initialize_driver_logging": "true", + "help": "false", + "framework_sorter": "drf", + "cluster": "datadog-test", + "authenticators": "crammd5", + "authenticate_slaves": "false", + "authenticate": "false", + "allocation_interval": "1secs", + "log_auto_initialize": "true", + 
"log_dir": "/var/log/mesos", + "logbufsecs": "0", + "logging_level": "INFO", + "port": "5050", + "quiet": "false", + "quorum": "1", + "recovery_slave_removal_limit": "100%" + }, + "frameworks": [ + { + "webui_url": "http://192.168.33.20:8080", + "user": "root", + "offered_resources": { + "mem": 0, + "disk": 0, + "cpus": 0 + }, + "name": "marathon", + "id": "20150403-140128-251789322-5050-6047-0000", + "hostname": "vagrant-ubuntu-trusty-64", + "failover_timeout": 604800, + "completed_tasks": [], + "checkpoint": 'true', + "active": 'true', + "offers": [], + "registered_time": 1428951955.38871, + "reregistered_time": 1428951955.38872, + "resources": { + "ports": "[31915-31915]", + "mem": 100, + "disk": 0, + "cpus": 1 + }, + "role": "*", + "tasks": [ + { + "statuses": [ + { + "timestamp": 1428673971.61592, + "state": "TASK_RUNNING" + } + ], + "executor_id": "", + "framework_id": "20150403-140128-251789322-5050-6047-0000", + "id": "hello.dc130e23-df88-11e4-b9ec-080027fc1312", + "labels": [], + "name": "hello", + "resources": { + "ports": "[31915-31915]", + "mem": 100, + "disk": 0, + "cpus": 1 + }, + "slave_id": "20150410-134224-16777343-5050-1778-S0", + "state": "TASK_RUNNING" + } + ], + "unregistered_time": 0, + "used_resources": { + "ports": "[31915-31915]", + "mem": 100, + "disk": 0, + "cpus": 1 + } + } + ], + "git_sha": "e890e2414903bb69cab730d5204f10b10d2e91bb", + "git_tag": "0.22.0", + "hostname": "localhost", + "id": "20150413-190554-16777343-5050-16324" +} + +stats = { + "valid_status_updates": 0, + "uptime": 706.524240128, + "total_schedulers": 1, + "system/mem_total_bytes": 513798144, + "system/mem_free_bytes": 13815808, + "system/load_5min": 0.02, + "system/load_1min": 0, + "system/load_15min": 0.07, + "system/cpus_total": 1, + "started_tasks": 0, + "staged_tasks": 0, + "registrar/state_store_ms/p9999": 9.90120192, + "registrar/state_store_ms/p999": 9.8956032, + "registrar/state_store_ms/p99": 9.839616, + "registrar/state_store_ms/p95": 9.590784, + 
"registrar/state_store_ms/p90": 9.279744, + "registrar/state_store_ms/p50": 6.791424, + "registrar/state_store_ms/min": 3.681024, + "registrar/state_store_ms/max": 9.901824, + "registrar/state_store_ms/count": 2, + "registrar/state_store_ms": 9.901824, + "registrar/state_fetch_ms": 3.717888, + "registrar/registry_size_bytes": 246, + "registrar/queued_operations": 0, + "outstanding_offers": 0, + "mem_used": 100, + "mem_total": 244, + "mem_percent": 0.409836065573771, + "master/valid_status_updates": 0, + "master/valid_status_update_acknowledgements": 0, + "master/valid_framework_to_executor_messages": 0, + "master/uptime_secs": 706.52485632, + "master/tasks_starting": 0, + "master/tasks_staging": 0, + "master/tasks_running": 1, + "master/tasks_lost": 0, + "master/tasks_killed": 0, + "master/tasks_finished": 0, + "master/tasks_failed": 0, + "master/tasks_error": 0, + "master/slaves_inactive": 0, + "master/slaves_disconnected": 0, + "master/invalid_framework_to_executor_messages": 0, + "master/frameworks_inactive": 0, + "master/frameworks_disconnected": 0, + "master/frameworks_connected": 1, + "master/frameworks_active": 1, + "master/event_queue_messages": 0, + "master/event_queue_http_requests": 0, + "master/event_queue_dispatches": 17, + "master/elected": 1, + "master/dropped_messages": 1, + "master/disk_used": 0, + "master/disk_total": 35164, + "master/disk_percent": 0, + "master/cpus_used": 1, + "master/cpus_total": 1, + "master/cpus_percent": 1, + "disk_percent": 0, + "deactivated_slaves": 0, + "cpus_used": 1, + "cpus_total": 1, + "cpus_percent": 1, + "active_tasks_gauge": 1, + "active_schedulers": 1, + "activated_slaves": 1, + "disk_total": 35164, + "disk_used": 0, + "elected": 1, + "failed_tasks": 0, + "finished_tasks": 0, + "invalid_status_updates": 0, + "killed_tasks": 0, + "lost_tasks": 0, + "master/invalid_status_update_acknowledgements": 0, + "master/invalid_status_updates": 0, + "master/mem_percent": 0.409836065573771, + "master/mem_total": 244, + 
"master/mem_used": 100, + "master/messages_authenticate": 0, + "master/messages_deactivate_framework": 0, + "master/messages_decline_offers": 123, + "master/messages_exited_executor": 0, + "master/messages_framework_to_executor": 0, + "master/messages_kill_task": 0, + "master/messages_launch_tasks": 0, + "master/messages_reconcile_tasks": 6, + "master/messages_register_framework": 0, + "master/messages_register_slave": 0, + "master/messages_reregister_framework": 1, + "master/messages_reregister_slave": 2, + "master/messages_resource_request": 0, + "master/messages_revive_offers": 0, + "master/messages_status_update": 0, + "master/messages_status_update_acknowledgement": 0, + "master/messages_unregister_framework": 0, + "master/messages_unregister_slave": 0, + "master/outstanding_offers": 0, + "master/recovery_slave_removals": 0, + "master/slave_registrations": 0, + "master/slave_removals": 0, + "master/slave_reregistrations": 1, + "master/slave_shutdowns_canceled": 0, + "master/slave_shutdowns_scheduled": 0, + "master/slaves_active": 1, + "master/slaves_connected": 1 +} + +roles = { + "roles": [ + { + "weight": 1, + "resources": { + "ports": "[31915-31915]", + "mem": 100, + "disk": 0, + "cpus": 1 + }, + "name": "*", + "frameworks": [ + "20150403-140128-251789322-5050-6047-0000" + ] + } + ] +} + +def _mocked_get_master_state(*args, **kwargs): + return state +def _mocked_get_master_stats(*args, **kwargs): + return stats +def _mocked_get_master_roles(*args, **kwargs): + return roles + + +@attr(requires='mesos_master') +class TestMesosMaster(AgentCheckTest): + CHECK_NAME = 'mesos_master' + + def test_checks(self): + config = { + 'init_config': {}, + 'instances': [ + { + 'url': 'http://localhost:5050' + } + ] + } + + klass = get_check_class('mesos_master') + with patch.object(klass, '_get_master_state', _mocked_get_master_state): + with patch.object(klass, '_get_master_stats', _mocked_get_master_stats): + with patch.object(klass, '_get_master_roles', 
_mocked_get_master_roles): + check = klass('mesos_master', {}, {}) + self.run_check(config) + time.sleep(1) + self.run_check(config) + metrics = {} + for d in (check.CLUSTER_TASKS_METRICS, check.CLUSTER_SLAVES_METRICS, + check.CLUSTER_RESOURCES_METRICS, check.CLUSTER_REGISTRAR_METRICS, + check.CLUSTER_FRAMEWORK_METRICS, check.SYSTEM_METRICS, check.STATS_METRICS): + metrics.update(d) + [self.assertMetric(v[0]) for k, v in check.FRAMEWORK_METRICS.iteritems()] + [self.assertMetric(v[0]) for k, v in metrics.iteritems()] + [self.assertMetric(v[0]) for k, v in check.ROLE_RESOURCES_METRICS.iteritems()] + self.assertMetric('mesos.cluster.total_frameworks') + self.assertMetric('mesos.framework.total_tasks') + self.assertMetric('mesos.role.frameworks') + self.assertMetric('mesos.role.weight') diff --git a/tests/test_mesos_slave.py b/tests/test_mesos_slave.py new file mode 100644 index 0000000000..c74c6174d3 --- /dev/null +++ b/tests/test_mesos_slave.py @@ -0,0 +1,218 @@ +from tests.common import AgentCheckTest, get_check_class + +from nose.plugins.attrib import attr +from mock import patch +from checks import AgentCheck +import time + +state = { + "version": "0.22.0", + "started_tasks": 0, + "start_time": 1428673344.06054, + "staged_tasks": 1, + "cluster": "test", + "resources": { + "ports": "[31000-32000]", + "mem": 244, + "disk": 35164, + "cpus": 1 + }, + "pid": "slave(1)@127.0.0.1:5051", + "master_hostname": "localhost", + "flags": { + "work_dir": "/tmp/mesos", + "version": "false", + "switch_user": "true", + "strict": "true", + "resource_monitoring_interval": "1secs", + "registration_backoff_factor": "1secs", + "recovery_timeout": "15mins", + "recover": "reconnect", + "executor_shutdown_grace_period": "5secs", + "executor_registration_timeout": "1mins", + "enforce_container_disk_quota": "false", + "docker_stop_timeout": "0ns", + "docker_sandbox_directory": "/mnt/mesos/sandbox", + "docker_remove_delay": "6hrs", + "docker": "docker", + "disk_watch_interval": "1mins", + 
"authenticatee": "crammd5", + "cgroups_enable_cfs": "false", + "cgroups_hierarchy": "/sys/fs/cgroup", + "cgroups_limit_swap": "false", + "cgroups_root": "mesos", + "container_disk_watch_interval": "15secs", + "containerizers": "mesos", + "default_role": "*", + "frameworks_home": "", + "gc_delay": "1weeks", + "gc_disk_headroom": "0.1", + "hadoop_home": "", + "help": "false", + "initialize_driver_logging": "true", + "isolation": "posix/cpu,posix/mem", + "launcher_dir": "/usr/libexec/mesos", + "log_dir": "/var/log/mesos", + "logbufsecs": "0", + "logging_level": "INFO", + "master": "zk://localhost:2181/mesos", + "perf_duration": "10secs", + "perf_interval": "1mins", + "port": "5051", + "quiet": "false" + }, + "finished_tasks": 0, + "failed_tasks": 0, + "completed_frameworks": [], + "build_user": "root", + "build_time": 1427376927, + "build_date": "2015-03-26 13:35:27", + "attributes": {}, + "frameworks": [ + { + "user": "root", + "checkpoint": 'true', + "completed_executors": [], + "executors": [ + { + "tasks": [ + { + "statuses": [ + { + "timestamp": 1428673971.61592, + "state": "TASK_RUNNING" + } + ], + "executor_id": "", + "framework_id": "20150403-140128-251789322-5050-6047-0000", + "id": "hello.dc130e23-df88-11e4-b9ec-080027fc1312", + "labels": [], + "name": "hello", + "resources": { + "ports": "[31915-31915]", + "mem": 100, + "disk": 0, + "cpus": 1 + }, + "slave_id": "20150410-134224-16777343-5050-1778-S0", + "state": "TASK_RUNNING" + } + ], + "completed_tasks": [], + "container": "f67a5e0b-91f9-474a-94a0-e2c6a3b28ea4", + "directory": "/tmp/mesos/slaves/20150410-134224-16777343-5050-1778-S0/frameworks/20150403-140128-251789322-5050-6047-0000/executors/hello.dc130e23-df88-11e4-b9ec-080027fc1312/runs/f67a5e0b-91f9-474a-94a0-e2c6a3b28ea4", + "id": "hello.dc130e23-df88-11e4-b9ec-080027fc1312", + "name": "Command Executor (Task: hello.dc130e23-df88-11e4-b9ec-080027fc1312) (Command: sh -c 'cd hello && ...')", + "queued_tasks": [], + "resources": { + "ports": 
"[31915-31915]", + "mem": 132, + "disk": 0, + "cpus": 1.1 + }, + "source": "hello.dc130e23-df88-11e4-b9ec-080027fc1312" + } + ], + "failover_timeout": 604800, + "hostname": "vagrant-ubuntu-trusty-64", + "id": "20150403-140128-251789322-5050-6047-0000", + "name": "marathon", + "role": "*" + } + ], + "git_sha": "e890e2414903bb69cab730d5204f10b10d2e91bb", + "git_tag": "0.22.0", + "hostname": "localhost", + "id": "20150410-134224-16777343-5050-1778-S0", + "killed_tasks": 0, + "log_dir": "/var/log/mesos", + "lost_tasks": 0 +} + +stats = { + "valid_status_updates": 1, + "uptime": 280965.77977984, + "total_frameworks": 1, + "system/mem_total_bytes": 513798144, + "system/mem_free_bytes": 34271232, + "system/load_5min": 0.08, + "system/load_1min": 0.1, + "system/load_15min": 0.06, + "system/cpus_total": 1, + "started_tasks": 0, + "staged_tasks": 1, + "slave/valid_status_updates": 1, + "slave/valid_framework_messages": 0, + "slave/uptime_secs": 280965.78028288, + "slave/tasks_starting": 0, + "slave/tasks_staging": 0, + "slave/executors_registering": 0, + "slave/disk_used": 0, + "slave/disk_total": 35164, + "slave/disk_percent": 0, + "slave/cpus_used": 1.1, + "slave/cpus_total": 1, + "slave/cpus_percent": 1.1, + "registered": 1, + "failed_tasks": 0, + "finished_tasks": 0, + "invalid_status_updates": 0, + "killed_tasks": 0, + "launched_tasks_gauge": 1, + "lost_tasks": 0, + "queued_tasks_gauge": 0, + "recovery_errors": 0, + "slave/executors_running": 1, + "slave/executors_terminated": 0, + "slave/executors_terminating": 0, + "slave/frameworks_active": 1, + "slave/invalid_framework_messages": 0, + "slave/invalid_status_updates": 0, + "slave/mem_percent": 0.540983606557377, + "slave/mem_total": 244, + "slave/mem_used": 132, + "slave/recovery_errors": 0, + "slave/registered": 1, + "slave/tasks_failed": 0, + "slave/tasks_finished": 0, + "slave/tasks_killed": 0, + "slave/tasks_lost": 0, + "slave/tasks_running": 1 +} + +def _mocked_get_state(*args, **kwargs): + return state +def 
_mocked_get_stats(*args, **kwargs): + return stats + +@attr(requires='mesos_slave') +class TestMesosSlave(AgentCheckTest): + CHECK_NAME = 'mesos_slave' + + def test_checks(self): + config = { + 'init_config': {}, + 'instances': [ + { + 'url': 'http://localhost:5050', + 'tasks': ['hello'] + } + ] + } + + klass = get_check_class('mesos_slave') + with patch.object(klass, '_get_state', _mocked_get_state): + with patch.object(klass, '_get_stats', _mocked_get_stats): + check = klass('mesos_slave', {}, {}) + self.run_check(config) + time.sleep(1) + self.run_check(config) + metrics = {} + for d in (check.SLAVE_TASKS_METRICS, check.SYSTEM_METRICS, check.SLAVE_RESOURCE_METRICS, + check.SLAVE_EXECUTORS_METRICS, check.STATS_METRICS): + metrics.update(d) + [self.assertMetric(v[0]) for k, v in check.TASK_METRICS.iteritems()] + [self.assertMetric(v[0]) for k, v in metrics.iteritems()] + self.assertServiceCheck('hello.ok', + count=1, status=AgentCheck.OK + ) From e033b23393f818d05bf2021d7567322be38a0d82 Mon Sep 17 00:00:00 2001 From: Dorian Zaccaria Date: Wed, 6 May 2015 17:13:57 -0400 Subject: [PATCH 2/3] [mesos] Support multiple versions and Updates for the review * Change mesos stats endpoint for versions above 0.22.0 * Simplify syntax * Update tests for @degemer refactor --- checks.d/mesos_master.py | 117 ++++++++---------- checks.d/mesos_slave.py | 106 +++++++--------- requirements.txt | 2 +- .../integration}/test_mesos_master.py | 4 +- .../integration}/test_mesos_slave.py | 6 +- 5 files changed, 107 insertions(+), 128 deletions(-) rename tests/{ => checks/integration}/test_mesos_master.py (98%) rename tests/{ => checks/integration}/test_mesos_slave.py (97%) diff --git a/checks.d/mesos_master.py b/checks.d/mesos_master.py index e6a954834a..07edbdbe68 100644 --- a/checks.d/mesos_master.py +++ b/checks.d/mesos_master.py @@ -6,12 +6,12 @@ from hashlib import md5 import time -# project -from checks import AgentCheck - # 3rd party import requests +# project +from checks import 
AgentCheck, CheckException + class MesosMaster(AgentCheck): GAUGE = AgentCheck.gauge @@ -34,13 +34,14 @@ class MesosMaster(AgentCheck): # These metrics are aggregated only on the elected master CLUSTER_TASKS_METRICS = { - 'staged_tasks' : ('mesos.cluster.staged_tasks', GAUGE), - 'started_tasks' : ('mesos.cluster.started_tasks', GAUGE), - 'finished_tasks' : ('mesos.cluster.finished_tasks', GAUGE), - 'killed_tasks' : ('mesos.cluster.killed_tasks', GAUGE), - 'failed_tasks' : ('mesos.cluster.failed_tasks', GAUGE), - 'lost_tasks' : ('mesos.cluster.lost_tasks', GAUGE), - 'active_tasks_gauge' : ('mesos.cluster.active_tasks_gauge', GAUGE), + 'master/tasks_error' : ('mesos.cluster.tasks_error', GAUGE), + 'master/tasks_failed' : ('mesos.cluster.tasks_failed', GAUGE), + 'master/tasks_finished' : ('mesos.cluster.tasks_finished', GAUGE), + 'master/tasks_killed' : ('mesos.cluster.tasks_killed', GAUGE), + 'master/tasks_lost' : ('mesos.cluster.tasks_lost', GAUGE), + 'master/tasks_running' : ('mesos.cluster.tasks_running', GAUGE), + 'master/tasks_staging' : ('mesos.cluster.tasks_staging', GAUGE), + 'master/tasks_starting' : ('mesos.cluster.tasks_starting', GAUGE), } # These metrics are aggregated only on the elected master @@ -109,9 +110,6 @@ class MesosMaster(AgentCheck): # These metrics are aggregated only on the elected master STATS_METRICS = { - 'active_schedulers' : ('mesos.cluster.active_schedulers', GAUGE), - 'total_schedulers' : ('mesos.cluster.total_schedulers', GAUGE), - 'outstanding_offers' : ('mesos.cluster.outstanding_offers', GAUGE), 'master/dropped_messages' : ('mesos.cluster.dropped_messages', GAUGE), 'master/outstanding_offers' : ('mesos.cluster.outstanding_offers', GAUGE), 'master/event_queue_dispatches' : ('mesos.cluster.event_queue_dispatches', GAUGE), @@ -125,24 +123,6 @@ class MesosMaster(AgentCheck): 'master/valid_status_updates' : ('mesos.cluster.valid_status_updates', GAUGE), } - def _timeout_event(self, url, timeout, aggregation_key): - self.event({ - 
'timestamp': int(time.time()), - 'event_type': 'http_check', - 'msg_title': 'URL timeout', - 'msg_text': '%s timed out after %s seconds.' % (url, timeout), - 'aggregation_key': aggregation_key - }) - - def _status_code_event(self, url, r, aggregation_key): - self.event({ - 'timestamp': int(time.time()), - 'event_type': 'http_check', - 'msg_title': 'Invalid reponse code for %s' % url, - 'msg_text': '%s returned a status of %s' % (url, r.status_code), - 'aggregation_key': aggregation_key - }) - def _get_json(self, url, timeout): # Use a hash of the URL as an aggregation key aggregation_key = md5(url).hexdigest() @@ -152,7 +132,6 @@ def _get_json(self, url, timeout): try: r = requests.get(url, timeout=timeout) if r.status_code != 200: - self._status_code_event(url, r, aggregation_key) status = AgentCheck.CRITICAL msg = "Got %s when hitting %s" % (r.status_code, url) else: @@ -160,11 +139,10 @@ def _get_json(self, url, timeout): msg = "Mesos master instance detected at %s " % url except requests.exceptions.Timeout as e: # If there's a timeout - self._timeout_event(url, timeout, aggregation_key) msg = "%s seconds timeout when hitting %s" % (timeout, url) status = AgentCheck.CRITICAL except Exception as e: - msg = e.message + msg = str(e) status = AgentCheck.CRITICAL finally: if self.SERVICE_CHECK_NEEDED: @@ -172,8 +150,9 @@ def _get_json(self, url, timeout): message=msg) self.SERVICE_CHECK_NEEDED = False if status is AgentCheck.CRITICAL: - self.warning(msg) - return None + self.service_check(self.SERVICE_CHECK_NAME, status, tags=tags, + message=msg) + raise CheckException("Cannot connect to mesos, please check your configuration.") return r.json() @@ -181,19 +160,25 @@ def _get_master_state(self, url, timeout): return self._get_json(url + '/state.json', timeout) def _get_master_stats(self, url, timeout): - return self._get_json(url + '/stats.json', timeout) + if self.version >= [0, 22, 0]: + endpoint = '/metrics/snapshot' + else: + endpoint = '/stats.json' + return 
self._get_json(url + endpoint, timeout) def _get_master_roles(self, url, timeout): return self._get_json(url + '/roles.json', timeout) def _check_leadership(self, url, timeout): - json = self._get_master_state(url, timeout) + state_metrics = self._get_master_state(url, timeout) - if json is not None and json['leader'] == json['pid']: - self.leader = True + if state_metrics is not None: + self.version = map(int, state_metrics['version'].split('.')) + if state_metrics['leader'] == state_metrics['pid']: + self.leader = True else: self.leader = False - return json + return state_metrics def check(self, instance): if 'url' not in instance: @@ -204,38 +189,44 @@ def check(self, instance): default_timeout = self.init_config.get('default_timeout', 5) timeout = float(instance.get('timeout', default_timeout)) - json = self._check_leadership(url, timeout) - if json: - tags = ['mesos_cluster:' + json['cluster'], 'mesos_pid:' + json['pid'], 'mesos_id:' + json['id'], 'mesos_node:master'] + instance_tags + state_metrics = self._check_leadership(url, timeout) + if state_metrics: + tags = [ + 'mesos_cluster:{0}'.format(state_metrics['cluster']), + 'mesos_pid:{0}'.format(state_metrics['pid']), + 'mesos_node:master' + ] + tags += instance_tags if self.leader: - self.GAUGE('mesos.cluster.total_frameworks', len(json['frameworks']), tags=tags) + self.GAUGE('mesos.cluster.total_frameworks', len(state_metrics['frameworks']), tags=tags) - for framework in json['frameworks']: - framework_tags = ['framework:' + framework['id'], 'framework_name:' + framework['name']] + tags + for framework in state_metrics['frameworks']: + framework_tags = ['framework_name:' + framework['name']] + tags self.GAUGE('mesos.framework.total_tasks', len(framework['tasks']), tags=framework_tags) resources = framework['used_resources'] - [v[1](self, v[0], resources[k], tags=framework_tags) for k, v in self.FRAMEWORK_METRICS.iteritems()] + for key_name, (metric_name, metric_func) in self.FRAMEWORK_METRICS.iteritems(): 
+ metric_func(self, metric_name, resources[key_name], tags=framework_tags) - json = self._get_master_roles(url, timeout) - if json is not None: - for role in json['roles']: + role_metrics = self._get_master_roles(url, timeout) + if role_metrics is not None: + for role in role_metrics['roles']: role_tags = ['mesos_role:' + role['name']] + tags - self.GAUGE('mesos.role.frameworks', len(role['frameworks']), tags=role_tags) + self.GAUGE('mesos.role.frameworks.count', len(role['frameworks']), tags=role_tags) self.GAUGE('mesos.role.weight', role['weight'], tags=role_tags) - [v[1](self, v[0], role['resources'][k], tags=role_tags) for k, v in self.ROLE_RESOURCES_METRICS.iteritems()] + for key_name, (metric_name, metric_func) in self.ROLE_RESOURCES_METRICS.iteritems(): + metric_func(self, metric_name, role['resources'][key_name], tags=role_tags) - json = self._get_master_stats(url, timeout) - if json is not None: + stats_metrics = self._get_master_stats(url, timeout) + if stats_metrics is not None: + metrics = [self.SYSTEM_METRICS] if self.leader: - metrics = {} - for d in (self.CLUSTER_TASKS_METRICS, self.CLUSTER_SLAVES_METRICS, - self.CLUSTER_RESOURCES_METRICS, self.CLUSTER_REGISTRAR_METRICS, - self.CLUSTER_FRAMEWORK_METRICS, self.SYSTEM_METRICS, self.STATS_METRICS): - metrics.update(d) - else: - metrics = self.SYSTEM_METRICS - [v[1](self, v[0], json[k], tags=tags) for k, v in metrics.iteritems()] + metrics += [self.CLUSTER_TASKS_METRICS, self.CLUSTER_SLAVES_METRICS, + self.CLUSTER_RESOURCES_METRICS, self.CLUSTER_REGISTRAR_METRICS, + self.CLUSTER_FRAMEWORK_METRICS, self.STATS_METRICS] + for m in metrics: + for key_name, (metric_name, metric_func) in m.iteritems(): + metric_func(self, metric_name, stats_metrics[key_name], tags=tags) self.SERVICE_CHECK_NEEDED = True diff --git a/checks.d/mesos_slave.py b/checks.d/mesos_slave.py index b4e83e24e5..3115790e58 100644 --- a/checks.d/mesos_slave.py +++ b/checks.d/mesos_slave.py @@ -6,12 +6,12 @@ from hashlib import md5 import 
time -# project -from checks import AgentCheck - # 3rd party import requests +# project +from checks import AgentCheck, CheckException + class MesosSlave(AgentCheck): GAUGE = AgentCheck.gauge @@ -37,14 +37,13 @@ class MesosSlave(AgentCheck): } SLAVE_TASKS_METRICS = { - 'failed_tasks' : ('mesos.slave.failed_tasks', GAUGE), - 'finished_tasks' : ('mesos.slave.finished_tasks', GAUGE), - 'killed_tasks' : ('mesos.slave.killed_tasks', GAUGE), - 'lost_tasks' : ('mesos.slave.lost_tasks', GAUGE), - 'staged_tasks' : ('mesos.slave.staged_tasks', GAUGE), - 'started_tasks' : ('mesos.slave.started_tasks', GAUGE), - 'launched_tasks_gauge' : ('mesos.slave.launched_tasks_gauge', GAUGE), - 'queued_tasks_gauge' : ('mesos.slave.queued_tasks_gauge', GAUGE), + 'slave/tasks_failed' : ('mesos.slave.tasks_failed', GAUGE), + 'slave/tasks_finished' : ('mesos.slave.tasks_finished', GAUGE), + 'slave/tasks_killed' : ('mesos.slave.tasks_killed', GAUGE), + 'slave/tasks_lost' : ('mesos.slave.tasks_lost', GAUGE), + 'slave/tasks_running' : ('mesos.slave.tasks_running', GAUGE), + 'slave/tasks_staging' : ('mesos.slave.tasks_staging', GAUGE), + 'slave/tasks_starting' : ('mesos.slave.tasks_starting', GAUGE), } SYSTEM_METRICS = { @@ -78,7 +77,6 @@ class MesosSlave(AgentCheck): } STATS_METRICS = { - 'total_frameworks' : ('mesos.slave.total_frameworks', GAUGE), 'slave/frameworks_active' : ('mesos.slave.frameworks_active', GAUGE), 'slave/invalid_framework_messages' : ('mesos.slave.invalid_framework_messages', GAUGE), 'slave/invalid_status_updates' : ('mesos.slave.invalid_status_updates', GAUGE), @@ -87,25 +85,9 @@ class MesosSlave(AgentCheck): 'slave/valid_status_updates' : ('mesos.slave.valid_status_updates', GAUGE), } - cluster_name = None - - def _timeout_event(self, url, timeout, aggregation_key): - self.event({ - 'timestamp': int(time.time()), - 'event_type': 'http_check', - 'msg_title': 'URL timeout', - 'msg_text': '%s timed out after %s seconds.' 
% (url, timeout), - 'aggregation_key': aggregation_key - }) - - def _status_code_event(self, url, r, aggregation_key): - self.event({ - 'timestamp': int(time.time()), - 'event_type': 'http_check', - 'msg_title': 'Invalid reponse code for %s' % url, - 'msg_text': '%s returned a status of %s' % (url, r.status_code), - 'aggregation_key': aggregation_key - }) + def __init__(self, name, init_config, agentConfig, instances=None): + AgentCheck.__init__(self, name, init_config, agentConfig, instances) + self.cluster_name = None def _get_json(self, url, timeout): # Use a hash of the URL as an aggregation key @@ -116,7 +98,6 @@ def _get_json(self, url, timeout): try: r = requests.get(url, timeout=timeout) if r.status_code != 200: - self._status_code_event(url, r, aggregation_key) status = AgentCheck.CRITICAL msg = "Got %s when hitting %s" % (r.status_code, url) else: @@ -124,19 +105,17 @@ def _get_json(self, url, timeout): msg = "Mesos master instance detected at %s " % url except requests.exceptions.Timeout as e: # If there's a timeout - self._timeout_event(url, timeout, aggregation_key) msg = "%s seconds timeout when hitting %s" % (timeout, url) status = AgentCheck.CRITICAL except Exception as e: - msg = e.message + msg = str(e) status = AgentCheck.CRITICAL finally: if self.SERVICE_CHECK_NEEDED: self.service_check(self.SERVICE_CHECK_NAME, status, tags=tags, message=msg) self.SERVICE_CHECK_NEEDED = False if status is AgentCheck.CRITICAL: - self.warning(msg) - return None + raise CheckException("Cannot connect to mesos, please check your configuration.") return r.json() @@ -144,18 +123,23 @@ def _get_state(self, url, timeout): return self._get_json(url + '/state.json', timeout) def _get_stats(self, url, timeout): - return self._get_json(url + '/stats.json', timeout) + if self.version >= [0, 22, 0]: + endpoint = '/metrics/snapshot' + else: + endpoint = '/stats.json' + return self._get_json(url + endpoint, timeout) def _get_constant_attributes(self, url, timeout): - json = 
None + state_metrics = None if self.cluster_name is None: - json = self._get_state(url, timeout) - if json is not None: - master_state = self._get_state('http://' + json['master_hostname'] + ':5050', timeout) + state_metrics = self._get_state(url, timeout) + if state_metrics is not None: + self.version = map(int, state_metrics['version'].split('.')) + master_state = self._get_state('http://' + state_metrics['master_hostname'] + ':5050', timeout) if master_state is not None: self.cluster_name = master_state['cluster'] - return json + return state_metrics def check(self, instance): if 'url' not in instance: @@ -167,30 +151,36 @@ def check(self, instance): default_timeout = self.init_config.get('default_timeout', 5) timeout = float(instance.get('timeout', default_timeout)) - json = self._get_constant_attributes(url, timeout) + state_metrics = self._get_constant_attributes(url, timeout) tags = None - if json is None: - json = self._get_state(url, timeout) - if json: - tags = ['mesos_cluster:' + self.cluster_name, 'mesos_id:' + json['id'], 'mesos_pid:' + json['pid'], 'mesos_node:slave'] + instance_tags + if state_metrics is None: + state_metrics = self._get_state(url, timeout) + if state_metrics: + tags = [ + 'mesos_cluster:{0}'.format(self.cluster_name), + 'mesos_pid:{0}'.format(state_metrics['pid']), + 'mesos_node:slave' + ] + tags += instance_tags for task in tasks: - for framework in json['frameworks']: + for framework in state_metrics['frameworks']: for executor in framework['executors']: for t in executor['tasks']: - if task.lower() in t['name'].lower() and t['slave_id'] == json['id']: - task_tags = ['framework_id:' + t['framework_id'], 'executor_id:' + t['executor_id'], 'task_name:' + t['name']] + tags + if task.lower() in t['name'].lower() and t['slave_id'] == state_metrics['id']: + task_tags = ['task_name:' + t['name']] + tags self.service_check(t['name'] + '.ok', self.TASK_STATUS[t['state']], tags=task_tags) - [v[1](self, v[0], t['resources'][k], 
tags=task_tags) for k, v in self.TASK_METRICS.iteritems()] + for key_name, (metric_name, metric_func) in self.TASK_METRICS.iteritems(): + metric_func(self, metric_name, t['resources'][key_name], tags=task_tags) - json = self._get_stats(url, timeout) - if json: + stats_metrics = self._get_stats(url, timeout) + if stats_metrics: tags = tags if tags else instance_tags - metrics = {} - for d in (self.SLAVE_TASKS_METRICS, self.SYSTEM_METRICS, self.SLAVE_RESOURCE_METRICS, - self.SLAVE_EXECUTORS_METRICS, self.STATS_METRICS): - metrics.update(d) - [v[1](self, v[0], json[k], tags=tags) for k, v in metrics.iteritems()] + metrics = [self.SLAVE_TASKS_METRICS, self.SYSTEM_METRICS, self.SLAVE_RESOURCE_METRICS, + self.SLAVE_EXECUTORS_METRICS, self.STATS_METRICS] + for m in metrics: + for key_name, (metric_name, metric_func) in m.iteritems(): + metric_func(self, metric_name, stats_metrics[key_name], tags=tags) self.SERVICE_CHECK_NEEDED = True diff --git a/requirements.txt b/requirements.txt index 88afeff46c..af5775f485 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ ########################################################### # These modules are the deps needed by the -# agent core, meaning every module that is +# agent core, meaning every module that is # not a check # They're installed in the CI and when doing # a source install diff --git a/tests/test_mesos_master.py b/tests/checks/integration/test_mesos_master.py similarity index 98% rename from tests/test_mesos_master.py rename to tests/checks/integration/test_mesos_master.py index 177556b95c..89a38259ec 100644 --- a/tests/test_mesos_master.py +++ b/tests/checks/integration/test_mesos_master.py @@ -1,4 +1,4 @@ -from tests.common import AgentCheckTest, get_check_class +from tests.checks.common import AgentCheckTest, get_check_class from nose.plugins.attrib import attr from mock import patch @@ -305,5 +305,5 @@ def test_checks(self): [self.assertMetric(v[0]) for k, v in 
check.ROLE_RESOURCES_METRICS.iteritems()] self.assertMetric('mesos.cluster.total_frameworks') self.assertMetric('mesos.framework.total_tasks') - self.assertMetric('mesos.role.frameworks') + self.assertMetric('mesos.role.frameworks.count') self.assertMetric('mesos.role.weight') diff --git a/tests/test_mesos_slave.py b/tests/checks/integration/test_mesos_slave.py similarity index 97% rename from tests/test_mesos_slave.py rename to tests/checks/integration/test_mesos_slave.py index c74c6174d3..f548a0fa7f 100644 --- a/tests/test_mesos_slave.py +++ b/tests/checks/integration/test_mesos_slave.py @@ -1,4 +1,4 @@ -from tests.common import AgentCheckTest, get_check_class +from tests.checks.common import AgentCheckTest, get_check_class from nose.plugins.attrib import attr from mock import patch @@ -213,6 +213,4 @@ def test_checks(self): metrics.update(d) [self.assertMetric(v[0]) for k, v in check.TASK_METRICS.iteritems()] [self.assertMetric(v[0]) for k, v in metrics.iteritems()] - self.assertServiceCheck('hello.ok', - count=1, status=AgentCheck.OK - ) + self.assertServiceCheck('hello.ok', count=1, status=AgentCheck.OK) From d036028f3af96bc2dc9145c0b26455dd44613ee8 Mon Sep 17 00:00:00 2001 From: Dorian Zaccaria Date: Thu, 28 May 2015 16:27:43 -0400 Subject: [PATCH 3/3] [mesos] Change some metrics type and update test suite --- checks.d/mesos_master.py | 24 +- checks.d/mesos_slave.py | 24 +- tests/checks/fixtures/mesos_master/roles.json | 17 + tests/checks/fixtures/mesos_master/state.json | 132 ++++++++ tests/checks/fixtures/mesos_master/stats.json | 108 ++++++ .../mesos_slave/state.json} | 95 +----- tests/checks/fixtures/mesos_slave/stats.json | 50 +++ tests/checks/integration/test_mesos_master.py | 309 ------------------ tests/checks/mock/test_mesos_master.py | 51 +++ tests/checks/mock/test_mesos_slave.py | 43 +++ 10 files changed, 426 insertions(+), 427 deletions(-) create mode 100644 tests/checks/fixtures/mesos_master/roles.json create mode 100644 
tests/checks/fixtures/mesos_master/state.json create mode 100644 tests/checks/fixtures/mesos_master/stats.json rename tests/checks/{integration/test_mesos_slave.py => fixtures/mesos_slave/state.json} (57%) create mode 100644 tests/checks/fixtures/mesos_slave/stats.json delete mode 100644 tests/checks/integration/test_mesos_master.py create mode 100644 tests/checks/mock/test_mesos_master.py create mode 100644 tests/checks/mock/test_mesos_slave.py diff --git a/checks.d/mesos_master.py b/checks.d/mesos_master.py index 07edbdbe68..d552817b4d 100644 --- a/checks.d/mesos_master.py +++ b/checks.d/mesos_master.py @@ -15,9 +15,9 @@ class MesosMaster(AgentCheck): GAUGE = AgentCheck.gauge - RATE = AgentCheck.rate + MONOTONIC_COUNT = AgentCheck.monotonic_count SERVICE_CHECK_NAME = "mesos_master.can_connect" - SERVICE_CHECK_NEEDED = True + service_check_needed = True FRAMEWORK_METRICS = { @@ -35,10 +35,10 @@ class MesosMaster(AgentCheck): # These metrics are aggregated only on the elected master CLUSTER_TASKS_METRICS = { 'master/tasks_error' : ('mesos.cluster.tasks_error', GAUGE), - 'master/tasks_failed' : ('mesos.cluster.tasks_failed', GAUGE), - 'master/tasks_finished' : ('mesos.cluster.tasks_finished', GAUGE), - 'master/tasks_killed' : ('mesos.cluster.tasks_killed', GAUGE), - 'master/tasks_lost' : ('mesos.cluster.tasks_lost', GAUGE), + 'master/tasks_failed' : ('mesos.cluster.tasks_failed', MONOTONIC_COUNT), + 'master/tasks_finished' : ('mesos.cluster.tasks_finished', MONOTONIC_COUNT), + 'master/tasks_killed' : ('mesos.cluster.tasks_killed', MONOTONIC_COUNT), + 'master/tasks_lost' : ('mesos.cluster.tasks_lost', MONOTONIC_COUNT), 'master/tasks_running' : ('mesos.cluster.tasks_running', GAUGE), 'master/tasks_staging' : ('mesos.cluster.tasks_staging', GAUGE), 'master/tasks_starting' : ('mesos.cluster.tasks_starting', GAUGE), @@ -99,9 +99,9 @@ class MesosMaster(AgentCheck): # These metrics are aggregated on all nodes in the cluster SYSTEM_METRICS = { 'system/cpus_total' : 
('mesos.stats.system.cpus_total', GAUGE), - 'system/load_15min' : ('mesos.stats.system.load_15min', RATE), - 'system/load_1min' : ('mesos.stats.system.load_1min', RATE), - 'system/load_5min' : ('mesos.stats.system.load_5min', RATE), + 'system/load_15min' : ('mesos.stats.system.load_15min', GAUGE), + 'system/load_1min' : ('mesos.stats.system.load_1min', GAUGE), + 'system/load_5min' : ('mesos.stats.system.load_5min', GAUGE), 'system/mem_free_bytes' : ('mesos.stats.system.mem_free_bytes', GAUGE), 'system/mem_total_bytes' : ('mesos.stats.system.mem_total_bytes', GAUGE), 'master/elected' : ('mesos.stats.elected', GAUGE), @@ -145,10 +145,10 @@ def _get_json(self, url, timeout): msg = str(e) status = AgentCheck.CRITICAL finally: - if self.SERVICE_CHECK_NEEDED: + if self.service_check_needed: self.service_check(self.SERVICE_CHECK_NAME, status, tags=tags, message=msg) - self.SERVICE_CHECK_NEEDED = False + self.service_check_needed = False if status is AgentCheck.CRITICAL: self.service_check(self.SERVICE_CHECK_NAME, status, tags=tags, message=msg) @@ -229,4 +229,4 @@ def check(self, instance): metric_func(self, metric_name, stats_metrics[key_name], tags=tags) - self.SERVICE_CHECK_NEEDED = True + self.service_check_needed = True diff --git a/checks.d/mesos_slave.py b/checks.d/mesos_slave.py index 3115790e58..78b25507fd 100644 --- a/checks.d/mesos_slave.py +++ b/checks.d/mesos_slave.py @@ -15,9 +15,9 @@ class MesosSlave(AgentCheck): GAUGE = AgentCheck.gauge - RATE = AgentCheck.rate + MONOTONIC_COUNT = AgentCheck.monotonic_count SERVICE_CHECK_NAME = "mesos_slave.can_connect" - SERVICE_CHECK_NEEDED = True + service_check_needed = True TASK_STATUS = { 'TASK_STARTING' : AgentCheck.OK, @@ -37,10 +37,10 @@ class MesosSlave(AgentCheck): } SLAVE_TASKS_METRICS = { - 'slave/tasks_failed' : ('mesos.slave.tasks_failed', GAUGE), - 'slave/tasks_finished' : ('mesos.slave.tasks_finished', GAUGE), - 'slave/tasks_killed' : ('mesos.slave.tasks_killed', GAUGE), - 'slave/tasks_lost' : 
('mesos.slave.tasks_lost', GAUGE), + 'slave/tasks_failed' : ('mesos.slave.tasks_failed', MONOTONIC_COUNT), + 'slave/tasks_finished' : ('mesos.slave.tasks_finished', MONOTONIC_COUNT), + 'slave/tasks_killed' : ('mesos.slave.tasks_killed', MONOTONIC_COUNT), + 'slave/tasks_lost' : ('mesos.slave.tasks_lost', MONOTONIC_COUNT), 'slave/tasks_running' : ('mesos.slave.tasks_running', GAUGE), 'slave/tasks_staging' : ('mesos.slave.tasks_staging', GAUGE), 'slave/tasks_starting' : ('mesos.slave.tasks_starting', GAUGE), @@ -48,9 +48,9 @@ class MesosSlave(AgentCheck): SYSTEM_METRICS = { 'system/cpus_total' : ('mesos.stats.system.cpus_total', GAUGE), - 'system/load_15min' : ('mesos.stats.system.load_15min', RATE), - 'system/load_1min' : ('mesos.stats.system.load_1min', RATE), - 'system/load_5min' : ('mesos.stats.system.load_5min', RATE), + 'system/load_15min' : ('mesos.stats.system.load_15min', GAUGE), + 'system/load_1min' : ('mesos.stats.system.load_1min', GAUGE), + 'system/load_5min' : ('mesos.stats.system.load_5min', GAUGE), 'system/mem_free_bytes' : ('mesos.stats.system.mem_free_bytes', GAUGE), 'system/mem_total_bytes' : ('mesos.stats.system.mem_total_bytes', GAUGE), 'slave/registered' : ('mesos.stats.registered', GAUGE), @@ -111,9 +111,9 @@ def _get_json(self, url, timeout): msg = str(e) status = AgentCheck.CRITICAL finally: - if self.SERVICE_CHECK_NEEDED: + if self.service_check_needed: self.service_check(self.SERVICE_CHECK_NAME, status, tags=tags, message=msg) - self.SERVICE_CHECK_NEEDED = False + self.service_check_needed = False if status is AgentCheck.CRITICAL: raise CheckException("Cannot connect to mesos, please check your configuration.") @@ -183,4 +183,4 @@ def check(self, instance): for key_name, (metric_name, metric_func) in m.iteritems(): metric_func(self, metric_name, stats_metrics[key_name], tags=tags) - self.SERVICE_CHECK_NEEDED = True + self.service_check_needed = True diff --git a/tests/checks/fixtures/mesos_master/roles.json 
b/tests/checks/fixtures/mesos_master/roles.json new file mode 100644 index 0000000000..e54ea83873 --- /dev/null +++ b/tests/checks/fixtures/mesos_master/roles.json @@ -0,0 +1,17 @@ +{ + "roles": [ + { + "weight": 1, + "resources": { + "ports": "[31915-31915]", + "mem": 100, + "disk": 0, + "cpus": 1 + }, + "name": "*", + "frameworks": [ + "20150403-140128-251789322-5050-6047-0000" + ] + } + ] +} diff --git a/tests/checks/fixtures/mesos_master/state.json b/tests/checks/fixtures/mesos_master/state.json new file mode 100644 index 0000000000..fbb58294f6 --- /dev/null +++ b/tests/checks/fixtures/mesos_master/state.json @@ -0,0 +1,132 @@ +{ + "version": "0.22.0", + "unregistered_frameworks": [], + "started_tasks": 0, + "start_time": 1428951954.34111, + "staged_tasks": 0, + "slaves": [ + { + "resources": { + "ports": "[31000-32000]", + "mem": 244, + "disk": 35164, + "cpus": 1 + }, + "reregistered_time": 1428951983.53731, + "registered_time": 1428951983.53725, + "pid": "slave(1)@127.0.0.1:5051", + "id": "20150410-134224-16777343-5050-1778-S0", + "hostname": "localhost", + "attributes": {}, + "active": 'true' + } + ], + "pid": "master@127.0.0.1:5050", + "orphan_tasks": [], + "lost_tasks": 0, + "log_dir": "/var/log/mesos", + "leader": "master@127.0.0.1:5050", + "killed_tasks": 0, + "elected_time": 1428951954.3774, + "deactivated_slaves": 0, + "completed_frameworks": [], + "cluster": "datadog-test", + "build_user": "root", + "build_time": 1427376927, + "build_date": "2015-03-26 13:35:27", + "activated_slaves": 1, + "failed_tasks": 0, + "finished_tasks": 0, + "flags": { + "zk_session_timeout": "10secs", + "zk": "zk://localhost:2181/mesos", + "work_dir": "/var/lib/mesos", + "webui_dir": "/usr/share/mesos/webui", + "version": "false", + "user_sorter": "drf", + "slave_reregister_timeout": "10mins", + "root_submissions": "true", + "registry_strict": "false", + "registry_store_timeout": "5secs", + "registry_fetch_timeout": "1mins", + "registry": "replicated_log", + 
"initialize_driver_logging": "true", + "help": "false", + "framework_sorter": "drf", + "cluster": "datadog-test", + "authenticators": "crammd5", + "authenticate_slaves": "false", + "authenticate": "false", + "allocation_interval": "1secs", + "log_auto_initialize": "true", + "log_dir": "/var/log/mesos", + "logbufsecs": "0", + "logging_level": "INFO", + "port": "5050", + "quiet": "false", + "quorum": "1", + "recovery_slave_removal_limit": "100%" + }, + "frameworks": [ + { + "webui_url": "http://192.168.33.20:8080", + "user": "root", + "offered_resources": { + "mem": 0, + "disk": 0, + "cpus": 0 + }, + "name": "marathon", + "id": "20150403-140128-251789322-5050-6047-0000", + "hostname": "vagrant-ubuntu-trusty-64", + "failover_timeout": 604800, + "completed_tasks": [], + "checkpoint": 'true', + "active": 'true', + "offers": [], + "registered_time": 1428951955.38871, + "reregistered_time": 1428951955.38872, + "resources": { + "ports": "[31915-31915]", + "mem": 100, + "disk": 0, + "cpus": 1 + }, + "role": "*", + "tasks": [ + { + "statuses": [ + { + "timestamp": 1428673971.61592, + "state": "TASK_RUNNING" + } + ], + "executor_id": "", + "framework_id": "20150403-140128-251789322-5050-6047-0000", + "id": "hello.dc130e23-df88-11e4-b9ec-080027fc1312", + "labels": [], + "name": "hello", + "resources": { + "ports": "[31915-31915]", + "mem": 100, + "disk": 0, + "cpus": 1 + }, + "slave_id": "20150410-134224-16777343-5050-1778-S0", + "state": "TASK_RUNNING" + } + ], + "unregistered_time": 0, + "used_resources": { + "ports": "[31915-31915]", + "mem": 100, + "disk": 0, + "cpus": 1 + } + } + ], + "git_sha": "e890e2414903bb69cab730d5204f10b10d2e91bb", + "git_tag": "0.22.0", + "hostname": "localhost", + "id": "20150413-190554-16777343-5050-16324" +} diff --git a/tests/checks/fixtures/mesos_master/stats.json b/tests/checks/fixtures/mesos_master/stats.json new file mode 100644 index 0000000000..7e449bcc72 --- /dev/null +++ b/tests/checks/fixtures/mesos_master/stats.json @@ -0,0 +1,108 @@ 
+{ + "valid_status_updates": 0, + "uptime": 706.524240128, + "total_schedulers": 1, + "system/mem_total_bytes": 513798144, + "system/mem_free_bytes": 13815808, + "system/load_5min": 0.02, + "system/load_1min": 0, + "system/load_15min": 0.07, + "system/cpus_total": 1, + "started_tasks": 0, + "staged_tasks": 0, + "registrar/state_store_ms/p9999": 9.90120192, + "registrar/state_store_ms/p999": 9.8956032, + "registrar/state_store_ms/p99": 9.839616, + "registrar/state_store_ms/p95": 9.590784, + "registrar/state_store_ms/p90": 9.279744, + "registrar/state_store_ms/p50": 6.791424, + "registrar/state_store_ms/min": 3.681024, + "registrar/state_store_ms/max": 9.901824, + "registrar/state_store_ms/count": 2, + "registrar/state_store_ms": 9.901824, + "registrar/state_fetch_ms": 3.717888, + "registrar/registry_size_bytes": 246, + "registrar/queued_operations": 0, + "outstanding_offers": 0, + "mem_used": 100, + "mem_total": 244, + "mem_percent": 0.409836065573771, + "master/valid_status_updates": 0, + "master/valid_status_update_acknowledgements": 0, + "master/valid_framework_to_executor_messages": 0, + "master/uptime_secs": 706.52485632, + "master/tasks_starting": 0, + "master/tasks_staging": 0, + "master/tasks_running": 1, + "master/tasks_lost": 0, + "master/tasks_killed": 0, + "master/tasks_finished": 0, + "master/tasks_failed": 0, + "master/tasks_error": 0, + "master/slaves_inactive": 0, + "master/slaves_disconnected": 0, + "master/invalid_framework_to_executor_messages": 0, + "master/frameworks_inactive": 0, + "master/frameworks_disconnected": 0, + "master/frameworks_connected": 1, + "master/frameworks_active": 1, + "master/event_queue_messages": 0, + "master/event_queue_http_requests": 0, + "master/event_queue_dispatches": 17, + "master/elected": 1, + "master/dropped_messages": 1, + "master/disk_used": 0, + "master/disk_total": 35164, + "master/disk_percent": 0, + "master/cpus_used": 1, + "master/cpus_total": 1, + "master/cpus_percent": 1, + "disk_percent": 0, + 
"deactivated_slaves": 0, + "cpus_used": 1, + "cpus_total": 1, + "cpus_percent": 1, + "active_tasks_gauge": 1, + "active_schedulers": 1, + "activated_slaves": 1, + "disk_total": 35164, + "disk_used": 0, + "elected": 1, + "failed_tasks": 0, + "finished_tasks": 0, + "invalid_status_updates": 0, + "killed_tasks": 0, + "lost_tasks": 0, + "master/invalid_status_update_acknowledgements": 0, + "master/invalid_status_updates": 0, + "master/mem_percent": 0.409836065573771, + "master/mem_total": 244, + "master/mem_used": 100, + "master/messages_authenticate": 0, + "master/messages_deactivate_framework": 0, + "master/messages_decline_offers": 123, + "master/messages_exited_executor": 0, + "master/messages_framework_to_executor": 0, + "master/messages_kill_task": 0, + "master/messages_launch_tasks": 0, + "master/messages_reconcile_tasks": 6, + "master/messages_register_framework": 0, + "master/messages_register_slave": 0, + "master/messages_reregister_framework": 1, + "master/messages_reregister_slave": 2, + "master/messages_resource_request": 0, + "master/messages_revive_offers": 0, + "master/messages_status_update": 0, + "master/messages_status_update_acknowledgement": 0, + "master/messages_unregister_framework": 0, + "master/messages_unregister_slave": 0, + "master/outstanding_offers": 0, + "master/recovery_slave_removals": 0, + "master/slave_registrations": 0, + "master/slave_removals": 0, + "master/slave_reregistrations": 1, + "master/slave_shutdowns_canceled": 0, + "master/slave_shutdowns_scheduled": 0, + "master/slaves_active": 1, + "master/slaves_connected": 1 +} diff --git a/tests/checks/integration/test_mesos_slave.py b/tests/checks/fixtures/mesos_slave/state.json similarity index 57% rename from tests/checks/integration/test_mesos_slave.py rename to tests/checks/fixtures/mesos_slave/state.json index f548a0fa7f..4ea97fe8b4 100644 --- a/tests/checks/integration/test_mesos_slave.py +++ b/tests/checks/fixtures/mesos_slave/state.json @@ -1,11 +1,4 @@ -from 
tests.checks.common import AgentCheckTest, get_check_class - -from nose.plugins.attrib import attr -from mock import patch -from checks import AgentCheck -import time - -state = { +{ "version": "0.22.0", "started_tasks": 0, "start_time": 1428673344.06054, @@ -128,89 +121,3 @@ "log_dir": "/var/log/mesos", "lost_tasks": 0 } - -stats = { - "valid_status_updates": 1, - "uptime": 280965.77977984, - "total_frameworks": 1, - "system/mem_total_bytes": 513798144, - "system/mem_free_bytes": 34271232, - "system/load_5min": 0.08, - "system/load_1min": 0.1, - "system/load_15min": 0.06, - "system/cpus_total": 1, - "started_tasks": 0, - "staged_tasks": 1, - "slave/valid_status_updates": 1, - "slave/valid_framework_messages": 0, - "slave/uptime_secs": 280965.78028288, - "slave/tasks_starting": 0, - "slave/tasks_staging": 0, - "slave/executors_registering": 0, - "slave/disk_used": 0, - "slave/disk_total": 35164, - "slave/disk_percent": 0, - "slave/cpus_used": 1.1, - "slave/cpus_total": 1, - "slave/cpus_percent": 1.1, - "registered": 1, - "failed_tasks": 0, - "finished_tasks": 0, - "invalid_status_updates": 0, - "killed_tasks": 0, - "launched_tasks_gauge": 1, - "lost_tasks": 0, - "queued_tasks_gauge": 0, - "recovery_errors": 0, - "slave/executors_running": 1, - "slave/executors_terminated": 0, - "slave/executors_terminating": 0, - "slave/frameworks_active": 1, - "slave/invalid_framework_messages": 0, - "slave/invalid_status_updates": 0, - "slave/mem_percent": 0.540983606557377, - "slave/mem_total": 244, - "slave/mem_used": 132, - "slave/recovery_errors": 0, - "slave/registered": 1, - "slave/tasks_failed": 0, - "slave/tasks_finished": 0, - "slave/tasks_killed": 0, - "slave/tasks_lost": 0, - "slave/tasks_running": 1 -} - -def _mocked_get_state(*args, **kwargs): - return state -def _mocked_get_stats(*args, **kwargs): - return stats - -@attr(requires='mesos_slave') -class TestMesosSlave(AgentCheckTest): - CHECK_NAME = 'mesos_slave' - - def test_checks(self): - config = { - 
'init_config': {}, - 'instances': [ - { - 'url': 'http://localhost:5050', - 'tasks': ['hello'] - } - ] - } - - klass = get_check_class('mesos_slave') - with patch.object(klass, '_get_state', _mocked_get_state): - with patch.object(klass, '_get_stats', _mocked_get_stats): - check = klass('mesos_slave', {}, {}) - self.run_check(config) - time.sleep(1) - self.run_check(config) - metrics = {} - for d in (check.SLAVE_TASKS_METRICS, check.SYSTEM_METRICS, check.SLAVE_RESOURCE_METRICS, - check.SLAVE_EXECUTORS_METRICS, check.STATS_METRICS): - metrics.update(d) - [self.assertMetric(v[0]) for k, v in check.TASK_METRICS.iteritems()] - [self.assertMetric(v[0]) for k, v in metrics.iteritems()] - self.assertServiceCheck('hello.ok', count=1, status=AgentCheck.OK) diff --git a/tests/checks/fixtures/mesos_slave/stats.json b/tests/checks/fixtures/mesos_slave/stats.json new file mode 100644 index 0000000000..62fa0c5564 --- /dev/null +++ b/tests/checks/fixtures/mesos_slave/stats.json @@ -0,0 +1,50 @@ +{ + "valid_status_updates": 1, + "uptime": 280965.77977984, + "total_frameworks": 1, + "system/mem_total_bytes": 513798144, + "system/mem_free_bytes": 34271232, + "system/load_5min": 0.08, + "system/load_1min": 0.1, + "system/load_15min": 0.06, + "system/cpus_total": 1, + "started_tasks": 0, + "staged_tasks": 1, + "slave/valid_status_updates": 1, + "slave/valid_framework_messages": 0, + "slave/uptime_secs": 280965.78028288, + "slave/tasks_starting": 0, + "slave/tasks_staging": 0, + "slave/executors_registering": 0, + "slave/disk_used": 0, + "slave/disk_total": 35164, + "slave/disk_percent": 0, + "slave/cpus_used": 1.1, + "slave/cpus_total": 1, + "slave/cpus_percent": 1.1, + "registered": 1, + "failed_tasks": 0, + "finished_tasks": 0, + "invalid_status_updates": 0, + "killed_tasks": 0, + "launched_tasks_gauge": 1, + "lost_tasks": 0, + "queued_tasks_gauge": 0, + "recovery_errors": 0, + "slave/executors_running": 1, + "slave/executors_terminated": 0, + "slave/executors_terminating": 0, + 
"slave/frameworks_active": 1, + "slave/invalid_framework_messages": 0, + "slave/invalid_status_updates": 0, + "slave/mem_percent": 0.540983606557377, + "slave/mem_total": 244, + "slave/mem_used": 132, + "slave/recovery_errors": 0, + "slave/registered": 1, + "slave/tasks_failed": 0, + "slave/tasks_finished": 0, + "slave/tasks_killed": 0, + "slave/tasks_lost": 0, + "slave/tasks_running": 1 +} diff --git a/tests/checks/integration/test_mesos_master.py b/tests/checks/integration/test_mesos_master.py deleted file mode 100644 index 89a38259ec..0000000000 --- a/tests/checks/integration/test_mesos_master.py +++ /dev/null @@ -1,309 +0,0 @@ -from tests.checks.common import AgentCheckTest, get_check_class - -from nose.plugins.attrib import attr -from mock import patch -from checks import AgentCheck -import time - -state = { - "version": "0.22.0", - "unregistered_frameworks": [], - "started_tasks": 0, - "start_time": 1428951954.34111, - "staged_tasks": 0, - "slaves": [ - { - "resources": { - "ports": "[31000-32000]", - "mem": 244, - "disk": 35164, - "cpus": 1 - }, - "reregistered_time": 1428951983.53731, - "registered_time": 1428951983.53725, - "pid": "slave(1)@127.0.0.1:5051", - "id": "20150410-134224-16777343-5050-1778-S0", - "hostname": "localhost", - "attributes": {}, - "active": 'true' - } - ], - "pid": "master@127.0.0.1:5050", - "orphan_tasks": [], - "lost_tasks": 0, - "log_dir": "/var/log/mesos", - "leader": "master@127.0.0.1:5050", - "killed_tasks": 0, - "elected_time": 1428951954.3774, - "deactivated_slaves": 0, - "completed_frameworks": [], - "cluster": "datadog-test", - "build_user": "root", - "build_time": 1427376927, - "build_date": "2015-03-26 13:35:27", - "activated_slaves": 1, - "failed_tasks": 0, - "finished_tasks": 0, - "flags": { - "zk_session_timeout": "10secs", - "zk": "zk://localhost:2181/mesos", - "work_dir": "/var/lib/mesos", - "webui_dir": "/usr/share/mesos/webui", - "version": "false", - "user_sorter": "drf", - "slave_reregister_timeout": "10mins", - 
"root_submissions": "true", - "registry_strict": "false", - "registry_store_timeout": "5secs", - "registry_fetch_timeout": "1mins", - "registry": "replicated_log", - "initialize_driver_logging": "true", - "help": "false", - "framework_sorter": "drf", - "cluster": "datadog-test", - "authenticators": "crammd5", - "authenticate_slaves": "false", - "authenticate": "false", - "allocation_interval": "1secs", - "log_auto_initialize": "true", - "log_dir": "/var/log/mesos", - "logbufsecs": "0", - "logging_level": "INFO", - "port": "5050", - "quiet": "false", - "quorum": "1", - "recovery_slave_removal_limit": "100%" - }, - "frameworks": [ - { - "webui_url": "http://192.168.33.20:8080", - "user": "root", - "offered_resources": { - "mem": 0, - "disk": 0, - "cpus": 0 - }, - "name": "marathon", - "id": "20150403-140128-251789322-5050-6047-0000", - "hostname": "vagrant-ubuntu-trusty-64", - "failover_timeout": 604800, - "completed_tasks": [], - "checkpoint": 'true', - "active": 'true', - "offers": [], - "registered_time": 1428951955.38871, - "reregistered_time": 1428951955.38872, - "resources": { - "ports": "[31915-31915]", - "mem": 100, - "disk": 0, - "cpus": 1 - }, - "role": "*", - "tasks": [ - { - "statuses": [ - { - "timestamp": 1428673971.61592, - "state": "TASK_RUNNING" - } - ], - "executor_id": "", - "framework_id": "20150403-140128-251789322-5050-6047-0000", - "id": "hello.dc130e23-df88-11e4-b9ec-080027fc1312", - "labels": [], - "name": "hello", - "resources": { - "ports": "[31915-31915]", - "mem": 100, - "disk": 0, - "cpus": 1 - }, - "slave_id": "20150410-134224-16777343-5050-1778-S0", - "state": "TASK_RUNNING" - } - ], - "unregistered_time": 0, - "used_resources": { - "ports": "[31915-31915]", - "mem": 100, - "disk": 0, - "cpus": 1 - } - } - ], - "git_sha": "e890e2414903bb69cab730d5204f10b10d2e91bb", - "git_tag": "0.22.0", - "hostname": "localhost", - "id": "20150413-190554-16777343-5050-16324" -} - -stats = { - "valid_status_updates": 0, - "uptime": 706.524240128, - 
"total_schedulers": 1, - "system/mem_total_bytes": 513798144, - "system/mem_free_bytes": 13815808, - "system/load_5min": 0.02, - "system/load_1min": 0, - "system/load_15min": 0.07, - "system/cpus_total": 1, - "started_tasks": 0, - "staged_tasks": 0, - "registrar/state_store_ms/p9999": 9.90120192, - "registrar/state_store_ms/p999": 9.8956032, - "registrar/state_store_ms/p99": 9.839616, - "registrar/state_store_ms/p95": 9.590784, - "registrar/state_store_ms/p90": 9.279744, - "registrar/state_store_ms/p50": 6.791424, - "registrar/state_store_ms/min": 3.681024, - "registrar/state_store_ms/max": 9.901824, - "registrar/state_store_ms/count": 2, - "registrar/state_store_ms": 9.901824, - "registrar/state_fetch_ms": 3.717888, - "registrar/registry_size_bytes": 246, - "registrar/queued_operations": 0, - "outstanding_offers": 0, - "mem_used": 100, - "mem_total": 244, - "mem_percent": 0.409836065573771, - "master/valid_status_updates": 0, - "master/valid_status_update_acknowledgements": 0, - "master/valid_framework_to_executor_messages": 0, - "master/uptime_secs": 706.52485632, - "master/tasks_starting": 0, - "master/tasks_staging": 0, - "master/tasks_running": 1, - "master/tasks_lost": 0, - "master/tasks_killed": 0, - "master/tasks_finished": 0, - "master/tasks_failed": 0, - "master/tasks_error": 0, - "master/slaves_inactive": 0, - "master/slaves_disconnected": 0, - "master/invalid_framework_to_executor_messages": 0, - "master/frameworks_inactive": 0, - "master/frameworks_disconnected": 0, - "master/frameworks_connected": 1, - "master/frameworks_active": 1, - "master/event_queue_messages": 0, - "master/event_queue_http_requests": 0, - "master/event_queue_dispatches": 17, - "master/elected": 1, - "master/dropped_messages": 1, - "master/disk_used": 0, - "master/disk_total": 35164, - "master/disk_percent": 0, - "master/cpus_used": 1, - "master/cpus_total": 1, - "master/cpus_percent": 1, - "disk_percent": 0, - "deactivated_slaves": 0, - "cpus_used": 1, - "cpus_total": 1, - 
"cpus_percent": 1, - "active_tasks_gauge": 1, - "active_schedulers": 1, - "activated_slaves": 1, - "disk_total": 35164, - "disk_used": 0, - "elected": 1, - "failed_tasks": 0, - "finished_tasks": 0, - "invalid_status_updates": 0, - "killed_tasks": 0, - "lost_tasks": 0, - "master/invalid_status_update_acknowledgements": 0, - "master/invalid_status_updates": 0, - "master/mem_percent": 0.409836065573771, - "master/mem_total": 244, - "master/mem_used": 100, - "master/messages_authenticate": 0, - "master/messages_deactivate_framework": 0, - "master/messages_decline_offers": 123, - "master/messages_exited_executor": 0, - "master/messages_framework_to_executor": 0, - "master/messages_kill_task": 0, - "master/messages_launch_tasks": 0, - "master/messages_reconcile_tasks": 6, - "master/messages_register_framework": 0, - "master/messages_register_slave": 0, - "master/messages_reregister_framework": 1, - "master/messages_reregister_slave": 2, - "master/messages_resource_request": 0, - "master/messages_revive_offers": 0, - "master/messages_status_update": 0, - "master/messages_status_update_acknowledgement": 0, - "master/messages_unregister_framework": 0, - "master/messages_unregister_slave": 0, - "master/outstanding_offers": 0, - "master/recovery_slave_removals": 0, - "master/slave_registrations": 0, - "master/slave_removals": 0, - "master/slave_reregistrations": 1, - "master/slave_shutdowns_canceled": 0, - "master/slave_shutdowns_scheduled": 0, - "master/slaves_active": 1, - "master/slaves_connected": 1 -} - -roles = { - "roles": [ - { - "weight": 1, - "resources": { - "ports": "[31915-31915]", - "mem": 100, - "disk": 0, - "cpus": 1 - }, - "name": "*", - "frameworks": [ - "20150403-140128-251789322-5050-6047-0000" - ] - } - ] -} - -def _mocked_get_master_state(*args, **kwargs): - return state -def _mocked_get_master_stats(*args, **kwargs): - return stats -def _mocked_get_master_roles(*args, **kwargs): - return roles - - -@attr(requires='mesos_master') -class 
TestMesosMaster(AgentCheckTest): - CHECK_NAME = 'mesos_master' - - def test_checks(self): - config = { - 'init_config': {}, - 'instances': [ - { - 'url': 'http://localhost:5050' - } - ] - } - - klass = get_check_class('mesos_master') - with patch.object(klass, '_get_master_state', _mocked_get_master_state): - with patch.object(klass, '_get_master_stats', _mocked_get_master_stats): - with patch.object(klass, '_get_master_roles', _mocked_get_master_roles): - check = klass('mesos_master', {}, {}) - self.run_check(config) - time.sleep(1) - self.run_check(config) - metrics = {} - for d in (check.CLUSTER_TASKS_METRICS, check.CLUSTER_SLAVES_METRICS, - check.CLUSTER_RESOURCES_METRICS, check.CLUSTER_REGISTRAR_METRICS, - check.CLUSTER_FRAMEWORK_METRICS, check.SYSTEM_METRICS, check.STATS_METRICS): - metrics.update(d) - [self.assertMetric(v[0]) for k, v in check.FRAMEWORK_METRICS.iteritems()] - [self.assertMetric(v[0]) for k, v in metrics.iteritems()] - [self.assertMetric(v[0]) for k, v in check.ROLE_RESOURCES_METRICS.iteritems()] - self.assertMetric('mesos.cluster.total_frameworks') - self.assertMetric('mesos.framework.total_tasks') - self.assertMetric('mesos.role.frameworks.count') - self.assertMetric('mesos.role.weight') diff --git a/tests/checks/mock/test_mesos_master.py b/tests/checks/mock/test_mesos_master.py new file mode 100644 index 0000000000..aaaeb8a4aa --- /dev/null +++ b/tests/checks/mock/test_mesos_master.py @@ -0,0 +1,51 @@ +from tests.checks.common import AgentCheckTest, get_check_class, Fixtures + +from nose.plugins.attrib import attr +from mock import patch +from checks import AgentCheck +import json +import time + + +def _mocked_get_master_state(*args, **kwargs): + state = json.loads(Fixtures.read_file('state.json')) + return state +def _mocked_get_master_stats(*args, **kwargs): + stats = json.loads(Fixtures.read_file('stats.json')) + return stats +def _mocked_get_master_roles(*args, **kwargs): + roles = json.loads(Fixtures.read_file('roles.json')) + return 
roles + +@attr(requires='mesos_master') +class TestMesosMaster(AgentCheckTest): + CHECK_NAME = 'mesos_master' + + def test_checks(self): + config = { + 'init_config': {}, + 'instances': [ + { + 'url': 'http://localhost:5050' + } + ] + } + + klass = get_check_class('mesos_master') + with patch.object(klass, '_get_master_state', _mocked_get_master_state): + with patch.object(klass, '_get_master_stats', _mocked_get_master_stats): + with patch.object(klass, '_get_master_roles', _mocked_get_master_roles): + check = klass('mesos_master', {}, {}) + self.run_check_twice(config) + metrics = {} + for d in (check.CLUSTER_TASKS_METRICS, check.CLUSTER_SLAVES_METRICS, + check.CLUSTER_RESOURCES_METRICS, check.CLUSTER_REGISTRAR_METRICS, + check.CLUSTER_FRAMEWORK_METRICS, check.SYSTEM_METRICS, check.STATS_METRICS): + metrics.update(d) + [self.assertMetric(v[0]) for k, v in check.FRAMEWORK_METRICS.iteritems()] + [self.assertMetric(v[0]) for k, v in metrics.iteritems()] + [self.assertMetric(v[0]) for k, v in check.ROLE_RESOURCES_METRICS.iteritems()] + self.assertMetric('mesos.cluster.total_frameworks') + self.assertMetric('mesos.framework.total_tasks') + self.assertMetric('mesos.role.frameworks.count') + self.assertMetric('mesos.role.weight') diff --git a/tests/checks/mock/test_mesos_slave.py b/tests/checks/mock/test_mesos_slave.py new file mode 100644 index 0000000000..2fbfe19683 --- /dev/null +++ b/tests/checks/mock/test_mesos_slave.py @@ -0,0 +1,43 @@ +from tests.checks.common import AgentCheckTest, get_check_class, Fixtures + +from nose.plugins.attrib import attr +from mock import patch +from checks import AgentCheck +import json +import time + + +def _mocked_get_state(*args, **kwargs): + state = json.loads(Fixtures.read_file('state.json')) + return state +def _mocked_get_stats(*args, **kwargs): + stats = json.loads(Fixtures.read_file('stats.json')) + return stats + +@attr(requires='mesos_slave') +class TestMesosSlave(AgentCheckTest): + CHECK_NAME = 'mesos_slave' + + def 
test_checks(self): + config = { + 'init_config': {}, + 'instances': [ + { + 'url': 'http://localhost:5050', + 'tasks': ['hello'] + } + ] + } + + klass = get_check_class('mesos_slave') + with patch.object(klass, '_get_state', _mocked_get_state): + with patch.object(klass, '_get_stats', _mocked_get_stats): + check = klass('mesos_slave', {}, {}) + self.run_check_twice(config) + metrics = {} + for d in (check.SLAVE_TASKS_METRICS, check.SYSTEM_METRICS, check.SLAVE_RESOURCE_METRICS, + check.SLAVE_EXECUTORS_METRICS, check.STATS_METRICS): + metrics.update(d) + [self.assertMetric(v[0]) for k, v in check.TASK_METRICS.iteritems()] + [self.assertMetric(v[0]) for k, v in metrics.iteritems()] + self.assertServiceCheck('hello.ok', count=1, status=AgentCheck.OK)