Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Limit Prometheus/OpenMetrics checks to 2000 metrics per run by default #2093

Merged
merged 14 commits into from
Aug 31, 2018
53 changes: 52 additions & 1 deletion datadog_checks_base/datadog_checks/checks/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,15 @@
from ..config import is_affirmative
from ..utils.common import ensure_bytes
from ..utils.proxy import config_proxy_skip
from ..utils.limiter import Limiter


# Metric types for which it's only useful to submit once per context
ONE_PER_CONTEXT_METRIC_TYPES = [
aggregator.GAUGE,
aggregator.RATE,
aggregator.MONOTONIC_COUNT,
]


class AgentCheck(object):
Expand All @@ -34,6 +43,18 @@ class AgentCheck(object):
"""
OK, WARNING, CRITICAL, UNKNOWN = (0, 1, 2, 3)

"""
DEFAULT_METRIC_LIMIT allows to set a limit on metric contexts this check can send
per run. This is useful for check that have an unbounded number of contexts,
depending on the input payload.
The logic counts one context per gauge/rate/monotonic_count call, and deduplicates
contexts for other metric types. The first N contexts in submission order will
be sent to the aggregator, the rest are dropped. The state is reset after each run.
See https://github.com/DataDog/integrations-core/pull/2093 for more information
"""
DEFAULT_METRIC_LIMIT = 0

def __init__(self, *args, **kwargs):
"""
args: `name`, `init_config`, `agentConfig` (deprecated), `instances`
Expand All @@ -45,6 +66,7 @@ def __init__(self, *args, **kwargs):
self.init_config = kwargs.get('init_config', {})
self.agentConfig = kwargs.get('agentConfig', {})
self.warnings = []
self.metric_limiter = None

if len(args) > 0:
self.name = args[0]
Expand Down Expand Up @@ -98,6 +120,19 @@ def __init__(self, *args, **kwargs):
],
}

# Setup metric limits
try:
metric_limit = self.instances[0].get("max_returned_metrics", self.DEFAULT_METRIC_LIMIT)
# Do not allow to disable limiting if the class has set a non-zero default value
if metric_limit == 0 and self.DEFAULT_METRIC_LIMIT > 0:
metric_limit = self.DEFAULT_METRIC_LIMIT
self.warning("Setting max_returned_metrics to zero is not allowed," +
"reverting to the default of {} metrics".format(self.DEFAULT_METRIC_LIMIT))
except Exception:
metric_limit = self.DEFAULT_METRIC_LIMIT
if metric_limit > 0:
self.metric_limiter = Limiter("metrics", metric_limit, self.warning)

@property
def in_developer_mode(self):
self._log_deprecation('in_developer_mode')
Expand All @@ -117,6 +152,9 @@ def get_instance_proxy(self, instance, uri, proxies=None):

return config_proxy_skip(proxies, uri, skip)

def _context_uid(mtype, name, value, tags=None, hostname=None):
return '{}-{}-{}-{}'.format(mtype, name, tags if tags is None else hash(frozenset(tags)), hostname)

def _submit_metric(self, mtype, name, value, tags=None, hostname=None, device_name=None):
if value is None:
# ignore metric sample
Expand All @@ -126,6 +164,17 @@ def _submit_metric(self, mtype, name, value, tags=None, hostname=None, device_na
if hostname is None:
hostname = b''

if self.metric_limiter:
if mtype in ONE_PER_CONTEXT_METRIC_TYPES:
# Fast path for gauges, rates, monotonic counters, assume one context per call
if self.metric_limiter.is_reached():
return
else:
# Other metric types have a legit use case for several calls per context, track unique contexts
context = self._context_uid(mtype, name, tags, hostname)
if self.metric_limiter.is_reached(context):
return

aggregator.submit_metric(self, self.check_id, mtype, ensure_bytes(name), float(value), tags, hostname)

def gauge(self, name, value, tags=None, hostname=None, device_name=None):
Expand Down Expand Up @@ -302,14 +351,16 @@ def run(self):
try:
self.check(copy.deepcopy(self.instances[0]))
result = b''

except Exception as e:
result = json.dumps([
{
"message": str(e),
"traceback": traceback.format_exc(),
}
])
finally:
if self.metric_limiter:
self.metric_limiter.reset()

return result

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ class OpenMetricsBaseCheck(OpenMetricsScraperMixin, AgentCheck):
- bar
- foo
"""
DEFAULT_METRIC_LIMIT = 2000

def __init__(self, name, init_config, agentConfig, instances=None, default_instances=None, default_namespace=None):
super(OpenMetricsBaseCheck, self).__init__(name, init_config, agentConfig, instances=instances)
self.config_map = {}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,8 @@ class GenericPrometheusCheck(AgentCheck):
- bar
- foo
"""
DEFAULT_METRIC_LIMIT = 2000

def __init__(self, name, init_config, agentConfig, instances=None, default_instances=None, default_namespace=""):
super(GenericPrometheusCheck, self).__init__(name, init_config, agentConfig, instances)
self.scrapers_map = {}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@
#

class PrometheusCheck(PrometheusScraperMixin, AgentCheck):
DEFAULT_METRIC_LIMIT = 2000

def __init__(self, name, init_config, agentConfig, instances=None):
super(PrometheusCheck, self).__init__(name, init_config, agentConfig, instances)

Expand Down
67 changes: 67 additions & 0 deletions datadog_checks_base/datadog_checks/utils/limiter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
# (C) Datadog, Inc. 2018
# All rights reserved
# Licensed under Simplified BSD License (see LICENSE)


class Limiter(object):
    """
    Simple cut-off capping logic for object counts.

    AgentCheck uses it to bound the number of metric contexts a single
    check instance may submit during one run: the first ``object_limit``
    objects are accepted, everything after that is rejected until
    ``reset`` is called.
    """
    def __init__(self, object_name, object_limit, warning_func=None):
        """
        :param object_name: (plural) name of counted objects for warning wording
        :param object_limit: maximum number of objects to accept before limiting
        :param warning_func: callback function, called with a string when limit is exceeded
        """
        self.warning = warning_func
        self.name = object_name
        self.limit = object_limit

        self.reached_limit = False
        self.count = 0
        self.seen = set()

    def reset(self):
        """
        Drop all state: counter, limit flag and uid set. Call it as soon
        as possible after each run, to release the memory held by the set.
        """
        self.count = 0
        self.reached_limit = False
        self.seen.clear()

    def is_reached(self, uid=None):
        """
        Record one object and tell whether the limit is now exceeded.
        Call it for every object that counts towards the limit:
        - without ``uid``, every call is assumed to be a new object and the
          counter is incremented unconditionally (cheapest path);
        - with ``uid``, repeated calls for the same identifier are
          deduplicated: only the first occurrence increments the counter.
        :param uid: (optional) unique identifier of the object, to deduplicate calls
        :returns: boolean, true if limit exceeded
        """
        if self.reached_limit:
            # Limit was hit earlier in this run: reject immediately
            return True

        if not uid:
            self.count += 1
        elif uid in self.seen:
            # Known context, already counted: always accepted
            return False
        else:
            self.seen.add(uid)
            self.count += 1

        if self.count <= self.limit:
            return False

        # First object past the limit: warn once, then latch the flag
        if self.warning:
            self.warning("Exceeded limit of {} {}, ignoring next ones".format(self.limit, self.name))
        self.reached_limit = True
        return True

    def get_status(self):
        """
        Expose the internal state (count, limit, reached flag) for unit tests.
        """
        return (self.count, self.limit, self.reached_limit)
78 changes: 78 additions & 0 deletions datadog_checks_base/tests/test_agent_check.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,18 @@
# (C) Datadog, Inc. 2018
# All rights reserved
# Licensed under a 3-clause BSD style license (see LICENSE)
import pytest

from datadog_checks.checks import AgentCheck


@pytest.fixture
def aggregator():
    """Return the aggregator stub, reset so each test starts with no metrics."""
    from datadog_checks.stubs import aggregator
    aggregator.reset()
    return aggregator


def test_instance():
"""
Simply assert the class can be insantiated
Expand Down Expand Up @@ -45,3 +54,72 @@ def test_unicode_string(self):

assert normalized_tags is not tags
assert normalized_tag == tag.encode('utf-8')


class LimitedCheck(AgentCheck):
    # Check subclass with a non-zero class-level metric limit, used to
    # exercise the per-run context limiting logic in the tests below.
    DEFAULT_METRIC_LIMIT = 10


class TestLimits():
    """Tests for the per-run metric context limiting in AgentCheck."""

    def test_metric_limit_gauges(self, aggregator):
        """Gauges count one context per call; calls past the limit are dropped with one warning."""
        check = LimitedCheck()
        assert check.get_warnings() == []

        for i in range(0, 10):
            check.gauge("metric", 0)
        assert len(check.get_warnings()) == 0
        assert len(aggregator.metrics("metric")) == 10

        # Ten more gauges: all rejected, a single warning is emitted
        for i in range(0, 10):
            check.gauge("metric", 0)
        assert len(check.get_warnings()) == 1
        assert len(aggregator.metrics("metric")) == 10

    def test_metric_limit_count(self, aggregator):
        """Counts are deduplicated by context: repeats of one context don't consume the limit."""
        check = LimitedCheck()
        assert check.get_warnings() == []

        # Multiple calls for a single context should not trigger
        for i in range(0, 20):
            check.count("metric", 0, hostname="host-single")
        assert len(check.get_warnings()) == 0
        assert len(aggregator.metrics("metric")) == 20

        # Multiple contexts should trigger
        # Only 9 new contexts should pass through (1 of the 10 slots is
        # already taken by "host-single"), so 20 + 9 = 29 samples total
        for i in range(0, 20):
            check.count("metric", 0, hostname="host-{}".format(i))
        assert len(check.get_warnings()) == 1
        assert len(aggregator.metrics("metric")) == 29

    def test_metric_limit_instance_config(self, aggregator):
        """max_returned_metrics in the instance config overrides the class default."""
        instances = [
            {
                "max_returned_metrics": 42,
            }
        ]
        check = AgentCheck("test", {}, instances)
        assert check.get_warnings() == []

        for i in range(0, 42):
            check.gauge("metric", 0)
        assert len(check.get_warnings()) == 0
        assert len(aggregator.metrics("metric")) == 42

        # The 43rd gauge is dropped and warns once
        check.gauge("metric", 0)
        assert len(check.get_warnings()) == 1
        assert len(aggregator.metrics("metric")) == 42

    def test_metric_limit_instance_config_zero(self, aggregator):
        """Zero cannot disable limiting when the class sets a non-zero default: it reverts to 10."""
        instances = [
            {
                "max_returned_metrics": 0,
            }
        ]
        check = LimitedCheck("test", {}, instances)
        assert len(check.get_warnings()) == 1

        for i in range(0, 42):
            check.gauge("metric", 0)
        assert len(check.get_warnings()) == 1  # get_warnings resets the array
        assert len(aggregator.metrics("metric")) == 10
55 changes: 55 additions & 0 deletions datadog_checks_base/tests/test_utils.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
# (C) Datadog, Inc. 2018
# All rights reserved
# Licensed under a 3-clause BSD style license (see LICENSE)

from datadog_checks.utils.common import pattern_filter
from datadog_checks.utils.limiter import Limiter


class Item:
Expand Down Expand Up @@ -50,3 +52,56 @@ def test_key_function(self):
assert pattern_filter(items, whitelist=whitelist, key=lambda item: item.name) == [
Item('abc'), Item('def'), Item('abcdef')
]


class TestLimiter():
    """Unit tests for the Limiter cut-off capping logic."""

    def test_no_uid(self):
        """Without a uid, every call increments the counter; the warning fires once at the limit."""
        warnings = []
        limiter = Limiter("names", 10, warning_func=warnings.append)
        for i in range(0, 10):
            assert limiter.is_reached() is False
        assert limiter.get_status() == (10, 10, False)

        # Reach limit
        assert limiter.is_reached() is True
        assert limiter.get_status() == (11, 10, True)
        assert warnings == ["Exceeded limit of 10 names, ignoring next ones"]

        # Make sure warning is only sent once
        assert limiter.is_reached() is True
        assert len(warnings) == 1

    def test_with_uid(self):
        """Repeated calls with the same uid count only once towards the limit."""
        warnings = []
        limiter = Limiter("names", 10, warning_func=warnings.append)
        for i in range(0, 20):
            assert limiter.is_reached("dummy1") is False
        assert limiter.get_status() == (1, 10, False)

        for i in range(0, 20):
            assert limiter.is_reached("dummy2") is False
        assert limiter.get_status() == (2, 10, False)
        assert len(warnings) == 0

    def test_mixed(self):
        """Deduplicated (uid) and unconditional (no uid) calls share one counter."""
        limiter = Limiter("names", 10)

        for i in range(0, 20):
            assert limiter.is_reached("dummy1") is False
        assert limiter.get_status() == (1, 10, False)

        for i in range(0, 5):
            assert limiter.is_reached() is False
        assert limiter.get_status() == (6, 10, False)

    def test_reset(self):
        """reset() clears both the counter and the seen-uid set."""
        limiter = Limiter("names", 10)

        for i in range(1, 20):
            limiter.is_reached("dummy1")
        assert limiter.get_status() == (1, 10, False)

        limiter.reset()
        assert limiter.get_status() == (0, 10, False)
        # After reset, a previously-seen uid counts as new again
        assert limiter.is_reached("dummy1") is False
        assert limiter.get_status() == (1, 10, False)
1 change: 1 addition & 0 deletions gitlab/datadog_checks/gitlab/gitlab.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ class GitlabCheck(PrometheusCheck):
EVENT_TYPE = SOURCE_TYPE_NAME = 'gitlab'
DEFAULT_CONNECT_TIMEOUT = 5
DEFAULT_RECEIVE_TIMEOUT = 15
DEFAULT_METRIC_LIMIT = 0

PROMETHEUS_SERVICE_CHECK_NAME = 'gitlab.prometheus_endpoint_up'

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ class GitlabRunnerCheck(PrometheusCheck):

DEFAULT_CONNECT_TIMEOUT = 5
DEFAULT_RECEIVE_TIMEOUT = 15
DEFAULT_METRIC_LIMIT = 0

def __init__(self, name, init_config, agentConfig, instances=None):
super(GitlabRunnerCheck, self).__init__(name, init_config, agentConfig, instances)
Expand Down
1 change: 1 addition & 0 deletions istio/datadog_checks/istio/istio.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
class Istio(PrometheusCheck):
MIXER_NAMESPACE = 'istio.mixer'
MESH_NAMESPACE = 'istio.mesh'
DEFAULT_METRIC_LIMIT = 0

def __init__(self, name, init_config, agentConfig, instances=None):
super(Istio, self).__init__(name, init_config, agentConfig, instances)
Expand Down
2 changes: 2 additions & 0 deletions kube_dns/datadog_checks/kube_dns/kube_dns.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@ class KubeDNSCheck(PrometheusCheck):
"""
Collect kube-dns metrics from Prometheus
"""
DEFAULT_METRIC_LIMIT = 0

def __init__(self, name, init_config, agentConfig, instances=None):
super(KubeDNSCheck, self).__init__(name, init_config, agentConfig, instances)
self.NAMESPACE = 'kubedns'
Expand Down
Loading