Skip to content

Commit

Permalink
envoy: add connection limits metrics
Browse files Browse the repository at this point in the history
see also
https://www.envoyproxy.io/docs/envoy/latest/configuration/listeners/network_filters/connection_limit_filter

- Add envoy connection limits metrics
- unit test
- changelog
- Update metadata.csv
- adapt stat_prefix behaviour to Envoy 1.29, which moves the stat_prefix out of the metric name and into a label
  see also envoyproxy/envoy@ea71e73

Signed-off-by: William Dauchy <[email protected]>

fix metric map
  • Loading branch information
wdauchy committed Feb 9, 2024
1 parent 07945a3 commit 87b76c4
Show file tree
Hide file tree
Showing 14 changed files with 151 additions and 7 deletions.
1 change: 1 addition & 0 deletions envoy/changelog.d/16718.added
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Add connection limit metrics for envoy
11 changes: 11 additions & 0 deletions envoy/datadog_checks/envoy/check.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
'cluster_name': 'envoy_cluster',
'envoy_cluster_name': 'envoy_cluster',
'envoy_local_http_ratelimit_prefix': 'stat_prefix', # local rate limit
'envoy_connection_limit_prefix': 'stat_prefix', # connection limit
'envoy_http_conn_manager_prefix': 'stat_prefix', # tracing
'envoy_listener_address': 'address', # listener
'envoy_virtual_cluster': 'virtual_envoy_cluster', # vhost
Expand Down Expand Up @@ -90,6 +91,16 @@
'metric_type': 'monotonic_count',
'new_name': 'listener.downstream_cx.count',
},
r'envoy_connection_limit_(.+)_active_connections$': {
'label_name': 'stat_prefix',
'metric_type': 'monotonic_count',
'new_name': 'connection_limit.active_connections.count',
},
r'envoy_connection_limit_(.+)_limited_connections$': {
'label_name': 'stat_prefix',
'metric_type': 'monotonic_count',
'new_name': 'connection_limit.limited_connections.count',
},
r'envoy_(.+)_http_local_rate_limit_enabled$': {
'label_name': 'stat_prefix',
'metric_type': 'monotonic_count',
Expand Down
16 changes: 16 additions & 0 deletions envoy/datadog_checks/envoy/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -383,6 +383,8 @@
'envoy_cluster_client_ssl_socket_factory_ssl_context_update_by_sds': 'cluster.client_ssl_socket_factory.ssl_context_update_by_sds', # noqa: E501
'envoy_cluster_client_ssl_socket_factory_upstream_context_secrets_not_ready': 'cluster.client_ssl_socket_factory.upstream_context_secrets_not_ready', # noqa: E501
'envoy_cluster_client_ssl_socket_factory_downstream_context_secrets_not_ready': 'cluster.client_ssl_socket_factory.downstream_context_secrets_not_ready', # noqa: E501
'envoy_connection_limit_active_connections': 'connection_limit.active_connections',
'envoy_connection_limit_limited_connections': 'connection_limit.limited_connections',
}

# fmt: off
Expand Down Expand Up @@ -3933,6 +3935,20 @@
),
'method': 'monotonic_count',
},
'connection_limit.active_connections': {
'tags': (
('stat_prefix',),
(),
),
'method': 'monotonic_count',
},
'connection_limit.limited_connections': {
'tags': (
('stat_prefix',),
(),
),
'method': 'monotonic_count',
},
}
# fmt: on

Expand Down
4 changes: 4 additions & 0 deletions envoy/metadata.csv
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,10 @@ envoy.cluster_manager.cluster_updated.count,count,,,,[OpenMetrics V2] Total clus
envoy.cluster_manager.custer_updated_via_merge.count,count,,,,[OpenMetrics V2],0,envoy,,
envoy.cluster_manager.update_merge_cancelled.count,count,,,,[OpenMetrics V2] Total merged updates that got cancelled and delivered early,0,envoy,,
envoy.cluster_manager.update_out_of_merge_window.count,count,,,,[OpenMetrics V2] Total updates which arrived out of a merge window,0,envoy,,
envoy.connection_limit.active_connections.count,count,,,,[OpenMetrics V2] Number of currently active connections in the scope of this network filter chain,0,envoy,,
envoy.connection_limit.limited_connections.count,count,,,,[OpenMetrics V2] Total connections that have been rejected due to connection limit exceeded,0,envoy,,
envoy.connection_limit.active_connections,count,,,,[Legacy] Number of currently active connections in the scope of this network filter chain,0,envoy,,
envoy.connection_limit.limited_connections,count,,,,[Legacy] Total connections that have been rejected due to connection limit exceeded,0,envoy,,
envoy.filesystem.flushed_by_timer.count,count,,,,[OpenMetrics V2],0,envoy,,
envoy.filesystem.reopen_failed.count,count,,,,[OpenMetrics V2],0,envoy,,
envoy.filesystem.write_buffered.count,count,,,,[OpenMetrics V2],0,envoy,,
Expand Down
13 changes: 11 additions & 2 deletions envoy/tests/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -376,13 +376,22 @@
"vhost.vcluster.upstream_rq.count",
]

# Connection limit filter metrics expected from the OpenMetrics V2 check.
# Names are given without the `envoy.` namespace; tests prepend it when asserting.
CONNECTION_LIMIT_METRICS = [
"connection_limit.active_connections.count",
"connection_limit.limited_connections.count",
]

# Tag expected on every connection limit metric; `ingress_http` is the
# stat_prefix used for the connection limit filter in the test fixtures.
CONNECTION_LIMIT_STAT_PREFIX_TAG = 'stat_prefix:ingress_http'

LOCAL_RATE_LIMIT_METRICS = [
"http.local_rate_limit_enabled.count",
"http.local_rate_limit_enforced.count",
"http.local_rate_limit_rate_limited.count",
"http.local_rate_limit_ok.count",
]

RATE_LIMIT_STAT_PREFIX_TAG = 'stat_prefix:http_local_rate_limiter'

CLUSTER_AND_LISTENER_SSL_METRICS = [
"cluster.client_ssl_socket_factory.downstream_context_secrets_not_ready.count",
"cluster.client_ssl_socket_factory.ssl_context_update_by_sds.count",
Expand All @@ -394,8 +403,6 @@

CONNECT_STATE_METRIC = ['control_plane.connected_state']

RATE_LIMIT_STAT_PREFIX_TAG = 'stat_prefix:http_local_rate_limiter'

FLAKY_METRICS = [
"listener.downstream_cx_active",
"listener.downstream_cx_destroy.count",
Expand Down Expand Up @@ -731,6 +738,8 @@
"tcp.on_demand_cluster_timeout.count",
"tcp.upstream_flush.count",
"tcp.upstream_flush_active",
"connection_limit.active_connections.count",
"connection_limit.limited_connections.count",
]


Expand Down
6 changes: 6 additions & 0 deletions envoy/tests/docker/api_v3/front-envoy.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,12 @@ static_resources:
socket_address: {address: 0.0.0.0, port_value: 80}
filter_chains:
- filters:
- name: envoy.filters.network.connection_limit
typed_config:
"@type": type.googleapis.com/envoy.extensions.filters.network.connection_limit.v3.ConnectionLimit
stat_prefix: ingress_http
max_connections: 1000
delay: 0s
- name: envoy.http_connection_manager
typed_config:
"@type": type.googleapis.com/envoy.extensions.filters.network.http_connection_manager.v3.HttpConnectionManager
Expand Down
2 changes: 2 additions & 0 deletions envoy/tests/fixtures/legacy/connection_limit.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
connection_limit.ingress_http.active_connections: 0
connection_limit.ingress_http.limited_connections: 0
2 changes: 2 additions & 0 deletions envoy/tests/fixtures/legacy/stat_prefix
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,5 @@ cluster.foo.ext_authz.bar.disabled: 6
cluster.foo.ext_authz.bar.error: 7
cluster.foo.ext_authz.bar.failure_mode_allowed: 8
cluster.foo.ext_authz.bar.ok: 9
connection_limit.ingress_http.active_connections: 0
connection_limit.ingress_http.limited_connections: 0
4 changes: 4 additions & 0 deletions envoy/tests/fixtures/openmetrics/openmetrics.txt
Original file line number Diff line number Diff line change
Expand Up @@ -521,6 +521,10 @@ envoy_filesystem_write_buffered{} 7
envoy_http_downstream_cx_upgrades_total{envoy_http_conn_manager_prefix="admin"} 0
# TYPE envoy_cluster_manager_cluster_removed counter
envoy_cluster_manager_cluster_removed{} 0
# TYPE envoy_connection_limit_ingress_http_active_connections counter
envoy_connection_limit_ingress_http_active_connections{} 0
# TYPE envoy_connection_limit_ingress_http_limited_connections counter
envoy_connection_limit_ingress_http_limited_connections{} 0
# TYPE envoy_server_debug_assertion_failures counter
envoy_server_debug_assertion_failures{} 0
# TYPE envoy_server_worker_3_watchdog_miss counter
Expand Down
4 changes: 4 additions & 0 deletions envoy/tests/fixtures/openmetrics/openmetrics_1_29.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# TYPE envoy_connection_limit_active_connections counter
envoy_connection_limit_active_connections{envoy_connection_limit_prefix="ingress_http"} 0
# TYPE envoy_connection_limit_limited_connections counter
envoy_connection_limit_limited_connections{envoy_connection_limit_prefix="ingress_http"} 0
9 changes: 8 additions & 1 deletion envoy/tests/legacy/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,14 @@
"envoy.http_local_rate_limit.ok",
]

STAT_PREFIX_TAG = ['stat_prefix:http_local_rate_limiter', 'stat_prefix:foo_buz_112']
RATE_LIMIT_STAT_PREFIX_TAG = ['stat_prefix:http_local_rate_limiter', 'stat_prefix:foo_buz_112']

# Fully qualified names of the connection limit filter metrics asserted by the
# legacy (non-OpenMetrics) tests.
CONNECTION_LIMIT_METRICS = [
"envoy.connection_limit.active_connections",
"envoy.connection_limit.limited_connections",
]

# Tags expected on each connection limit metric; `ingress_http` is the prefix
# that appears in the legacy connection_limit fixture stat names.
CONNECTION_LIMIT_STAT_PREFIX_TAG = ['stat_prefix:ingress_http']

RBAC_METRICS = [
"envoy.http.rbac.allowed",
Expand Down
21 changes: 18 additions & 3 deletions envoy/tests/legacy/test_unit.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,14 +12,16 @@
from datadog_checks.envoy.metrics import METRIC_PREFIX, METRICS

from .common import (
CONNECTION_LIMIT_METRICS,
CONNECTION_LIMIT_STAT_PREFIX_TAG,
ENVOY_VERSION,
EXT_METRICS,
FLAVOR,
HOST,
INSTANCES,
LOCAL_RATE_LIMIT_METRICS,
RATE_LIMIT_STAT_PREFIX_TAG,
RBAC_METRICS,
STAT_PREFIX_TAG,
)

CHECK_NAME = 'envoy'
Expand Down Expand Up @@ -289,7 +291,7 @@ def test_stats_prefix_optional_tags(
standard_tags.append('endpoint:{}'.format(instance["stats_url"]))
tags_prefix = standard_tags + additional_tags
c = check(instance)
mock_http_response(file_path=fixture_path(fixture_file)).return_value
mock_http_response(file_path=fixture_path(fixture_file))
dd_run_check(c)

# To ensure that this change didn't break the old behavior, both the value and the tags are asserted.
Expand All @@ -313,7 +315,20 @@ def test_local_rate_limit_metrics(aggregator, fixture_path, mock_http_response,

for metric in LOCAL_RATE_LIMIT_METRICS:
aggregator.assert_metric(metric)
for tag in STAT_PREFIX_TAG:
for tag in RATE_LIMIT_STAT_PREFIX_TAG:
aggregator.assert_metric_has_tag(metric, tag, count=1)

aggregator.assert_metrics_using_metadata(get_metadata_metrics())


def test_connection_limit_metrics(aggregator, fixture_path, mock_http_response, check, dd_run_check):
    """Verify legacy connection limit stats are submitted with their ``stat_prefix`` tag.

    The check is run against the ``legacy/connection_limit.txt`` fixture. Every
    metric in ``CONNECTION_LIMIT_METRICS`` must then carry each tag listed in
    ``CONNECTION_LIMIT_STAT_PREFIX_TAG`` exactly once, and all submitted
    metrics must be declared in the integration metadata.
    """
    envoy_check = check(INSTANCES['main'])

    mock_http_response(file_path=fixture_path('./legacy/connection_limit.txt'))
    dd_run_check(envoy_check)

    # Flatten the metric/tag combinations so each expectation is asserted in turn.
    expected_pairs = [
        (metric_name, expected_tag)
        for metric_name in CONNECTION_LIMIT_METRICS
        for expected_tag in CONNECTION_LIMIT_STAT_PREFIX_TAG
    ]
    for metric_name, expected_tag in expected_pairs:
        aggregator.assert_metric_has_tag(metric_name, expected_tag, count=1)

    aggregator.assert_metrics_using_metadata(get_metadata_metrics())
22 changes: 22 additions & 0 deletions envoy/tests/legacy/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,20 @@ def test_wildcard_removal_tree():
()
),
"method": "monotonic_count",
},
"connection_limit.*.active_connections": {
"tags": (
("stat_prefix",),
()
),
"method": "monotonic_count",
},
"connection_limit.*.limited_connections": {
"tags": (
("stat_prefix",),
()
),
"method": "monotonic_count",
}
}

Expand All @@ -85,6 +99,14 @@ def test_wildcard_removal_tree():
"http_local_rate_limit.enforced": {
"tags": (("stat_prefix",), (), ()),
"method": "monotonic_count",
},
"connection_limit.active_connections": {
"tags": (("stat_prefix",), ()),
"method": "monotonic_count",
},
"connection_limit.limited_connections": {
"tags": (("stat_prefix",), ()),
"method": "monotonic_count",
}
}
# fmt: on
43 changes: 42 additions & 1 deletion envoy/tests/test_unit.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@
from .common import (
CLUSTER_AND_LISTENER_SSL_METRICS,
CONNECT_STATE_METRIC,
CONNECTION_LIMIT_METRICS,
CONNECTION_LIMIT_STAT_PREFIX_TAG,
DEFAULT_INSTANCE,
LOCAL_RATE_LIMIT_METRICS,
MOCKED_PROMETHEUS_METRICS,
Expand Down Expand Up @@ -46,7 +48,12 @@ def test_check(aggregator, dd_run_check, check, mock_http_response):

dd_run_check(c)

for metric in MOCKED_PROMETHEUS_METRICS + LOCAL_RATE_LIMIT_METRICS + CLUSTER_AND_LISTENER_SSL_METRICS:
for metric in (
MOCKED_PROMETHEUS_METRICS
+ LOCAL_RATE_LIMIT_METRICS
+ CONNECTION_LIMIT_METRICS
+ CLUSTER_AND_LISTENER_SSL_METRICS
):
aggregator.assert_metric("envoy.{}".format(metric))

for metric in CONNECT_STATE_METRIC:
Expand Down Expand Up @@ -130,6 +137,40 @@ def test_local_rate_limit_metrics(aggregator, dd_run_check, check, mock_http_res
aggregator.assert_metrics_using_metadata(get_metadata_metrics())


@requires_py3
@pytest.mark.parametrize(
    'fixture_file',
    [
        pytest.param('openmetrics.txt', id="Envoy < 1.29"),
        pytest.param('openmetrics_1_29.txt', id="Envoy >= 1.29"),
    ],
)
def test_connection_limit_metrics(aggregator, dd_run_check, check, mock_http_response, fixture_file):
    """Verify connection limit metrics and their ``stat_prefix`` tag for both exposition formats.

    ``fixture_file`` names an OpenMetrics payload under ``./openmetrics/``; one
    fixture embeds the stat_prefix in the metric name and the other carries it
    in a label. Both must produce the same tagged metrics.
    """
    # Envoy 1.29+ moved the variable stat_prefix out of the metric name and into a label,
    # which follows the normal OpenMetrics convention. Older versions still embed the
    # stat_prefix inside the metric name.
    # https://github.com/envoyproxy/envoy/commit/ea71e737298a03298f478489c181395629a21ce3
    mock_http_response(file_path=get_fixture_path('./openmetrics/{}'.format(fixture_file)))

    envoy_check = check(DEFAULT_INSTANCE)
    dd_run_check(envoy_check)

    for name in CONNECTION_LIMIT_METRICS:
        full_name = 'envoy.{}'.format(name)
        aggregator.assert_metric(full_name)
        aggregator.assert_metric_has_tag(full_name, CONNECTION_LIMIT_STAT_PREFIX_TAG)

    aggregator.assert_service_check(
        "envoy.openmetrics.health",
        status=AgentCheck.OK,
        tags=['endpoint:http://localhost:8001/stats/prometheus'],
    )

    aggregator.assert_no_duplicate_metrics()
    aggregator.assert_metrics_using_metadata(get_metadata_metrics())


@requires_py3
def test_tags_in_ssl_metrics(aggregator, dd_run_check, check, mock_http_response):
mock_http_response(file_path=get_fixture_path('./openmetrics/openmetrics_ssl_metrics.txt'))
Expand Down

0 comments on commit 87b76c4

Please sign in to comment.