From 18abf96516d5bd414dbbc11e62c4215035b3d78f Mon Sep 17 00:00:00 2001 From: William Dauchy Date: Sun, 28 Jan 2024 14:18:20 +0100 Subject: [PATCH] envoy: add connection limits metrics see also https://www.envoyproxy.io/docs/envoy/latest/configuration/listeners/network_filters/connection_limit_filter - Add envoy connection limits metrics - unit test - changelog - Update metadata.csv - adapt stat_prefix behaviour to 1.29 see also https://github.com/envoyproxy/envoy/commit/ea71e737298a03298f478489c181395629a21ce3 Signed-off-by: William Dauchy fix metric map --- envoy/changelog.d/16718.added | 1 + envoy/datadog_checks/envoy/check.py | 11 +++++ envoy/datadog_checks/envoy/metrics.py | 16 +++++++ envoy/metadata.csv | 3 ++ envoy/tests/common.py | 13 +++++- envoy/tests/docker/api_v3/front-envoy.yaml | 6 +++ .../fixtures/legacy/connection_limit.txt | 2 + envoy/tests/fixtures/legacy/stat_prefix | 2 + .../fixtures/openmetrics/openmetrics.txt | 4 ++ .../fixtures/openmetrics/openmetrics_1_29.txt | 4 ++ envoy/tests/legacy/common.py | 9 +++- envoy/tests/legacy/test_unit.py | 21 +++++++-- envoy/tests/legacy/test_utils.py | 22 ++++++++++ envoy/tests/test_integration.py | 3 +- envoy/tests/test_unit.py | 43 ++++++++++++++++++- 15 files changed, 152 insertions(+), 8 deletions(-) create mode 100644 envoy/changelog.d/16718.added create mode 100644 envoy/tests/fixtures/legacy/connection_limit.txt create mode 100644 envoy/tests/fixtures/openmetrics/openmetrics_1_29.txt diff --git a/envoy/changelog.d/16718.added b/envoy/changelog.d/16718.added new file mode 100644 index 00000000000000..43a4f0f3f5ebd7 --- /dev/null +++ b/envoy/changelog.d/16718.added @@ -0,0 +1 @@ +Add connection limit metrics for envoy diff --git a/envoy/datadog_checks/envoy/check.py b/envoy/datadog_checks/envoy/check.py index 656569ca27535b..dbc3576ca20b32 100644 --- a/envoy/datadog_checks/envoy/check.py +++ b/envoy/datadog_checks/envoy/check.py @@ -17,6 +17,7 @@ 'cluster_name': 'envoy_cluster', 'envoy_cluster_name': 'envoy_cluster', 'envoy_local_http_ratelimit_prefix': 'stat_prefix', # local rate limit + 'envoy_connection_limit_prefix': 'stat_prefix', # connection limit 'envoy_http_conn_manager_prefix': 'stat_prefix', # tracing 'envoy_listener_address': 'address', # listener 'envoy_virtual_cluster': 'virtual_envoy_cluster', # vhost @@ -90,6 +91,16 @@ 'metric_type': 'monotonic_count', 'new_name': 'listener.downstream_cx.count', }, + r'envoy_connection_limit_(.+)_active_connections$': { + 'label_name': 'stat_prefix', + 'metric_type': 'gauge', + 'new_name': 'connection_limit.active_connections', + }, + r'envoy_connection_limit_(.+)_limited_connections$': { + 'label_name': 'stat_prefix', + 'metric_type': 'monotonic_count', + 'new_name': 'connection_limit.limited_connections.count', + }, r'envoy_(.+)_http_local_rate_limit_enabled$': { 'label_name': 'stat_prefix', 'metric_type': 'monotonic_count', diff --git a/envoy/datadog_checks/envoy/metrics.py b/envoy/datadog_checks/envoy/metrics.py index a3a949989ca247..1eee9e9fb68b8f 100644 --- a/envoy/datadog_checks/envoy/metrics.py +++ b/envoy/datadog_checks/envoy/metrics.py @@ -383,6 +383,8 @@ 'envoy_cluster_client_ssl_socket_factory_ssl_context_update_by_sds': 'cluster.client_ssl_socket_factory.ssl_context_update_by_sds', # noqa: E501 'envoy_cluster_client_ssl_socket_factory_upstream_context_secrets_not_ready': 'cluster.client_ssl_socket_factory.upstream_context_secrets_not_ready', # noqa: E501 'envoy_cluster_client_ssl_socket_factory_downstream_context_secrets_not_ready': 'cluster.client_ssl_socket_factory.downstream_context_secrets_not_ready', # noqa: E501 + 'envoy_connection_limit_active_connections': 'connection_limit.active_connections', + 'envoy_connection_limit_limited_connections': 'connection_limit.limited_connections', } # fmt: off @@ -3933,6 +3935,20 @@ ), 'method': 'monotonic_count', }, + 'connection_limit.active_connections': { + 'tags': ( + ('stat_prefix',), + (), + ), + 'method': 'gauge', + }, + 'connection_limit.limited_connections': { + 'tags': ( + ('stat_prefix',), + (), + ), + 'method': 'monotonic_count', + }, } # fmt: on diff --git a/envoy/metadata.csv b/envoy/metadata.csv index 9d9ced2053061d..18b5da051d9e54 100644 --- a/envoy/metadata.csv +++ b/envoy/metadata.csv @@ -149,6 +149,9 @@ envoy.cluster_manager.cluster_updated.count,count,,,,[OpenMetrics V2] Total clus envoy.cluster_manager.custer_updated_via_merge.count,count,,,,[OpenMetrics V2],0,envoy,, envoy.cluster_manager.update_merge_cancelled.count,count,,,,[OpenMetrics V2] Total merged updates that got cancelled and delivered early,0,envoy,, envoy.cluster_manager.update_out_of_merge_window.count,count,,,,[OpenMetrics V2] Total updates which arrived out of a merge window,0,envoy,, +envoy.connection_limit.active_connections,gauge,,,,[OpenMetrics V2] Number of currently active connections in the scope of this network filter chain,0,envoy,, +envoy.connection_limit.limited_connections.count,count,,,,[OpenMetrics V2] Total connections that have been rejected due to connection limit exceeded,0,envoy,, +envoy.connection_limit.limited_connections,count,,,,[Legacy] Total connections that have been rejected due to connection limit exceeded,0,envoy,, envoy.filesystem.flushed_by_timer.count,count,,,,[OpenMetrics V2],0,envoy,, envoy.filesystem.reopen_failed.count,count,,,,[OpenMetrics V2],0,envoy,, envoy.filesystem.write_buffered.count,count,,,,[OpenMetrics V2],0,envoy,, diff --git a/envoy/tests/common.py b/envoy/tests/common.py index 56556ca8ab4c73..ed763aa2144f67 100644 --- a/envoy/tests/common.py +++ b/envoy/tests/common.py @@ -376,6 +376,13 @@ "vhost.vcluster.upstream_rq.count", ] +CONNECTION_LIMIT_METRICS = [ + "connection_limit.active_connections", + "connection_limit.limited_connections.count", +] + +CONNECTION_LIMIT_STAT_PREFIX_TAG = 'stat_prefix:ingress_http' + LOCAL_RATE_LIMIT_METRICS = [ "http.local_rate_limit_enabled.count", "http.local_rate_limit_enforced.count", @@ -383,6 +390,8 @@ "http.local_rate_limit_ok.count", ] +RATE_LIMIT_STAT_PREFIX_TAG = 'stat_prefix:http_local_rate_limiter' + CLUSTER_AND_LISTENER_SSL_METRICS = [ "cluster.client_ssl_socket_factory.downstream_context_secrets_not_ready.count", "cluster.client_ssl_socket_factory.ssl_context_update_by_sds.count", @@ -394,8 +403,6 @@ CONNECT_STATE_METRIC = ['control_plane.connected_state'] -RATE_LIMIT_STAT_PREFIX_TAG = 'stat_prefix:http_local_rate_limiter' - FLAKY_METRICS = [ "listener.downstream_cx_active", "listener.downstream_cx_destroy.count", @@ -731,6 +738,8 @@ "tcp.on_demand_cluster_timeout.count", "tcp.upstream_flush.count", "tcp.upstream_flush_active", + "connection_limit.active_connections", + "connection_limit.limited_connections.count", ] diff --git a/envoy/tests/docker/api_v3/front-envoy.yaml b/envoy/tests/docker/api_v3/front-envoy.yaml index ef613f8b944369..1849c6ea5af129 100644 --- a/envoy/tests/docker/api_v3/front-envoy.yaml +++ b/envoy/tests/docker/api_v3/front-envoy.yaml @@ -24,6 +24,12 @@ static_resources: socket_address: {address: 0.0.0.0, port_value: 80} filter_chains: - filters: + - name: envoy.filters.network.connection_limit + typed_config: + "@type": type.googleapis.com/envoy.extensions.filters.network.connection_limit.v3.ConnectionLimit + stat_prefix: ingress_http + max_connections: 1000 + delay: 0s - name: envoy.http_connection_manager typed_config: "@type": type.googleapis.com/envoy.extensions.filters.network.http_connection_manager.v3.HttpConnectionManager diff --git a/envoy/tests/fixtures/legacy/connection_limit.txt b/envoy/tests/fixtures/legacy/connection_limit.txt new file mode 100644 index 00000000000000..775f988c949c27 --- /dev/null +++ b/envoy/tests/fixtures/legacy/connection_limit.txt @@ -0,0 +1,2 @@ +connection_limit.ingress_http.active_connections: 0 +connection_limit.ingress_http.limited_connections: 0 diff --git a/envoy/tests/fixtures/legacy/stat_prefix b/envoy/tests/fixtures/legacy/stat_prefix index b3a191b8ab7194..1ad6efc9e5bb45 100644 --- a/envoy/tests/fixtures/legacy/stat_prefix +++ b/envoy/tests/fixtures/legacy/stat_prefix @@ -8,3 +8,5 @@ cluster.foo.ext_authz.bar.disabled: 6 cluster.foo.ext_authz.bar.error: 7 cluster.foo.ext_authz.bar.failure_mode_allowed: 8 cluster.foo.ext_authz.bar.ok: 9 +connection_limit.ingress_http.active_connections: 0 +connection_limit.ingress_http.limited_connections: 0 diff --git a/envoy/tests/fixtures/openmetrics/openmetrics.txt b/envoy/tests/fixtures/openmetrics/openmetrics.txt index 99906261ee4aae..66e1a95f547e4e 100644 --- a/envoy/tests/fixtures/openmetrics/openmetrics.txt +++ b/envoy/tests/fixtures/openmetrics/openmetrics.txt @@ -521,6 +521,10 @@ envoy_filesystem_write_buffered{} 7 envoy_http_downstream_cx_upgrades_total{envoy_http_conn_manager_prefix="admin"} 0 # TYPE envoy_cluster_manager_cluster_removed counter envoy_cluster_manager_cluster_removed{} 0 +# TYPE envoy_connection_limit_ingress_http_active_connections gauge +envoy_connection_limit_ingress_http_active_connections{} 0 +# TYPE envoy_connection_limit_ingress_http_limited_connections counter +envoy_connection_limit_ingress_http_limited_connections{} 0 # TYPE envoy_server_debug_assertion_failures counter envoy_server_debug_assertion_failures{} 0 # TYPE envoy_server_worker_3_watchdog_miss counter diff --git a/envoy/tests/fixtures/openmetrics/openmetrics_1_29.txt b/envoy/tests/fixtures/openmetrics/openmetrics_1_29.txt new file mode 100644 index 00000000000000..254f58f0e25c30 --- /dev/null +++ b/envoy/tests/fixtures/openmetrics/openmetrics_1_29.txt @@ -0,0 +1,4 @@ +# TYPE envoy_connection_limit_active_connections gauge +envoy_connection_limit_active_connections{envoy_connection_limit_prefix="ingress_http"} 0 +# TYPE envoy_connection_limit_limited_connections counter +envoy_connection_limit_limited_connections{envoy_connection_limit_prefix="ingress_http"} 0 diff --git a/envoy/tests/legacy/common.py b/envoy/tests/legacy/common.py index ddac5df67f9317..65502ddd7f837c 100644 --- a/envoy/tests/legacy/common.py +++ b/envoy/tests/legacy/common.py @@ -52,7 +52,14 @@ "envoy.http_local_rate_limit.ok", ] -STAT_PREFIX_TAG = ['stat_prefix:http_local_rate_limiter', 'stat_prefix:foo_buz_112'] +RATE_LIMIT_STAT_PREFIX_TAG = ['stat_prefix:http_local_rate_limiter', 'stat_prefix:foo_buz_112'] + +CONNECTION_LIMIT_METRICS = [ + "envoy.connection_limit.active_connections", + "envoy.connection_limit.limited_connections", +] + +CONNECTION_LIMIT_STAT_PREFIX_TAG = ['stat_prefix:ingress_http'] RBAC_METRICS = [ "envoy.http.rbac.allowed", diff --git a/envoy/tests/legacy/test_unit.py b/envoy/tests/legacy/test_unit.py index dc3a359e06c509..cf6d7a84ec5e3e 100644 --- a/envoy/tests/legacy/test_unit.py +++ b/envoy/tests/legacy/test_unit.py @@ -12,14 +12,16 @@ from datadog_checks.envoy.metrics import METRIC_PREFIX, METRICS from .common import ( + CONNECTION_LIMIT_METRICS, + CONNECTION_LIMIT_STAT_PREFIX_TAG, ENVOY_VERSION, EXT_METRICS, FLAVOR, HOST, INSTANCES, LOCAL_RATE_LIMIT_METRICS, + RATE_LIMIT_STAT_PREFIX_TAG, RBAC_METRICS, - STAT_PREFIX_TAG, ) CHECK_NAME = 'envoy' @@ -289,7 +291,7 @@ def test_stats_prefix_optional_tags( standard_tags.append('endpoint:{}'.format(instance["stats_url"])) tags_prefix = standard_tags + additional_tags c = check(instance) - mock_http_response(file_path=fixture_path(fixture_file)).return_value + mock_http_response(file_path=fixture_path(fixture_file)) dd_run_check(c) # To ensure that this change didn't break the old behavior, both the value and the tags are asserted. @@ -313,7 +315,20 @@ def test_local_rate_limit_metrics(aggregator, fixture_path, mock_http_response, for metric in LOCAL_RATE_LIMIT_METRICS: aggregator.assert_metric(metric) - for tag in STAT_PREFIX_TAG: + for tag in RATE_LIMIT_STAT_PREFIX_TAG: + aggregator.assert_metric_has_tag(metric, tag, count=1) + + aggregator.assert_metrics_using_metadata(get_metadata_metrics()) + + +def test_connection_limit_metrics(aggregator, fixture_path, mock_http_response, check, dd_run_check): + instance = INSTANCES['main'] + c = check(instance) + + mock_http_response(file_path=fixture_path('./legacy/connection_limit.txt')) + dd_run_check(c) + for metric in CONNECTION_LIMIT_METRICS: + for tag in CONNECTION_LIMIT_STAT_PREFIX_TAG: aggregator.assert_metric_has_tag(metric, tag, count=1) aggregator.assert_metrics_using_metadata(get_metadata_metrics()) diff --git a/envoy/tests/legacy/test_utils.py b/envoy/tests/legacy/test_utils.py index 8d8ebe6730d836..2ec6bafd79255c 100644 --- a/envoy/tests/legacy/test_utils.py +++ b/envoy/tests/legacy/test_utils.py @@ -74,6 +74,20 @@ def test_wildcard_removal_tree(): () ), "method": "monotonic_count", + }, + "connection_limit.*.active_connections": { + "tags": ( + ("stat_prefix",), + () + ), + "method": "monotonic_count", + }, + "connection_limit.*.limited_connections": { + "tags": ( + ("stat_prefix",), + () + ), + "method": "monotonic_count", } } @@ -85,6 +99,14 @@ def test_wildcard_removal_tree(): "http_local_rate_limit.enforced": { "tags": (("stat_prefix",), (), ()), "method": "monotonic_count", + }, + "connection_limit.active_connections": { + "tags": (("stat_prefix",), ()), + "method": "monotonic_count", + }, + "connection_limit.limited_connections": { + "tags": (("stat_prefix",), ()), + "method": "monotonic_count", } } # fmt: on diff --git a/envoy/tests/test_integration.py b/envoy/tests/test_integration.py index 4020349d4baa93..efdca881192ec6 100644 --- a/envoy/tests/test_integration.py +++ b/envoy/tests/test_integration.py @@ -9,6 +9,7 @@ from datadog_checks.envoy.metrics import METRIC_PREFIX, METRICS from .common import ( + CONNECTION_LIMIT_METRICS, DEFAULT_INSTANCE, ENVOY_VERSION, FLAKY_METRICS, @@ -31,7 +32,7 @@ def test_check(aggregator, dd_run_check, check): dd_run_check(c) dd_run_check(c) - for metric in PROMETHEUS_METRICS + LOCAL_RATE_LIMIT_METRICS: + for metric in PROMETHEUS_METRICS + LOCAL_RATE_LIMIT_METRICS + CONNECTION_LIMIT_METRICS: formatted_metric = "envoy.{}".format(metric) if metric in FLAKY_METRICS: aggregator.assert_metric(formatted_metric, at_least=0) diff --git a/envoy/tests/test_unit.py b/envoy/tests/test_unit.py index e7d1e00011c7b4..6249f61dd8fc13 100644 --- a/envoy/tests/test_unit.py +++ b/envoy/tests/test_unit.py @@ -12,6 +12,8 @@ from .common import ( CLUSTER_AND_LISTENER_SSL_METRICS, CONNECT_STATE_METRIC, + CONNECTION_LIMIT_METRICS, + CONNECTION_LIMIT_STAT_PREFIX_TAG, DEFAULT_INSTANCE, LOCAL_RATE_LIMIT_METRICS, MOCKED_PROMETHEUS_METRICS, @@ -46,7 +48,12 @@ def test_check(aggregator, dd_run_check, check, mock_http_response): dd_run_check(c) - for metric in MOCKED_PROMETHEUS_METRICS + LOCAL_RATE_LIMIT_METRICS + CLUSTER_AND_LISTENER_SSL_METRICS: + for metric in ( + MOCKED_PROMETHEUS_METRICS + + LOCAL_RATE_LIMIT_METRICS + + CONNECTION_LIMIT_METRICS + + CLUSTER_AND_LISTENER_SSL_METRICS + ): aggregator.assert_metric("envoy.{}".format(metric)) for metric in CONNECT_STATE_METRIC: @@ -130,6 +137,40 @@ def test_local_rate_limit_metrics(aggregator, dd_run_check, check, mock_http_res aggregator.assert_metrics_using_metadata(get_metadata_metrics()) +@requires_py3 +@pytest.mark.parametrize( + 'fixture_file', + [ + 'openmetrics.txt', + 'openmetrics_1_29.txt', + ], + ids=[ + "Envoy < 1.29", + "Envoy >= 1.29", + ], +) +def test_connection_limit_metrics(aggregator, dd_run_check, check, mock_http_response, fixture_file): + # Envoy 1.29+ fixed this metric by moving the variable stat_prefix into a label which follows the normal + # OpenMetrics convention. However older versions still have the stat_prefix inside the metric name. + # https://github.com/envoyproxy/envoy/commit/ea71e737298a03298f478489c181395629a21ce3 + mock_http_response(file_path=get_fixture_path('./openmetrics/{}'.format(fixture_file))) + + c = check(DEFAULT_INSTANCE) + + dd_run_check(c) + + for metric in CONNECTION_LIMIT_METRICS: + aggregator.assert_metric('envoy.{}'.format(metric)) + aggregator.assert_metric_has_tag('envoy.{}'.format(metric), CONNECTION_LIMIT_STAT_PREFIX_TAG) + + aggregator.assert_service_check( + "envoy.openmetrics.health", status=AgentCheck.OK, tags=['endpoint:http://localhost:8001/stats/prometheus'] + ) + + aggregator.assert_no_duplicate_metrics() + aggregator.assert_metrics_using_metadata(get_metadata_metrics()) + + @requires_py3 def test_tags_in_ssl_metrics(aggregator, dd_run_check, check, mock_http_response): mock_http_response(file_path=get_fixture_path('./openmetrics/openmetrics_ssl_metrics.txt'))