From 55391c0fcd930248c4dfda5114d915ee4a529ce5 Mon Sep 17 00:00:00 2001 From: Alex Lopez Date: Tue, 5 Jul 2022 16:30:47 +0200 Subject: [PATCH] Add ext_authz and ratelimiter metrics to OpenMetrics implementation (#12451) * Improve integration testing * Improve integration testing: external authorization And add missing metadata * Simplify ratelimiter and external authorization test setup * Add external authorization metrics to OpenMetricsV2 implementation * Add ratelimit metrics to OpenMetrics V2 implementation * Fix test running for api v2 --- envoy/datadog_checks/envoy/metrics.py | 9 +++ envoy/metadata.csv | 15 +++++ envoy/tests/common.py | 5 ++ envoy/tests/conftest.py | 8 ++- envoy/tests/docker/api_v2/Dockerfile-service | 2 +- envoy/tests/docker/api_v3/Dockerfile-service | 2 +- envoy/tests/docker/api_v3/front-envoy.yaml | 68 +++++++++++++++++++- envoy/tests/test_check.py | 24 ++++++- 8 files changed, 127 insertions(+), 6 deletions(-) diff --git a/envoy/datadog_checks/envoy/metrics.py b/envoy/datadog_checks/envoy/metrics.py index 10a36034e4102..8eb6151dfd7b0 100644 --- a/envoy/datadog_checks/envoy/metrics.py +++ b/envoy/datadog_checks/envoy/metrics.py @@ -10,6 +10,11 @@ 'envoy_cluster_assignment_timeout_received': 'cluster.assignment_timeout_received', 'envoy_cluster_bind_errors': 'cluster.bind_errors', 'envoy_cluster_default_total_match_count': 'cluster.default_total_match', + 'envoy_cluster_ext_authz_ok': 'cluster.ext_authz.ok', + 'envoy_cluster_ext_authz_error': 'cluster.ext_authz.error', + 'envoy_cluster_ext_authz_denied': 'cluster.ext_authz.denied', + 'envoy_cluster_ext_authz_disabled': 'cluster.ext_authz.disabled', + 'envoy_cluster_ext_authz_failure_mode_allowed': 'cluster.ext_authz.failure_mode_allowed', 'envoy_cluster_http2_dropped_headers_with_underscores': 'cluster.http2.dropped_headers_with_underscores', 'envoy_cluster_http2_header_overflow': 'cluster.http2.header_overflow', 'envoy_cluster_http2_headers_cb_no_stream': 
'cluster.http2.headers_cb_no_stream', @@ -47,6 +52,10 @@ 'envoy_cluster_lb_zone_routing_sampled': 'cluster.lb_zone_routing_sampled', 'envoy_cluster_membership_change': 'cluster.membership_change', 'envoy_cluster_original_dst_host_invalid': 'cluster.original_dst_host_invalid', + 'envoy_cluster_ratelimit_ok': 'cluster.ratelimit.ok', + 'envoy_cluster_ratelimit_error': 'cluster.ratelimit.error', + 'envoy_cluster_ratelimit_over_limit': 'cluster.ratelimit.over_limit', + 'envoy_cluster_ratelimit_failure_mode_allowed': 'cluster.ratelimit.failure_mode_allowed', 'envoy_cluster_retry_or_shadow_abandoned': 'cluster.retry_or_shadow_abandoned', 'envoy_cluster_update_attempt': 'cluster.update_attempt', 'envoy_cluster_update_empty': 'cluster.update_empty', diff --git a/envoy/metadata.csv b/envoy/metadata.csv index 2ee38e8371527..7bef7c7d98571 100644 --- a/envoy/metadata.csv +++ b/envoy/metadata.csv @@ -89,6 +89,15 @@ envoy.cluster.upstream_rq_rx_reset.count,count,,,,[OpenMetrics V2] Total request envoy.cluster.upstream_rq_timeout.count,count,,,,[OpenMetrics V2] Total requests that timed out waiting for a response,0,envoy,, envoy.cluster.upstream_rq_tx_reset.count,count,,,,[OpenMetrics V2] Total requests that were reset locally,0,envoy,, envoy.cluster.upstream_rq_xx.count,count,,,,"[OpenMetrics V2] Aggregate HTTP response codes (e.g., 2xx, 3xx, etc.)",0,envoy,, +envoy.cluster.ratelimit.ok.count,count,,response,,[OpenMetrics V2] Total under limit responses from the rate limit service,1,envoy,, +envoy.cluster.ratelimit.error.count,count,,response,,[OpenMetrics V2] Total errors contacting the rate limit service,-1,envoy,, +envoy.cluster.ratelimit.over_limit.count,count,,response,,[OpenMetrics V2] Total over limit responses from the rate limit service,-1,envoy,, +envoy.cluster.ratelimit.failure_mode_allowed.count,count,,response,,[OpenMetrics V2] Total requests that errored when contacting the rate limit service but were allowed through because of failure_mode_deny being set to 
false,-1,envoy,, +envoy.cluster.ext_authz.ok.count,count,,response,,[OpenMetrics V2] Total responses from the external authorization service,1,envoy,, +envoy.cluster.ext_authz.error.count,count,,response,,[OpenMetrics V2] Total errors contacting the external authorization service,-1,envoy,, +envoy.cluster.ext_authz.denied.count,count,,response,,[OpenMetrics V2] Total responses from the external authorization service denying the traffic,-1,envoy,, +envoy.cluster.ext_authz.disabled.count,count,,response,,[OpenMetrics V2] Total requests allowed without calling the external authorization services due to the filter being disabled,-1,envoy,, +envoy.cluster.ext_authz.failure_mode_allowed.count,count,,response,,[OpenMetrics V2] Total requests that errored when contacting the external authorization service but were allowed through because of failure_mode_allow being set to true,-1,envoy,, envoy.cluster_manager.cds.control_plane.rate_limit_enforced.count,count,,occurrence,,[OpenMetrics V2] Total number of times rate limit was enforced for management server requests,0,envoy,, envoy.cluster_manager.cds.init_fetch_timeout.count,count,,,,[OpenMetrics V2] Total initial fetch timeouts,0,envoy,, envoy.cluster_manager.cds.update_attempt.count,count,,,,[OpenMetrics V2] Total attempted cluster membership updates by service discovery,0,envoy,, @@ -269,6 +278,12 @@ envoy.vhost.vcluster.upstream_rq_total,count,,request,,[Legacy] Total requests i envoy.cluster.ratelimit.ok,count,,response,,[Legacy] Total under limit responses from the rate limit service,1,envoy,, envoy.cluster.ratelimit.error,count,,response,,[Legacy] Total errors contacting the rate limit service,-1,envoy,, envoy.cluster.ratelimit.over_limit,count,,response,,[Legacy] Total over limit responses from the rate limit service,-1,envoy,, +envoy.cluster.ratelimit.failure_mode_allowed,count,,response,,[Legacy] Total requests that errored when contacting the rate limit service but were allowed through because of 
failure_mode_deny being set to false,-1,envoy,, +envoy.cluster.ext_authz.ok,count,,response,,[Legacy] Total responses from the external authorization service,1,envoy,, +envoy.cluster.ext_authz.error,count,,response,,[Legacy] Total errors contacting the external authorization service,-1,envoy,, +envoy.cluster.ext_authz.denied,count,,response,,[Legacy] Total responses from the external authorization service denying the traffic,-1,envoy,, +envoy.cluster.ext_authz.disabled,count,,response,,[Legacy] Total requests allowed without calling the external authorization services due to the filter being disabled,-1,envoy,, +envoy.cluster.ext_authz.failure_mode_allowed,count,,response,,[Legacy] Total requests that errored when contacting the external authorization service but were allowed through because of failure_mode_allow being set to true,-1,envoy,, envoy.http.ip_tagging.hit,count,,request,,[Legacy] Total number of requests that have the tag_name tag applied to it,0,envoy,, envoy.http.ip_tagging.no_hit,count,,request,,[Legacy] Total number of requests with no applicable IP tags,0,envoy,, envoy.http.ip_tagging.total,count,,request,,[Legacy] Total number of requests the IP Tagging Filter operated on,0,envoy,, diff --git a/envoy/tests/common.py b/envoy/tests/common.py index 6be6637281121..51ad61f0b1ad5 100644 --- a/envoy/tests/common.py +++ b/envoy/tests/common.py @@ -15,6 +15,7 @@ URL = 'http://{}:{}'.format(HOST, PORT) DEFAULT_INSTANCE = {'openmetrics_endpoint': '{}/stats/prometheus'.format(URL)} +LEGACY_INSTANCE = {'stats_url': '{}/stats'.format(URL)} requires_new_environment = pytest.mark.skipif(ENVOY_LEGACY != 'false', reason='Requires prometheus environment') PROMETHEUS_METRICS = [ @@ -27,6 +28,8 @@ "cluster.circuit_breakers.rq_pending_open", "cluster.circuit_breakers.rq_retry_open", "cluster.default_total_match.count", + "cluster.ext_authz.error.count", + "cluster.ext_authz.failure_mode_allowed.count", "cluster.http1.dropped_headers_with_underscores.count", 
"cluster.http1.metadata_not_supported_error.count", "cluster.http1.requests_rejected_with_underscores_in_headers.count", @@ -74,6 +77,8 @@ "cluster.membership_healthy", "cluster.membership_total", "cluster.original_dst_host_invalid.count", + "cluster.ratelimit.error.count", + "cluster.ratelimit.failure_mode_allowed.count", "cluster.retry_or_shadow_abandoned.count", "cluster.update_attempt.count", "cluster.update_empty.count", diff --git a/envoy/tests/conftest.py b/envoy/tests/conftest.py index e7aa495e43cdc..12f5390f0384b 100644 --- a/envoy/tests/conftest.py +++ b/envoy/tests/conftest.py @@ -1,11 +1,12 @@ import os import pytest +import requests from datadog_checks.dev import docker_run from datadog_checks.envoy import Envoy -from .common import DEFAULT_INSTANCE, DOCKER_DIR, ENVOY_LEGACY, FIXTURE_DIR, URL +from .common import DEFAULT_INSTANCE, DOCKER_DIR, ENVOY_LEGACY, FIXTURE_DIR, HOST, URL from .legacy.common import FLAVOR, INSTANCES @@ -25,8 +26,11 @@ def dd_environment(): os.path.join(DOCKER_DIR, FLAVOR, 'docker-compose.yaml'), build=True, endpoints="{}/stats".format(URL), - log_patterns=['all dependencies initialized. starting workers'], + log_patterns=['front-envoy(.*?)all dependencies initialized. 
starting workers'], ): + # Exercising envoy a bit will trigger extra metrics + requests.get('http://{}:8000/service/1'.format(HOST)) + requests.get('http://{}:8000/service/2'.format(HOST)) yield instance diff --git a/envoy/tests/docker/api_v2/Dockerfile-service b/envoy/tests/docker/api_v2/Dockerfile-service index 05c34fce0ac6b..5c2831e2a7a20 100644 --- a/envoy/tests/docker/api_v2/Dockerfile-service +++ b/envoy/tests/docker/api_v2/Dockerfile-service @@ -1,7 +1,7 @@ FROM envoyproxy/envoy-alpine:v1.14.1 RUN apk update && apk add python3 bash py-pip RUN python3 --version && pip3 --version -RUN pip3 install -q Flask==0.11.1 requests==2.18.4 +RUN pip3 install -q Flask==2.1.2 requests==2.28.1 RUN mkdir /code ADD ./service.py /code ADD ./start_service.sh /usr/local/bin/start_service.sh diff --git a/envoy/tests/docker/api_v3/Dockerfile-service b/envoy/tests/docker/api_v3/Dockerfile-service index b2f1fa5d717e7..89242f0e9f82c 100644 --- a/envoy/tests/docker/api_v3/Dockerfile-service +++ b/envoy/tests/docker/api_v3/Dockerfile-service @@ -1,7 +1,7 @@ FROM envoyproxy/envoy-alpine:v1.18.3 RUN apk update && apk add python3 bash py-pip RUN python3 --version && pip3 --version -RUN pip3 install -q Flask==0.11.1 requests==2.18.4 +RUN pip3 install -q Flask==2.1.2 requests==2.28.1 RUN mkdir /code ADD ./service.py /code ADD ./start_service.sh /usr/local/bin/start_service.sh diff --git a/envoy/tests/docker/api_v3/front-envoy.yaml b/envoy/tests/docker/api_v3/front-envoy.yaml index a3ff3855224d3..ad6047fc0a13b 100644 --- a/envoy/tests/docker/api_v3/front-envoy.yaml +++ b/envoy/tests/docker/api_v3/front-envoy.yaml @@ -43,8 +43,40 @@ static_resources: prefix: "/service/2" route: cluster: service2 + rate_limits: + - stage: 0 + actions: + - {source_cluster: {}} + typed_per_filter_config: + envoy.filters.http.ext_authz: + "@type": type.googleapis.com/envoy.extensions.filters.http.ext_authz.v3.ExtAuthzPerRoute + disabled: true http_filters: - - name: envoy.router + - name: 
envoy.filters.http.ext_authz + typed_config: + "@type": type.googleapis.com/envoy.extensions.filters.http.ext_authz.v3.ExtAuthz + http_service: + server_uri: + uri: 127.0.0.1:10003 + cluster: ext-authz + timeout: 0.25s + include_peer_certificate: true + failure_mode_allow: true + - name: envoy.filters.http.ratelimit + typed_config: + "@type": type.googleapis.com/envoy.extensions.filters.http.ratelimit.v3.RateLimit + domain: apis + stage: 0 + request_type: external + failure_mode_deny: false + rate_limit_service: + transport_api_version: V3 + grpc_service: + envoy_grpc: + cluster_name: ratelimiter + - name: envoy.filters.http.router + typed_config: + "@type": type.googleapis.com/envoy.extensions.filters.http.router.v3.Router clusters: - name: service1 connect_timeout: 0.25s @@ -88,6 +120,40 @@ static_resources: socket_address: address: xds port_value: 8080 + - name: ratelimiter + type: static + connect_timeout: 0.25s + lb_policy: ROUND_ROBIN + load_assignment: + cluster_name: ratelimiter + endpoints: + - lb_endpoints: + - endpoint: + address: + socket_address: + address: 127.0.0.1 + port_value: 8081 + http2_protocol_options: {} + - name: ext-authz + type: static + typed_extension_protocol_options: + envoy.extensions.upstreams.http.v3.HttpProtocolOptions: + "@type": type.googleapis.com/envoy.extensions.upstreams.http.v3.HttpProtocolOptions + explicit_http_config: + http2_protocol_options: {} + load_assignment: + cluster_name: ext-authz + endpoints: + - lb_endpoints: + - endpoint: + address: + socket_address: + address: 127.0.0.1 + port_value: 10003 + + # This timeout controls the initial TCP handshake timeout - not the timeout for the + # entire request. 
+ connect_timeout: 0.25s dynamic_resources: cds_config: resource_api_version: V3 diff --git a/envoy/tests/test_check.py b/envoy/tests/test_check.py index e49187e5bd57a..d7fe09da13644 100644 --- a/envoy/tests/test_check.py +++ b/envoy/tests/test_check.py @@ -3,7 +3,14 @@ from datadog_checks.dev.utils import get_metadata_metrics from datadog_checks.envoy.metrics import METRIC_PREFIX, METRICS -from .common import DEFAULT_INSTANCE, ENVOY_VERSION, FLAKY_METRICS, PROMETHEUS_METRICS, requires_new_environment +from .common import ( + DEFAULT_INSTANCE, + ENVOY_VERSION, + FLAKY_METRICS, + LEGACY_INSTANCE, + PROMETHEUS_METRICS, + requires_new_environment, +) pytestmark = [requires_new_environment] @@ -17,6 +24,7 @@ @pytest.mark.usefixtures('dd_environment') def test_check(aggregator, dd_run_check, check): c = check(DEFAULT_INSTANCE) + dd_run_check(c) dd_run_check(c) @@ -40,6 +48,20 @@ def test_check(aggregator, dd_run_check, check): aggregator.assert_metrics_using_metadata(get_metadata_metrics()) +@pytest.mark.integration +@pytest.mark.usefixtures('dd_environment') +def test_check_legacy(aggregator, dd_run_check, check): + c = check(LEGACY_INSTANCE) + + dd_run_check(c) + + metadata_metrics = get_metadata_metrics() + # Metric that has a different type in legacy + metadata_metrics['envoy.cluster.upstream_cx_tx_bytes_total']['metric_type'] = 'count' + + aggregator.assert_metrics_using_metadata(metadata_metrics) + + @pytest.mark.integration @pytest.mark.usefixtures('dd_environment') def test_metadata_integration(aggregator, dd_run_check, datadog_agent, check):