Skip to content

Commit

Permalink
Add ext_authz and ratelimiter metrics to OpenMetrics implementation (#…
Browse files Browse the repository at this point in the history
…12451)

* Improve integration testing

* Improve integration testing: external authorization

And add missing metadata

* Simplify ratelimiter and external authorization test setup

* Add external authorization metrics to OpenMetricsV2 implementation

* Add ratelimit metrics to OpenMetrics V2 implementation

* Fix test running for api v2
  • Loading branch information
alopezz authored Jul 5, 2022
1 parent e8708c2 commit 55391c0
Show file tree
Hide file tree
Showing 8 changed files with 127 additions and 6 deletions.
9 changes: 9 additions & 0 deletions envoy/datadog_checks/envoy/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,11 @@
'envoy_cluster_assignment_timeout_received': 'cluster.assignment_timeout_received',
'envoy_cluster_bind_errors': 'cluster.bind_errors',
'envoy_cluster_default_total_match_count': 'cluster.default_total_match',
'envoy_cluster_ext_authz_ok': 'cluster.ext_authz.ok',
'envoy_cluster_ext_authz_error': 'cluster.ext_authz.error',
'envoy_cluster_ext_authz_denied': 'cluster.ext_authz.denied',
'envoy_cluster_ext_authz_disabled': 'cluster.ext_authz.disabled',
'envoy_cluster_ext_authz_failure_mode_allowed': 'cluster.ext_authz.failure_mode_allowed',
'envoy_cluster_http2_dropped_headers_with_underscores': 'cluster.http2.dropped_headers_with_underscores',
'envoy_cluster_http2_header_overflow': 'cluster.http2.header_overflow',
'envoy_cluster_http2_headers_cb_no_stream': 'cluster.http2.headers_cb_no_stream',
Expand Down Expand Up @@ -47,6 +52,10 @@
'envoy_cluster_lb_zone_routing_sampled': 'cluster.lb_zone_routing_sampled',
'envoy_cluster_membership_change': 'cluster.membership_change',
'envoy_cluster_original_dst_host_invalid': 'cluster.original_dst_host_invalid',
'envoy_cluster_ratelimit_ok': 'cluster.ratelimit.ok',
'envoy_cluster_ratelimit_error': 'cluster.ratelimit.error',
'envoy_cluster_ratelimit_over_limit': 'cluster.ratelimit.over_limit',
'envoy_cluster_ratelimit_failure_mode_allowed': 'cluster.ratelimit.failure_mode_allowed',
'envoy_cluster_retry_or_shadow_abandoned': 'cluster.retry_or_shadow_abandoned',
'envoy_cluster_update_attempt': 'cluster.update_attempt',
'envoy_cluster_update_empty': 'cluster.update_empty',
Expand Down
15 changes: 15 additions & 0 deletions envoy/metadata.csv
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,15 @@ envoy.cluster.upstream_rq_rx_reset.count,count,,,,[OpenMetrics V2] Total request
envoy.cluster.upstream_rq_timeout.count,count,,,,[OpenMetrics V2] Total requests that timed out waiting for a response,0,envoy,,
envoy.cluster.upstream_rq_tx_reset.count,count,,,,[OpenMetrics V2] Total requests that were reset locally,0,envoy,,
envoy.cluster.upstream_rq_xx.count,count,,,,"[OpenMetrics V2] Aggregate HTTP response codes (e.g., 2xx, 3xx, etc.)",0,envoy,,
envoy.cluster.ratelimit.ok.count,count,,response,,[OpenMetrics V2] Total under limit responses from the rate limit service,1,envoy,,
envoy.cluster.ratelimit.error.count,count,,response,,[OpenMetrics V2] Total errors contacting the rate limit service,-1,envoy,,
envoy.cluster.ratelimit.over_limit.count,count,,response,,[OpenMetrics V2] Total over limit responses from the rate limit service,-1,envoy,,
envoy.cluster.ratelimit.failure_mode_allowed.count,count,,response,,[OpenMetrics V2] Total requests that errored when contacting the rate limit service but were allowed through because of failure_mode_deny being set to false,-1,envoy,,
envoy.cluster.ext_authz.ok.count,count,,response,,[OpenMetrics V2] Total responses from the external authorization service,1,envoy,,
envoy.cluster.ext_authz.error.count,count,,response,,[OpenMetrics V2] Total errors contacting the external authorization service,-1,envoy,,
envoy.cluster.ext_authz.denied.count,count,,response,,[OpenMetrics V2] Total responses from the external authorization service denying the traffic,-1,envoy,,
envoy.cluster.ext_authz.disabled.count,count,,response,,[OpenMetrics V2] Total requests allowed without calling the external authorization services due to the filter being disabled,-1,envoy,,
envoy.cluster.ext_authz.failure_mode_allowed.count,count,,response,,[OpenMetrics V2] Total requests that errored when contacting the external authorization service but were allowed through because of failure_mode_allow being set to true,-1,envoy,,
envoy.cluster_manager.cds.control_plane.rate_limit_enforced.count,count,,occurrence,,[OpenMetrics V2] Total number of times rate limit was enforced for management server requests,0,envoy,,
envoy.cluster_manager.cds.init_fetch_timeout.count,count,,,,[OpenMetrics V2] Total initial fetch timeouts,0,envoy,,
envoy.cluster_manager.cds.update_attempt.count,count,,,,[OpenMetrics V2] Total attempted cluster membership updates by service discovery,0,envoy,,
Expand Down Expand Up @@ -269,6 +278,12 @@ envoy.vhost.vcluster.upstream_rq_total,count,,request,,[Legacy] Total requests i
envoy.cluster.ratelimit.ok,count,,response,,[Legacy] Total under limit responses from the rate limit service,1,envoy,,
envoy.cluster.ratelimit.error,count,,response,,[Legacy] Total errors contacting the rate limit service,-1,envoy,,
envoy.cluster.ratelimit.over_limit,count,,response,,[Legacy] Total over limit responses from the rate limit service,-1,envoy,,
envoy.cluster.ratelimit.failure_mode_allowed,count,,response,,[Legacy] Total requests that errored when contacting the rate limit service but were allowed through because of failure_mode_deny being set to false,-1,envoy,,
envoy.cluster.ext_authz.ok,count,,response,,[Legacy] Total responses from the external authorization service,1,envoy,,
envoy.cluster.ext_authz.error,count,,response,,[Legacy] Total errors contacting the external authorization service,-1,envoy,,
envoy.cluster.ext_authz.denied,count,,response,,[Legacy] Total responses from the external authorization service denying the traffic,-1,envoy,,
envoy.cluster.ext_authz.disabled,count,,response,,[Legacy] Total requests allowed without calling the external authorization services due to the filter being disabled,-1,envoy,,
envoy.cluster.ext_authz.failure_mode_allowed,count,,response,,[Legacy] Total requests that errored when contacting the external authorization service but were allowed through because of failure_mode_allow being set to true,-1,envoy,,
envoy.http.ip_tagging.hit,count,,request,,[Legacy] Total number of requests that have the tag_name tag applied to it,0,envoy,,
envoy.http.ip_tagging.no_hit,count,,request,,[Legacy] Total number of requests with no applicable IP tags,0,envoy,,
envoy.http.ip_tagging.total,count,,request,,[Legacy] Total number of requests the IP Tagging Filter operated on,0,envoy,,
Expand Down
5 changes: 5 additions & 0 deletions envoy/tests/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@

URL = 'http://{}:{}'.format(HOST, PORT)
DEFAULT_INSTANCE = {'openmetrics_endpoint': '{}/stats/prometheus'.format(URL)}
LEGACY_INSTANCE = {'stats_url': '{}/stats'.format(URL)}
requires_new_environment = pytest.mark.skipif(ENVOY_LEGACY != 'false', reason='Requires prometheus environment')

PROMETHEUS_METRICS = [
Expand All @@ -27,6 +28,8 @@
"cluster.circuit_breakers.rq_pending_open",
"cluster.circuit_breakers.rq_retry_open",
"cluster.default_total_match.count",
"cluster.ext_authz.error.count",
"cluster.ext_authz.failure_mode_allowed.count",
"cluster.http1.dropped_headers_with_underscores.count",
"cluster.http1.metadata_not_supported_error.count",
"cluster.http1.requests_rejected_with_underscores_in_headers.count",
Expand Down Expand Up @@ -74,6 +77,8 @@
"cluster.membership_healthy",
"cluster.membership_total",
"cluster.original_dst_host_invalid.count",
"cluster.ratelimit.error.count",
"cluster.ratelimit.failure_mode_allowed.count",
"cluster.retry_or_shadow_abandoned.count",
"cluster.update_attempt.count",
"cluster.update_empty.count",
Expand Down
8 changes: 6 additions & 2 deletions envoy/tests/conftest.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
import os

import pytest
import requests

from datadog_checks.dev import docker_run
from datadog_checks.envoy import Envoy

from .common import DEFAULT_INSTANCE, DOCKER_DIR, ENVOY_LEGACY, FIXTURE_DIR, URL
from .common import DEFAULT_INSTANCE, DOCKER_DIR, ENVOY_LEGACY, FIXTURE_DIR, HOST, URL
from .legacy.common import FLAVOR, INSTANCES


Expand All @@ -25,8 +26,11 @@ def dd_environment():
os.path.join(DOCKER_DIR, FLAVOR, 'docker-compose.yaml'),
build=True,
endpoints="{}/stats".format(URL),
log_patterns=['all dependencies initialized. starting workers'],
log_patterns=['front-envoy(.*?)all dependencies initialized. starting workers'],
):
# Exercising envoy a bit will trigger extra metrics
requests.get('http://{}:8000/service/1'.format(HOST))
requests.get('http://{}:8000/service/2'.format(HOST))
yield instance


Expand Down
2 changes: 1 addition & 1 deletion envoy/tests/docker/api_v2/Dockerfile-service
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
FROM envoyproxy/envoy-alpine:v1.14.1
RUN apk update && apk add python3 bash py-pip
RUN python3 --version && pip3 --version
RUN pip3 install -q Flask==0.11.1 requests==2.18.4
RUN pip3 install -q Flask==2.1.2 requests==2.28.1
RUN mkdir /code
ADD ./service.py /code
ADD ./start_service.sh /usr/local/bin/start_service.sh
Expand Down
2 changes: 1 addition & 1 deletion envoy/tests/docker/api_v3/Dockerfile-service
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
FROM envoyproxy/envoy-alpine:v1.18.3
RUN apk update && apk add python3 bash py-pip
RUN python3 --version && pip3 --version
RUN pip3 install -q Flask==0.11.1 requests==2.18.4
RUN pip3 install -q Flask==2.1.2 requests==2.28.1
RUN mkdir /code
ADD ./service.py /code
ADD ./start_service.sh /usr/local/bin/start_service.sh
Expand Down
68 changes: 67 additions & 1 deletion envoy/tests/docker/api_v3/front-envoy.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -43,8 +43,40 @@ static_resources:
prefix: "/service/2"
route:
cluster: service2
rate_limits:
- stage: 0
actions:
- {source_cluster: {}}
typed_per_filter_config:
envoy.filters.http.ext_authz:
"@type": type.googleapis.com/envoy.extensions.filters.http.ext_authz.v3.ExtAuthzPerRoute
disabled: true
http_filters:
- name: envoy.router
- name: envoy.filters.http.ext_authz
typed_config:
"@type": type.googleapis.com/envoy.extensions.filters.http.ext_authz.v3.ExtAuthz
http_service:
server_uri:
uri: 127.0.0.1:10003
cluster: ext-authz
timeout: 0.25s
include_peer_certificate: true
failure_mode_allow: true
- name: envoy.filters.http.ratelimit
typed_config:
"@type": type.googleapis.com/envoy.extensions.filters.http.ratelimit.v3.RateLimit
domain: apis
stage: 0
request_type: external
failure_mode_deny: false
rate_limit_service:
transport_api_version: V3
grpc_service:
envoy_grpc:
cluster_name: ratelimiter
- name: envoy.filters.http.router
typed_config:
"@type": type.googleapis.com/envoy.extensions.filters.http.router.v3.Router
clusters:
- name: service1
connect_timeout: 0.25s
Expand Down Expand Up @@ -88,6 +120,40 @@ static_resources:
socket_address:
address: xds
port_value: 8080
- name: ratelimiter
type: static
connect_timeout: 0.25s
lb_policy: ROUND_ROBIN
load_assignment:
cluster_name: ratelimiter
endpoints:
- lb_endpoints:
- endpoint:
address:
socket_address:
address: 127.0.0.1
port_value: 8081
http2_protocol_options: {}
- name: ext-authz
type: static
typed_extension_protocol_options:
envoy.extensions.upstreams.http.v3.HttpProtocolOptions:
"@type": type.googleapis.com/envoy.extensions.upstreams.http.v3.HttpProtocolOptions
explicit_http_config:
http2_protocol_options: {}
load_assignment:
cluster_name: ext-authz
endpoints:
- lb_endpoints:
- endpoint:
address:
socket_address:
address: 127.0.0.1
port_value: 10003

# This timeout controls the initial TCP handshake timeout - not the timeout for the
# entire request.
connect_timeout: 0.25s
dynamic_resources:
cds_config:
resource_api_version: V3
Expand Down
24 changes: 23 additions & 1 deletion envoy/tests/test_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,14 @@
from datadog_checks.dev.utils import get_metadata_metrics
from datadog_checks.envoy.metrics import METRIC_PREFIX, METRICS

from .common import DEFAULT_INSTANCE, ENVOY_VERSION, FLAKY_METRICS, PROMETHEUS_METRICS, requires_new_environment
from .common import (
DEFAULT_INSTANCE,
ENVOY_VERSION,
FLAKY_METRICS,
LEGACY_INSTANCE,
PROMETHEUS_METRICS,
requires_new_environment,
)

pytestmark = [requires_new_environment]

Expand All @@ -17,6 +24,7 @@
@pytest.mark.usefixtures('dd_environment')
def test_check(aggregator, dd_run_check, check):
c = check(DEFAULT_INSTANCE)

dd_run_check(c)
dd_run_check(c)

Expand All @@ -40,6 +48,20 @@ def test_check(aggregator, dd_run_check, check):
aggregator.assert_metrics_using_metadata(get_metadata_metrics())


@pytest.mark.integration
@pytest.mark.usefixtures('dd_environment')
def test_check_legacy(aggregator, dd_run_check, check):
    """Run the check once with the legacy instance and validate the submitted metrics against metadata."""
    legacy_check = check(LEGACY_INSTANCE)
    dd_run_check(legacy_check)

    expected_metadata = get_metadata_metrics()
    # This metric has a different type in legacy, so override its metadata entry before asserting.
    tx_bytes_entry = expected_metadata['envoy.cluster.upstream_cx_tx_bytes_total']
    tx_bytes_entry['metric_type'] = 'count'

    aggregator.assert_metrics_using_metadata(expected_metadata)


@pytest.mark.integration
@pytest.mark.usefixtures('dd_environment')
def test_metadata_integration(aggregator, dd_run_check, datadog_agent, check):
Expand Down

0 comments on commit 55391c0

Please sign in to comment.