Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add ext_authz and ratelimiter metrics to OpenMetrics implementation #12451

Merged
merged 6 commits into from
Jul 5, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions envoy/datadog_checks/envoy/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,11 @@
'envoy_cluster_assignment_timeout_received': 'cluster.assignment_timeout_received',
'envoy_cluster_bind_errors': 'cluster.bind_errors',
'envoy_cluster_default_total_match_count': 'cluster.default_total_match',
'envoy_cluster_ext_authz_ok': 'cluster.ext_authz.ok',
'envoy_cluster_ext_authz_error': 'cluster.ext_authz.error',
'envoy_cluster_ext_authz_denied': 'cluster.ext_authz.denied',
'envoy_cluster_ext_authz_disabled': 'cluster.ext_authz.disabled',
'envoy_cluster_ext_authz_failure_mode_allowed': 'cluster.ext_authz.failure_mode_allowed',
'envoy_cluster_http2_dropped_headers_with_underscores': 'cluster.http2.dropped_headers_with_underscores',
'envoy_cluster_http2_header_overflow': 'cluster.http2.header_overflow',
'envoy_cluster_http2_headers_cb_no_stream': 'cluster.http2.headers_cb_no_stream',
Expand Down Expand Up @@ -47,6 +52,10 @@
'envoy_cluster_lb_zone_routing_sampled': 'cluster.lb_zone_routing_sampled',
'envoy_cluster_membership_change': 'cluster.membership_change',
'envoy_cluster_original_dst_host_invalid': 'cluster.original_dst_host_invalid',
'envoy_cluster_ratelimit_ok': 'cluster.ratelimit.ok',
'envoy_cluster_ratelimit_error': 'cluster.ratelimit.error',
'envoy_cluster_ratelimit_over_limit': 'cluster.ratelimit.over_limit',
'envoy_cluster_ratelimit_failure_mode_allowed': 'cluster.ratelimit.failure_mode_allowed',
'envoy_cluster_retry_or_shadow_abandoned': 'cluster.retry_or_shadow_abandoned',
'envoy_cluster_update_attempt': 'cluster.update_attempt',
'envoy_cluster_update_empty': 'cluster.update_empty',
Expand Down
15 changes: 15 additions & 0 deletions envoy/metadata.csv
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,15 @@ envoy.cluster.upstream_rq_rx_reset.count,count,,,,[OpenMetrics V2] Total request
envoy.cluster.upstream_rq_timeout.count,count,,,,[OpenMetrics V2] Total requests that timed out waiting for a response,0,envoy,,
envoy.cluster.upstream_rq_tx_reset.count,count,,,,[OpenMetrics V2] Total requests that were reset locally,0,envoy,,
envoy.cluster.upstream_rq_xx.count,count,,,,"[OpenMetrics V2] Aggregate HTTP response codes (e.g., 2xx, 3xx, etc.)",0,envoy,,
envoy.cluster.ratelimit.ok.count,count,,response,,[OpenMetrics V2] Total under limit responses from the rate limit service,1,envoy,,
envoy.cluster.ratelimit.error.count,count,,response,,[OpenMetrics V2] Total errors contacting the rate limit service,-1,envoy,,
envoy.cluster.ratelimit.over_limit.count,count,,response,,[OpenMetrics V2] Total over limit responses from the rate limit service,-1,envoy,,
envoy.cluster.ratelimit.failure_mode_allowed.count,count,,response,,[OpenMetrics V2] Total requests that errored when contacting the rate limit service but were allowed through because of failure_mode_deny being set to false,-1,envoy,,
envoy.cluster.ext_authz.ok.count,count,,response,,[OpenMetrics V2] Total responses from the external authorization service,1,envoy,,
envoy.cluster.ext_authz.error.count,count,,response,,[OpenMetrics V2] Total errors contacting the external authorization service,-1,envoy,,
envoy.cluster.ext_authz.denied.count,count,,response,,[OpenMetrics V2] Total responses from the external authorization service denying the traffic,-1,envoy,,
envoy.cluster.ext_authz.disabled.count,count,,response,,[OpenMetrics V2] Total requests allowed without calling the external authorization services due to the filter being disabled,-1,envoy,,
envoy.cluster.ext_authz.failure_mode_allowed.count,count,,response,,[OpenMetrics V2] Total requests that errored when contacting the external authorization service but were allowed through because of failure_mode_allow being set to true,-1,envoy,,
envoy.cluster_manager.cds.control_plane.rate_limit_enforced.count,count,,occurrence,,[OpenMetrics V2] Total number of times rate limit was enforced for management server requests,0,envoy,,
envoy.cluster_manager.cds.init_fetch_timeout.count,count,,,,[OpenMetrics V2] Total initial fetch timeouts,0,envoy,,
envoy.cluster_manager.cds.update_attempt.count,count,,,,[OpenMetrics V2] Total attempted cluster membership updates by service discovery,0,envoy,,
Expand Down Expand Up @@ -269,6 +278,12 @@ envoy.vhost.vcluster.upstream_rq_total,count,,request,,[Legacy] Total requests i
envoy.cluster.ratelimit.ok,count,,response,,[Legacy] Total under limit responses from the rate limit service,1,envoy,,
envoy.cluster.ratelimit.error,count,,response,,[Legacy] Total errors contacting the rate limit service,-1,envoy,,
envoy.cluster.ratelimit.over_limit,count,,response,,[Legacy] Total over limit responses from the rate limit service,-1,envoy,,
envoy.cluster.ratelimit.failure_mode_allowed,count,,response,,[Legacy] Total requests that errored when contacting the rate limit service but were allowed through because of failure_mode_deny being set to false,-1,envoy,,
envoy.cluster.ext_authz.ok,count,,response,,[Legacy] Total responses from the external authorization service,1,envoy,,
envoy.cluster.ext_authz.error,count,,response,,[Legacy] Total errors contacting the external authorization service,-1,envoy,,
envoy.cluster.ext_authz.denied,count,,response,,[Legacy] Total responses from the external authorization service denying the traffic,-1,envoy,,
envoy.cluster.ext_authz.disabled,count,,response,,[Legacy] Total requests allowed without calling the external authorization services due to the filter being disabled,-1,envoy,,
envoy.cluster.ext_authz.failure_mode_allowed,count,,response,,[Legacy] Total requests that errored when contacting the external authorization service but were allowed through because of failure_mode_allow being set to true,-1,envoy,,
envoy.http.ip_tagging.hit,count,,request,,[Legacy] Total number of requests that have the tag_name tag applied to it,0,envoy,,
envoy.http.ip_tagging.no_hit,count,,request,,[Legacy] Total number of requests with no applicable IP tags,0,envoy,,
envoy.http.ip_tagging.total,count,,request,,[Legacy] Total number of requests the IP Tagging Filter operated on,0,envoy,,
Expand Down
5 changes: 5 additions & 0 deletions envoy/tests/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@

URL = 'http://{}:{}'.format(HOST, PORT)
DEFAULT_INSTANCE = {'openmetrics_endpoint': '{}/stats/prometheus'.format(URL)}
LEGACY_INSTANCE = {'stats_url': '{}/stats'.format(URL)}
requires_new_environment = pytest.mark.skipif(ENVOY_LEGACY != 'false', reason='Requires prometheus environment')

PROMETHEUS_METRICS = [
Expand All @@ -27,6 +28,8 @@
"cluster.circuit_breakers.rq_pending_open",
"cluster.circuit_breakers.rq_retry_open",
"cluster.default_total_match.count",
"cluster.ext_authz.error.count",
"cluster.ext_authz.failure_mode_allowed.count",
"cluster.http1.dropped_headers_with_underscores.count",
"cluster.http1.metadata_not_supported_error.count",
"cluster.http1.requests_rejected_with_underscores_in_headers.count",
Expand Down Expand Up @@ -74,6 +77,8 @@
"cluster.membership_healthy",
"cluster.membership_total",
"cluster.original_dst_host_invalid.count",
"cluster.ratelimit.error.count",
"cluster.ratelimit.failure_mode_allowed.count",
"cluster.retry_or_shadow_abandoned.count",
"cluster.update_attempt.count",
"cluster.update_empty.count",
Expand Down
8 changes: 6 additions & 2 deletions envoy/tests/conftest.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
import os

import pytest
import requests

from datadog_checks.dev import docker_run
from datadog_checks.envoy import Envoy

from .common import DEFAULT_INSTANCE, DOCKER_DIR, ENVOY_LEGACY, FIXTURE_DIR, URL
from .common import DEFAULT_INSTANCE, DOCKER_DIR, ENVOY_LEGACY, FIXTURE_DIR, HOST, URL
from .legacy.common import FLAVOR, INSTANCES


Expand All @@ -25,8 +26,11 @@ def dd_environment():
os.path.join(DOCKER_DIR, FLAVOR, 'docker-compose.yaml'),
build=True,
endpoints="{}/stats".format(URL),
log_patterns=['all dependencies initialized. starting workers'],
log_patterns=['front-envoy(.*?)all dependencies initialized. starting workers'],
):
# Exercising envoy a bit will trigger extra metrics
requests.get('http://{}:8000/service/1'.format(HOST))
requests.get('http://{}:8000/service/2'.format(HOST))
yield instance


Expand Down
2 changes: 1 addition & 1 deletion envoy/tests/docker/api_v2/Dockerfile-service
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
FROM envoyproxy/envoy-alpine:v1.14.1
RUN apk update && apk add python3 bash py-pip
RUN python3 --version && pip3 --version
RUN pip3 install -q Flask==0.11.1 requests==2.18.4
RUN pip3 install -q Flask==2.1.2 requests==2.28.1
RUN mkdir /code
ADD ./service.py /code
ADD ./start_service.sh /usr/local/bin/start_service.sh
Expand Down
2 changes: 1 addition & 1 deletion envoy/tests/docker/api_v3/Dockerfile-service
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
FROM envoyproxy/envoy-alpine:v1.18.3
RUN apk update && apk add python3 bash py-pip
RUN python3 --version && pip3 --version
RUN pip3 install -q Flask==0.11.1 requests==2.18.4
RUN pip3 install -q Flask==2.1.2 requests==2.28.1
RUN mkdir /code
ADD ./service.py /code
ADD ./start_service.sh /usr/local/bin/start_service.sh
Expand Down
68 changes: 67 additions & 1 deletion envoy/tests/docker/api_v3/front-envoy.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -43,8 +43,40 @@ static_resources:
prefix: "/service/2"
route:
cluster: service2
rate_limits:
- stage: 0
actions:
- {source_cluster: {}}
typed_per_filter_config:
envoy.filters.http.ext_authz:
"@type": type.googleapis.com/envoy.extensions.filters.http.ext_authz.v3.ExtAuthzPerRoute
disabled: true
http_filters:
- name: envoy.router
- name: envoy.filters.http.ext_authz
typed_config:
"@type": type.googleapis.com/envoy.extensions.filters.http.ext_authz.v3.ExtAuthz
http_service:
server_uri:
uri: 127.0.0.1:10003
cluster: ext-authz
timeout: 0.25s
include_peer_certificate: true
failure_mode_allow: true
- name: envoy.filters.http.ratelimit
typed_config:
"@type": type.googleapis.com/envoy.extensions.filters.http.ratelimit.v3.RateLimit
domain: apis
stage: 0
request_type: external
failure_mode_deny: false
rate_limit_service:
transport_api_version: V3
grpc_service:
envoy_grpc:
cluster_name: ratelimiter
- name: envoy.filters.http.router
typed_config:
"@type": type.googleapis.com/envoy.extensions.filters.http.router.v3.Router
clusters:
- name: service1
connect_timeout: 0.25s
Expand Down Expand Up @@ -88,6 +120,40 @@ static_resources:
socket_address:
address: xds
port_value: 8080
- name: ratelimiter
type: static
connect_timeout: 0.25s
lb_policy: ROUND_ROBIN
load_assignment:
cluster_name: ratelimiter
endpoints:
- lb_endpoints:
- endpoint:
address:
socket_address:
address: 127.0.0.1
port_value: 8081
http2_protocol_options: {}
- name: ext-authz
type: static
typed_extension_protocol_options:
envoy.extensions.upstreams.http.v3.HttpProtocolOptions:
"@type": type.googleapis.com/envoy.extensions.upstreams.http.v3.HttpProtocolOptions
explicit_http_config:
http2_protocol_options: {}
load_assignment:
cluster_name: ext-authz
endpoints:
- lb_endpoints:
- endpoint:
address:
socket_address:
address: 127.0.0.1
port_value: 10003

# This timeout controls the initial TCP handshake timeout - not the timeout for the
# entire request.
connect_timeout: 0.25s
dynamic_resources:
cds_config:
resource_api_version: V3
Expand Down
24 changes: 23 additions & 1 deletion envoy/tests/test_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,14 @@
from datadog_checks.dev.utils import get_metadata_metrics
from datadog_checks.envoy.metrics import METRIC_PREFIX, METRICS

from .common import DEFAULT_INSTANCE, ENVOY_VERSION, FLAKY_METRICS, PROMETHEUS_METRICS, requires_new_environment
from .common import (
DEFAULT_INSTANCE,
ENVOY_VERSION,
FLAKY_METRICS,
LEGACY_INSTANCE,
PROMETHEUS_METRICS,
requires_new_environment,
)

pytestmark = [requires_new_environment]

Expand All @@ -17,6 +24,7 @@
@pytest.mark.usefixtures('dd_environment')
def test_check(aggregator, dd_run_check, check):
c = check(DEFAULT_INSTANCE)

dd_run_check(c)
dd_run_check(c)

Expand All @@ -40,6 +48,20 @@ def test_check(aggregator, dd_run_check, check):
aggregator.assert_metrics_using_metadata(get_metadata_metrics())


@pytest.mark.integration
@pytest.mark.usefixtures('dd_environment')
def test_check_legacy(aggregator, dd_run_check, check):
    """Run the check once with LEGACY_INSTANCE and validate the submitted metrics against the metadata."""
    legacy_check = check(LEGACY_INSTANCE)
    dd_run_check(legacy_check)

    expected_metadata = get_metadata_metrics()
    # Metric that has a different type in legacy
    expected_metadata['envoy.cluster.upstream_cx_tx_bytes_total']['metric_type'] = 'count'

    aggregator.assert_metrics_using_metadata(expected_metadata)


@pytest.mark.integration
@pytest.mark.usefixtures('dd_environment')
def test_metadata_integration(aggregator, dd_run_check, datadog_agent, check):
Expand Down