Skip to content

Commit

Permalink
Document Antrea component metrics
Browse files Browse the repository at this point in the history
Add the list of metrics which are exposed by Antrea agent and controller
in the integration doc for reference.
  • Loading branch information
ksamoray committed Oct 4, 2020
1 parent cc49ad7 commit 76bd3b9
Show file tree
Hide file tree
Showing 7 changed files with 436 additions and 1 deletion.
31 changes: 31 additions & 0 deletions .github/workflows/kind.yml
Original file line number Diff line number Diff line change
Expand Up @@ -371,3 +371,34 @@ jobs:
- name: Run netpol tests
working-directory: hack/netpol
run: ./test-kind.sh

validate-prometheus-metrics-doc:
name: Validate metrics in Prometheus document match running deployment's
needs: build-antrea-image
runs-on: [ubuntu-18.04]
steps:
- name: Free disk space
# https://github.com/actions/virtual-environments/issues/709
run: |
sudo apt-get clean
df -h
- uses: actions/checkout@v2
- uses: actions/setup-go@v1
with:
go-version: 1.13
- name: Download Antrea image from previous job
uses: actions/download-artifact@v1
with:
name: antrea-ubuntu-netpol
- name: Load Antrea image
run: docker load -i antrea-ubuntu-netpol/antrea-ubuntu-netpol.tar
- name: Install Kind
env:
KIND_VERSION: v0.7.0
run: |
curl -Lo ./kind https://github.com/kubernetes-sigs/kind/releases/download/${KIND_VERSION}/kind-$(uname)-amd64
chmod +x ./kind
sudo mv kind /usr/local/bin
- name: Validate document
run: |
./ci/kind/validate-metrics-doc.sh
49 changes: 49 additions & 0 deletions build/yamls/antrea-prometheus-rbac.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
# Create a namespace for Prometheus components
apiVersion: v1
kind: Namespace
metadata:
name: monitoring
labels:
name: monitoring
---
apiVersion: v1
kind: ServiceAccount
metadata:
name: prometheus
namespace: monitoring
---
# Authorize Prometheus to view Kubernetes cluster components for service discovery purposes
# Authorize Prometheus to retrieve metrics
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: prometheus
rules:
- apiGroups: [""]
resources:
- nodes
- nodes/proxy
- services
- endpoints
- pods
verbs: ["get", "list", "watch"]
- apiGroups:
- networking.k8s.io
resources:
- ingresses
verbs: ["get", "list", "watch"]
- nonResourceURLs: ["/metrics"]
verbs: ["get"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: prometheus
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: prometheus
subjects:
- kind: ServiceAccount
name: prometheus
namespace: monitoring
17 changes: 16 additions & 1 deletion ci/kind/kind-setup.sh
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,9 @@ NUM_WORKERS=2
SUBNETS=""
ENCAP_MODE=""
PROXY=false
PROMETHEUS=false

THIS_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"

set -eo pipefail
function echoerr {
Expand All @@ -46,6 +49,7 @@ where:
--encap-mode: inter-node pod traffic encap mode, default is encap
--proxy: enable Antrea proxy, default is false
--antrea-cni: specifies install Antrea CNI in kind cluster, default is true.
--prometheus: enable Prometheus metrics listener for Antrea Controller and Agents, default is false
--num-workers: specifies number of worker nodes in kind cluster, default is $NUM_WORKERS
--images: specifies images loaded to kind cluster, default is $IMAGES
--subnets: a subnet creates a separate docker bridge network with assigned subnet that worker nodes may connect to. Default is empty all worker
Expand Down Expand Up @@ -266,8 +270,15 @@ EOF
if [[ $PROXY == true ]]; then
cmd+=" --proxy"
fi
if [[ $PROMETHEUS == true ]]; then
cmd+=" --prometheus"
fi
echo "$cmd --kind $(get_encap_mode) | kubectl apply --context kind-$CLUSTER_NAME -f -"
eval "$cmd --kind $(get_encap_mode) | kubectl apply --context kind-$CLUSTER_NAME -f -"

if [[ $PROMETHEUS == true ]]; then
kubectl apply --context kind-$CLUSTER_NAME -f $THIS_DIR/../../build/yamls/antrea-prometheus-rbac.yml
fi
fi

# wait for cluster info
Expand Down Expand Up @@ -314,7 +325,11 @@ while [[ $# -gt 0 ]]
;;
--proxy)
PROXY=true
shift 2
shift
;;
--prometheus)
PROMETHEUS=true
shift
;;
--subnets)
SUBNETS="$2"
Expand Down
47 changes: 47 additions & 0 deletions ci/kind/validate-metrics-doc.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
#!/usr/bin/env bash

# Copyright 2020 Antrea Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# This script validates the Prometheus metrics list in docs/prometheus-integration.md against a Kind cluster.

# Exit on the first failing command (-e) and make a pipeline fail when any of
# its stages fails (pipefail).
set -eo pipefail

# Scratch file that receives a copy of the metrics document; it is removed
# again by exit_handler when the script exits.
METRICS_TMP_DOC=$(mktemp /tmp/metricsdoc.XXXXXX.md)

#######################################
# Cleanup handler, installed via `trap` for INT and EXIT.
# Deletes the scratch copy of the metrics document if it exists.
# Globals:   METRICS_TMP_DOC (read) - path of the temporary file
# Arguments: none
# Outputs:   writes "Cleaning up..." to stdout
#######################################
function exit_handler() {
    echo "Cleaning up..."
    # Quote the path so the test and rm always see a single word, even if the
    # path contains whitespace or is empty. `--` stops option parsing, and
    # plain `rm -f` is sufficient because -f already established a regular file.
    if [ -f "$METRICS_TMP_DOC" ]; then
        rm -f -- "$METRICS_TMP_DOC"
    fi
}

trap exit_handler INT EXIT

# Resolve the directory containing this script so the relative paths below
# work regardless of the caller's working directory.
THIS_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
MAKE_CMD="$THIS_DIR/../../hack/make-metrics-doc.sh"
METRICS_DOC="$THIS_DIR/../../docs/prometheus-integration.md"

# Regenerate the metrics list into a scratch copy of the document, then check
# that the regenerated copy is byte-identical to the committed document.
cp -v -- "$METRICS_DOC" "$METRICS_TMP_DOC"
"$MAKE_CMD" "$METRICS_TMP_DOC"

# cmp is evaluated as an `if` condition on purpose: the script runs under
# `set -e`, so a bare `cmp` returning non-zero would terminate the script
# before the explanatory message below could be printed.
if ! cmp -s -- "$METRICS_DOC" "$METRICS_TMP_DOC"; then
    echo "Error: Prometheus metrics document should be updated"
    # Single quotes keep the backticks literal; inside double quotes bash
    # would execute `make` as a command substitution.
    echo 'You can update it by building the Antrea Docker image locally (with `make`), running ./hack/make-metrics-doc.sh and committing the changes'
    exit 1
fi

echo "Prometheus Metrics document verified successfully"
exit 0
162 changes: 162 additions & 0 deletions docs/prometheus-integration.md
Original file line number Diff line number Diff line change
Expand Up @@ -116,3 +116,165 @@ The configuration file above can be used to deploy Prometheus Server with
scraping configuration for Antrea services.
To deploy this configuration use
`kubectl apply -f build/yamls/antrea-prometheus.yml`

# Antrea Prometheus Metrics
Antrea Controller and Agents expose various metrics, some of which are provided
by the Antrea components and others which are provided by 3rd party components
used by the Antrea components.

Below is a list of metrics, provided by the components and by 3rd parties.

## Antrea Agent Metrics
**antrea_agent_conntrack_antrea_connection_count:** Number of connections
in the Antrea ZoneID of the conntrack table. This metric gets updated at
an interval specified by flowPollInterval, a configuration parameter for
the Agent.
**antrea_agent_conntrack_max_connection_count:** Size of the conntrack
table. This metric gets updated at an interval specified by flowPollInterval,
a configuration parameter for the Agent.
**antrea_agent_conntrack_total_connection_count:** Number of connections in
the conntrack table. This metric gets updated at an interval specified by
flowPollInterval, a configuration parameter for the Agent.
**antrea_agent_egress_networkpolicy_rule_count:** Number of egress
networkpolicy rules on local node which are managed by the Antrea Agent.
**antrea_agent_ingress_networkpolicy_rule_count:** Number of ingress
networkpolicy rules on local node which are managed by the Antrea Agent.
**antrea_agent_local_pod_count:** Number of pods on local node which are
managed by the Antrea Agent.
**antrea_agent_networkpolicy_count:** Number of networkpolicies on local
node which are managed by the Antrea Agent.
**antrea_agent_ovs_flow_count:** Flow count for each OVS flow table. The
TableID is used as a label.
**antrea_agent_ovs_flow_ops_count:** Number of OVS flow operations, partitioned
by operation type (add, modify and delete).
**antrea_agent_ovs_flow_ops_error_count:** Number of OVS flow operation
errors, partitioned by operation type (add, modify and delete).
**antrea_agent_ovs_flow_ops_latency_milliseconds:** The latency of OVS flow
operations, partitioned by operation type (add, modify and delete).
**antrea_agent_ovs_total_flow_count:** Total flow count of all OVS flow tables.
**antrea_agent_runtime_info:** Antrea agent runtime info (Deprecated since
Antrea 0.10.0), defined as labels. The value of the gauge is always set to 1.

## Antrea Controller Metrics
**antrea_controller_address_group_processed:** The total number of
address-group processed
**antrea_controller_address_group_sync_duration_milliseconds:** The duration
of syncing address-group
**antrea_controller_applied_to_group_processed:** The total number of
applied-to-group processed
**antrea_controller_applied_to_group_sync_duration_milliseconds:** The
duration of syncing applied-to-group
**antrea_controller_length_address_group_queue:** The length of
AddressGroupQueue
**antrea_controller_length_applied_to_group_queue:** The length of
AppliedToGroupQueue
**antrea_controller_length_network_policy_queue:** The length of
InternalNetworkPolicyQueue
**antrea_controller_network_policy_processed:** The total number of
internal-networkpolicy processed
**antrea_controller_network_policy_sync_duration_milliseconds:** The duration
of syncing internal-networkpolicy
**antrea_controller_runtime_info:** Antrea controller runtime info (Deprecated
since Antrea 0.10.0), defined as labels. The value of the gauge is always
set to 1.

## Common Metrics Provided by Infrastructure
## Apiserver Metrics
**apiserver_audit_event_total:** Counter of audit events generated and sent
to the audit backend.
**apiserver_audit_requests_rejected_total:** Counter of apiserver requests
rejected due to an error in audit logging backend.
**apiserver_client_certificate_expiration_seconds:** Distribution of the
remaining lifetime on the certificate used to authenticate a request.
**apiserver_current_inflight_requests:** Maximal number of currently used
inflight request limit of this apiserver per request kind in last second.
**apiserver_envelope_encryption_dek_cache_fill_percent:** Percent of the
cache slots currently occupied by cached DEKs.
**apiserver_longrunning_gauge:** Gauge of all active long-running apiserver
requests broken out by verb, group, version, resource, scope and component. Not
all requests are tracked this way.
**apiserver_registered_watchers:** Number of currently registered watchers
for a given resources
**apiserver_request_duration_seconds:** Response latency distribution in
seconds for each verb, dry run value, group, version, resource, subresource,
scope and component.
**apiserver_request_total:** Counter of apiserver requests broken out for
each verb, dry run value, group, version, resource, scope, component, and
HTTP response contentType and code.
**apiserver_response_sizes:** Response size distribution in bytes for each
group, version, verb, resource, subresource, scope and component.
**apiserver_storage_data_key_generation_duration_seconds:** Latencies in
seconds of data encryption key(DEK) generation operations.
**apiserver_storage_data_key_generation_failures_total:** Total number of
failed data encryption key(DEK) generation operations.
**apiserver_storage_envelope_transformation_cache_misses_total:** Total
number of cache misses while accessing key decryption key(KEK).
**apiserver_watch_events_sizes:** Watch event size distribution in bytes
**apiserver_watch_events_total:** Number of events sent in watch clients

## Authenticated Metrics
**authenticated_user_requests:** Counter of authenticated requests broken
out by username.

## Authentication Metrics
**authentication_attempts:** Counter of authenticated attempts.
**authentication_duration_seconds:** Authentication duration in seconds
broken out by result.
**authentication_token_cache_active_fetch_count:**
**authentication_token_cache_fetch_total:**
**authentication_token_cache_request_duration_seconds:**
**authentication_token_cache_request_total:**

## Go Metrics
**go_gc_duration_seconds:** A summary of the GC invocation durations.
**go_goroutines:** Number of goroutines that currently exist.
**go_info:** Information about the Go environment.
**go_memstats_alloc_bytes:** Number of bytes allocated and still in use.
**go_memstats_alloc_bytes_total:** Total number of bytes allocated, even
if freed.
**go_memstats_buck_hash_sys_bytes:** Number of bytes used by the profiling
bucket hash table.
**go_memstats_frees_total:** Total number of frees.
**go_memstats_gc_cpu_fraction:** The fraction of this program's available
CPU time used by the GC since the program started.
**go_memstats_gc_sys_bytes:** Number of bytes used for garbage collection
system metadata.
**go_memstats_heap_alloc_bytes:** Number of heap bytes allocated and still
in use.
**go_memstats_heap_idle_bytes:** Number of heap bytes waiting to be used.
**go_memstats_heap_inuse_bytes:** Number of heap bytes that are in use.
**go_memstats_heap_objects:** Number of allocated objects.
**go_memstats_heap_released_bytes:** Number of heap bytes released to OS.
**go_memstats_heap_sys_bytes:** Number of heap bytes obtained from system.
**go_memstats_last_gc_time_seconds:** Number of seconds since 1970 of last
garbage collection.
**go_memstats_lookups_total:** Total number of pointer lookups.
**go_memstats_mallocs_total:** Total number of mallocs.
**go_memstats_mcache_inuse_bytes:** Number of bytes in use by mcache
structures.
**go_memstats_mcache_sys_bytes:** Number of bytes used for mcache structures
obtained from system.
**go_memstats_mspan_inuse_bytes:** Number of bytes in use by mspan structures.
**go_memstats_mspan_sys_bytes:** Number of bytes used for mspan structures
obtained from system.
**go_memstats_next_gc_bytes:** Number of heap bytes when next garbage
collection will take place.
**go_memstats_other_sys_bytes:** Number of bytes used for other system
allocations.
**go_memstats_stack_inuse_bytes:** Number of bytes in use by the stack
allocator.
**go_memstats_stack_sys_bytes:** Number of bytes obtained from system for
stack allocator.
**go_memstats_sys_bytes:** Number of bytes obtained from system.
**go_threads:** Number of OS threads created.

## Process Metrics
**process_cpu_seconds_total:** Total user and system CPU time spent in seconds.
**process_max_fds:** Maximum number of open file descriptors.
**process_open_fds:** Number of open file descriptors.
**process_resident_memory_bytes:** Resident memory size in bytes.
**process_start_time_seconds:** Start time of the process since unix epoch
in seconds.
**process_virtual_memory_bytes:** Virtual memory size in bytes.
**process_virtual_memory_max_bytes:** Maximum amount of virtual memory
available in bytes.
11 changes: 11 additions & 0 deletions hack/generate-manifest.sh
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ Generate a YAML manifest for Antrea using Kustomize and print it to stdout.
--ipsec Generate a manifest with IPSec encryption of tunnel traffic enabled
--proxy Generate a manifest with Antrea proxy enabled
--np Generate a manifest with ClusterNetworkPolicy and Antrea NetworkPolicy features enabled
--prometheus Generate a manifest with Antrea Controller and Agent Prometheus metrics listener enabled
--keep Debug flag which will preserve the generated kustomization.yml
--tun (geneve|vxlan|gre|stt) Choose encap tunnel type from geneve, gre, stt and vxlan (default is geneve)
--verbose-log Generate a manifest with increased log-level (level 4) for Antrea agent and controller.
Expand Down Expand Up @@ -66,6 +67,7 @@ TUN_TYPE="geneve"
VERBOSE_LOG=false
ON_DELETE=false
COVERAGE=false
PROMETHEUS=false

while [[ $# -gt 0 ]]
do
Expand Down Expand Up @@ -100,6 +102,10 @@ case $key in
NP=true
shift
;;
--prometheus)
PROMETHEUS=true
shift
;;
--keep)
KEEP=true
shift
Expand Down Expand Up @@ -217,6 +223,11 @@ if $NP; then
sed -i.bak -E "s/^[[:space:]]*#[[:space:]]*AntreaPolicy[[:space:]]*:[[:space:]]*[a-z]+[[:space:]]*$/ AntreaPolicy: true/" antrea-agent.conf
fi

# PROMETHEUS holds the literal word "true" or "false" (default false, set to
# true by --prometheus), so `if $PROMETHEUS` runs that word as the condition.
if $PROMETHEUS; then
# Replace the commented-out "enablePrometheusMetrics: <value>" line in the
# controller and agent configs with an active "enablePrometheusMetrics: true"
# line. `-i.bak` edits in place and leaves a .bak copy of each file.
sed -i.bak -E "s/^[[:space:]]*#[[:space:]]*enablePrometheusMetrics[[:space:]]*:[[:space:]]*[a-z]+[[:space:]]*$/enablePrometheusMetrics: true/" antrea-controller.conf
sed -i.bak -E "s/^[[:space:]]*#[[:space:]]*enablePrometheusMetrics[[:space:]]*:[[:space:]]*[a-z]+[[:space:]]*$/enablePrometheusMetrics: true/" antrea-agent.conf
fi

if [[ $ENCAP_MODE != "" ]]; then
sed -i.bak -E "s/^[[:space:]]*#[[:space:]]*trafficEncapMode[[:space:]]*:[[:space:]]*[a-z]+[[:space:]]*$/trafficEncapMode: $ENCAP_MODE/" antrea-agent.conf
fi
Expand Down
Loading

0 comments on commit 76bd3b9

Please sign in to comment.