diff --git a/.github/workflows/kind.yml b/.github/workflows/kind.yml index da8c71c711f..0bdb57ba6b3 100755 --- a/.github/workflows/kind.yml +++ b/.github/workflows/kind.yml @@ -371,3 +371,34 @@ jobs: - name: Run netpol tests working-directory: hack/netpol run: ./test-kind.sh + + validate-prometheus-metrics-doc: + name: Validate metrics in Prometheus document match running deployment's + needs: build-antrea-image + runs-on: [ubuntu-18.04] + steps: + - name: Free disk space + # https://github.com/actions/virtual-environments/issues/709 + run: | + sudo apt-get clean + df -h + - uses: actions/checkout@v2 + - uses: actions/setup-go@v1 + with: + go-version: 1.13 + - name: Download Antrea image from previous job + uses: actions/download-artifact@v1 + with: + name: antrea-ubuntu-netpol + - name: Load Antrea image + run: docker load -i antrea-ubuntu-netpol/antrea-ubuntu-netpol.tar + - name: Install Kind + env: + KIND_VERSION: v0.7.0 + run: | + curl -Lo ./kind https://github.com/kubernetes-sigs/kind/releases/download/${KIND_VERSION}/kind-$(uname)-amd64 + chmod +x ./kind + sudo mv kind /usr/local/bin + - name: Validate document + run: | + ./ci/kind/validate-metrics-doc.sh diff --git a/build/yamls/antrea-prometheus-rbac.yml b/build/yamls/antrea-prometheus-rbac.yml new file mode 100644 index 00000000000..1f5475a07ca --- /dev/null +++ b/build/yamls/antrea-prometheus-rbac.yml @@ -0,0 +1,49 @@ +# Create a namespace for Prometheus components +apiVersion: v1 +kind: Namespace +metadata: + name: monitoring + labels: + name: monitoring +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: prometheus + namespace: monitoring +--- +# Authorize Prometheus to view Kubernetes cluster components for service discovery purposes +# Authorize Prometheus to retrieve metrics +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: prometheus +rules: + - apiGroups: [""] + resources: + - nodes + - nodes/proxy + - services + - endpoints + - pods + verbs: ["get", "list", 
"watch"] + - apiGroups: + - networking.k8s.io + resources: + - ingresses + verbs: ["get", "list", "watch"] + - nonResourceURLs: ["/metrics"] + verbs: ["get"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: prometheus +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: prometheus +subjects: + - kind: ServiceAccount + name: prometheus + namespace: monitoring diff --git a/ci/kind/kind-setup.sh b/ci/kind/kind-setup.sh index 183ec96f634..002df2428a2 100755 --- a/ci/kind/kind-setup.sh +++ b/ci/kind/kind-setup.sh @@ -27,6 +27,9 @@ NUM_WORKERS=2 SUBNETS="" ENCAP_MODE="" PROXY=false +PROMETHEUS=false + +THIS_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" set -eo pipefail function echoerr { @@ -46,6 +49,7 @@ where: --encap-mode: inter-node pod traffic encap mode, default is encap --proxy: enable Antrea proxy, default is false --antrea-cni: specifies install Antrea CNI in kind cluster, default is true. + --prometheus: enable Prometheus metrics listener for Antrea Controller and Agents, default is false --num-workers: specifies number of worker nodes in kind cluster, default is $NUM_WORKERS --images: specifies images loaded to kind cluster, default is $IMAGES --subnets: a subnet creates a separate docker bridge network with assigned subnet that worker nodes may connect to. 
Default is empty all worker
@@ -266,8 +270,15 @@ EOF
     if [[ $PROXY == true ]]; then
       cmd+=" --proxy"
     fi
+    if [[ $PROMETHEUS == true ]]; then
+      cmd+=" --prometheus"
+    fi
     echo "$cmd --kind $(get_encap_mode) | kubectl apply --context kind-$CLUSTER_NAME -f -"
     eval "$cmd --kind $(get_encap_mode) | kubectl apply --context kind-$CLUSTER_NAME -f -"
+
+    if [[ $PROMETHEUS == true ]]; then
+      kubectl apply --context kind-$CLUSTER_NAME -f "$THIS_DIR/../../build/yamls/antrea-prometheus-rbac.yml"
+    fi
   fi
 
   # wait for cluster info
@@ -314,7 +325,11 @@ while [[ $# -gt 0 ]]
       ;;
     --proxy)
       PROXY=true
-      shift 2
+      shift
+      ;;
+    --prometheus)
+      PROMETHEUS=true
+      shift
       ;;
     --subnets)
       SUBNETS="$2"
diff --git a/ci/kind/validate-metrics-doc.sh b/ci/kind/validate-metrics-doc.sh
new file mode 100755
index 00000000000..7895fc38e1f
--- /dev/null
+++ b/ci/kind/validate-metrics-doc.sh
@@ -0,0 +1,49 @@
+#!/usr/bin/env bash
+
+# Copyright 2020 Antrea Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This script validates the Prometheus metrics list within docs/prometheus-integration.md against a Kind cluster.
+# It regenerates the list into a temporary copy of the document with hack/make-metrics-doc.sh, and fails when
+# the regenerated copy differs from the committed document.
+
+set -eo pipefail
+
+METRICS_TMP_DOC=$(mktemp /tmp/metricsdoc.XXXXXX.md)
+
+# Cleanup hook: removes the temporary copy of the metrics document.
+function exit_handler() {
+    echo "Cleaning up..."
+    rm -f "$METRICS_TMP_DOC"
+}
+
+trap exit_handler INT EXIT
+
+THIS_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
+MAKE_CMD="$THIS_DIR/../../hack/make-metrics-doc.sh"
+METRICS_DOC="$THIS_DIR/../../docs/prometheus-integration.md"
+
+cp -v "$METRICS_DOC" "$METRICS_TMP_DOC"
+"$MAKE_CMD" "$METRICS_TMP_DOC"
+# cmp is tested directly by the 'if': under 'set -e', a bare failing cmp would abort the script before the
+# explanation below is printed.
+if ! cmp -s "$METRICS_DOC" "$METRICS_TMP_DOC"; then
+    echo "Error: Prometheus metrics document should be updated"
+    # Single quotes keep the backticks literal; inside double quotes the shell would run the make command.
+    echo 'You can update it by building the Antrea Docker image locally (with `make`), running ./hack/make-metrics-doc.sh and committing the changes'
+    exit 1
+fi
+
+echo "Prometheus Metrics document verified successfully"
+exit 0
diff --git a/docs/prometheus-integration.md b/docs/prometheus-integration.md
index 61a636d1dc9..3bbb4a79bb9 100644
--- a/docs/prometheus-integration.md
+++ b/docs/prometheus-integration.md
@@ -116,3 +116,165 @@ The configuration file above can be used to deploy Prometheus Server with
 scraping configuration for Antrea services.
 To deploy this configuration use
 `kubectl apply -f build/yamls/antrea-prometheus.yml`
+
+# Antrea Prometheus Metrics
+Antrea Controller and Agents expose various metrics, some of which are provided
+by the Antrea components and others which are provided by 3rd party components
+used by the Antrea components.
+
+Below is a list of metrics, provided by the components and by 3rd parties.
+
+## Antrea Agent Metrics
+**antrea_agent_conntrack_antrea_connection_count:** Number of connections
+in the Antrea ZoneID of the conntrack table. This metric gets updated at
+an interval specified by flowPollInterval, a configuration parameter for
+the Agent.
+**antrea_agent_conntrack_max_connection_count:** Size of the conntrack
+table. This metric gets updated at an interval specified by flowPollInterval,
+a configuration parameter for the Agent.
+**antrea_agent_conntrack_total_connection_count:** Number of connections in
+the conntrack table.
This metric gets updated at an interval specified by +flowPollInterval, a configuration parameter for the Agent. +**antrea_agent_egress_networkpolicy_rule_count:** Number of egress +networkpolicy rules on local node which are managed by the Antrea Agent. +**antrea_agent_ingress_networkpolicy_rule_count:** Number of ingress +networkpolicy rules on local node which are managed by the Antrea Agent. +**antrea_agent_local_pod_count:** Number of pods on local node which are +managed by the Antrea Agent. +**antrea_agent_networkpolicy_count:** Number of networkpolicies on local +node which are managed by the Antrea Agent. +**antrea_agent_ovs_flow_count:** Flow count for each OVS flow table. The +TableID is used as a label. +**antrea_agent_ovs_flow_ops_count:** Number of OVS flow operations, partitioned +by operation type (add, modify and delete). +**antrea_agent_ovs_flow_ops_error_count:** Number of OVS flow operation +errors, partitioned by operation type (add, modify and delete). +**antrea_agent_ovs_flow_ops_latency_milliseconds:** The latency of OVS flow +operations, partitioned by operation type (add, modify and delete). +**antrea_agent_ovs_total_flow_count:** Total flow count of all OVS flow tables. +**antrea_agent_runtime_info:** Antrea agent runtime info (Deprecated since +Antrea 0.10.0), defined as labels. The value of the gauge is always set to 1. 
+ +## Antrea Controller Metrics +**antrea_controller_address_group_processed:** The total number of +address-group processed +**antrea_controller_address_group_sync_duration_milliseconds:** The duration +of syncing address-group +**antrea_controller_applied_to_group_processed:** The total number of +applied-to-group processed +**antrea_controller_applied_to_group_sync_duration_milliseconds:** The +duration of syncing applied-to-group +**antrea_controller_length_address_group_queue:** The length of +AddressGroupQueue +**antrea_controller_length_applied_to_group_queue:** The length of +AppliedToGroupQueue +**antrea_controller_length_network_policy_queue:** The length of +InternalNetworkPolicyQueue +**antrea_controller_network_policy_processed:** The total number of +internal-networkpolicy processed +**antrea_controller_network_policy_sync_duration_milliseconds:** The duration +of syncing internal-networkpolicy +**antrea_controller_runtime_info:** Antrea controller runtime info (Deprecated +since Antrea 0.10.0), defined as labels. The value of the gauge is always +set to 1. + +## Common Metrics Provided by Infrastructure +## Apiserver Metrics +**apiserver_audit_event_total:** Counter of audit events generated and sent +to the audit backend. +**apiserver_audit_requests_rejected_total:** Counter of apiserver requests +rejected due to an error in audit logging backend. +**apiserver_client_certificate_expiration_seconds:** Distribution of the +remaining lifetime on the certificate used to authenticate a request. +**apiserver_current_inflight_requests:** Maximal number of currently used +inflight request limit of this apiserver per request kind in last second. +**apiserver_envelope_encryption_dek_cache_fill_percent:** Percent of the +cache slots currently occupied by cached DEKs. +**apiserver_longrunning_gauge:** Gauge of all active long-running apiserver +requests broken out by verb, group, version, resource, scope and component. Not +all requests are tracked this way. 
+**apiserver_registered_watchers:** Number of currently registered watchers +for a given resources +**apiserver_request_duration_seconds:** Response latency distribution in +seconds for each verb, dry run value, group, version, resource, subresource, +scope and component. +**apiserver_request_total:** Counter of apiserver requests broken out for +each verb, dry run value, group, version, resource, scope, component, and +HTTP response contentType and code. +**apiserver_response_sizes:** Response size distribution in bytes for each +group, version, verb, resource, subresource, scope and component. +**apiserver_storage_data_key_generation_duration_seconds:** Latencies in +seconds of data encryption key(DEK) generation operations. +**apiserver_storage_data_key_generation_failures_total:** Total number of +failed data encryption key(DEK) generation operations. +**apiserver_storage_envelope_transformation_cache_misses_total:** Total +number of cache misses while accessing key decryption key(KEK). +**apiserver_watch_events_sizes:** Watch event size distribution in bytes +**apiserver_watch_events_total:** Number of events sent in watch clients + +## Authenticated Metrics +**authenticated_user_requests:** Counter of authenticated requests broken +out by username. + +## Authentication Metrics +**authentication_attempts:** Counter of authenticated attempts. +**authentication_duration_seconds:** Authentication duration in seconds +broken out by result. +**authentication_token_cache_active_fetch_count:** +**authentication_token_cache_fetch_total:** +**authentication_token_cache_request_duration_seconds:** +**authentication_token_cache_request_total:** + +## Go Metrics +**go_gc_duration_seconds:** A summary of the GC invocation durations. +**go_goroutines:** Number of goroutines that currently exist. +**go_info:** Information about the Go environment. +**go_memstats_alloc_bytes:** Number of bytes allocated and still in use. 
+**go_memstats_alloc_bytes_total:** Total number of bytes allocated, even +if freed. +**go_memstats_buck_hash_sys_bytes:** Number of bytes used by the profiling +bucket hash table. +**go_memstats_frees_total:** Total number of frees. +**go_memstats_gc_cpu_fraction:** The fraction of this program's available +CPU time used by the GC since the program started. +**go_memstats_gc_sys_bytes:** Number of bytes used for garbage collection +system metadata. +**go_memstats_heap_alloc_bytes:** Number of heap bytes allocated and still +in use. +**go_memstats_heap_idle_bytes:** Number of heap bytes waiting to be used. +**go_memstats_heap_inuse_bytes:** Number of heap bytes that are in use. +**go_memstats_heap_objects:** Number of allocated objects. +**go_memstats_heap_released_bytes:** Number of heap bytes released to OS. +**go_memstats_heap_sys_bytes:** Number of heap bytes obtained from system. +**go_memstats_last_gc_time_seconds:** Number of seconds since 1970 of last +garbage collection. +**go_memstats_lookups_total:** Total number of pointer lookups. +**go_memstats_mallocs_total:** Total number of mallocs. +**go_memstats_mcache_inuse_bytes:** Number of bytes in use by mcache +structures. +**go_memstats_mcache_sys_bytes:** Number of bytes used for mcache structures +obtained from system. +**go_memstats_mspan_inuse_bytes:** Number of bytes in use by mspan structures. +**go_memstats_mspan_sys_bytes:** Number of bytes used for mspan structures +obtained from system. +**go_memstats_next_gc_bytes:** Number of heap bytes when next garbage +collection will take place. +**go_memstats_other_sys_bytes:** Number of bytes used for other system +allocations. +**go_memstats_stack_inuse_bytes:** Number of bytes in use by the stack +allocator. +**go_memstats_stack_sys_bytes:** Number of bytes obtained from system for +stack allocator. +**go_memstats_sys_bytes:** Number of bytes obtained from system. +**go_threads:** Number of OS threads created. 
+ +## Process Metrics +**process_cpu_seconds_total:** Total user and system CPU time spent in seconds. +**process_max_fds:** Maximum number of open file descriptors. +**process_open_fds:** Number of open file descriptors. +**process_resident_memory_bytes:** Resident memory size in bytes. +**process_start_time_seconds:** Start time of the process since unix epoch +in seconds. +**process_virtual_memory_bytes:** Virtual memory size in bytes. +**process_virtual_memory_max_bytes:** Maximum amount of virtual memory +available in bytes. diff --git a/hack/generate-manifest.sh b/hack/generate-manifest.sh index 514fd56537c..c3201537c29 100755 --- a/hack/generate-manifest.sh +++ b/hack/generate-manifest.sh @@ -29,6 +29,7 @@ Generate a YAML manifest for Antrea using Kustomize and print it to stdout. --ipsec Generate a manifest with IPSec encryption of tunnel traffic enabled --proxy Generate a manifest with Antrea proxy enabled --np Generate a manifest with ClusterNetworkPolicy and Antrea NetworkPolicy features enabled + --prometheus Generate a manifest with Antrea Controller and Agent Prometheus metrics listener enabled --keep Debug flag which will preserve the generated kustomization.yml --tun (geneve|vxlan|gre|stt) Choose encap tunnel type from geneve, gre, stt and vxlan (default is geneve) --verbose-log Generate a manifest with increased log-level (level 4) for Antrea agent and controller. 
@@ -66,6 +67,7 @@ TUN_TYPE="geneve"
 VERBOSE_LOG=false
 ON_DELETE=false
 COVERAGE=false
+PROMETHEUS=false
 
 while [[ $# -gt 0 ]]
 do
@@ -100,6 +102,10 @@ case $key in
     NP=true
     shift
     ;;
+    --prometheus)
+    PROMETHEUS=true
+    shift
+    ;;
     --keep)
     KEEP=true
     shift
@@ -217,6 +223,11 @@ if $NP; then
     sed -i.bak -E "s/^[[:space:]]*#[[:space:]]*AntreaPolicy[[:space:]]*:[[:space:]]*[a-z]+[[:space:]]*$/ AntreaPolicy: true/" antrea-agent.conf
 fi
 
+if $PROMETHEUS; then
+    sed -i.bak -E "s/^[[:space:]]*#[[:space:]]*enablePrometheusMetrics[[:space:]]*:[[:space:]]*[a-z]+[[:space:]]*$/enablePrometheusMetrics: true/" antrea-controller.conf
+    sed -i.bak -E "s/^[[:space:]]*#[[:space:]]*enablePrometheusMetrics[[:space:]]*:[[:space:]]*[a-z]+[[:space:]]*$/enablePrometheusMetrics: true/" antrea-agent.conf
+fi
+
 if [[ $ENCAP_MODE != "" ]]; then
     sed -i.bak -E "s/^[[:space:]]*#[[:space:]]*trafficEncapMode[[:space:]]*:[[:space:]]*[a-z]+[[:space:]]*$/trafficEncapMode: $ENCAP_MODE/" antrea-agent.conf
 fi
diff --git a/hack/make-metrics-doc.sh b/hack/make-metrics-doc.sh
new file mode 100755
index 00000000000..61d341d0655
--- /dev/null
+++ b/hack/make-metrics-doc.sh
@@ -0,0 +1,138 @@
+#!/usr/bin/env bash
+
+# Copyright 2020 Antrea Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# The script deploys a Kind cluster with Prometheus metrics enabled. Then queries Antrea endpoints for metrics, and
+# outputs it to the console.
+# When a doc filename parameter is specified, the script updates the metrics list within the document with the metrics
+# from the Kind deployment.
+
+set -eo pipefail
+
+THIS_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
+
+# Path of the temporary CA certificate file; stays empty until the file has been created.
+certfile=""
+
+# Cleanup hook: removes the temporary CA certificate file, if any, and destroys the Kind cluster.
+function exit_handler() {
+    echo "Cleaning up..."
+    if [ -n "$certfile" ]; then
+        rm -f "$certfile"
+    fi
+    "$THIS_DIR/../ci/kind/kind-setup.sh" destroy kind
+}
+
+# Prints the metrics URL of a Pod running in the kube-system Namespace.
+#   $1 - name of the Pod
+function get_metrics_url() {
+    local pod_name=$1
+    local host_ip host_port
+    host_ip=$(kubectl get pod -n kube-system "$pod_name" -o jsonpath="{.status.hostIP}")
+    host_port=$(kubectl get pod -n kube-system "$pod_name" -o jsonpath="{.spec.containers[*].ports[*].hostPort}")
+
+    echo "https://$host_ip:$host_port/metrics"
+}
+
+# Prints the metrics list in Markdown: Antrea metrics first, then all other metrics, with one
+# "## <Prefix> Metrics" title per group of metrics sharing a name prefix.
+#   $1 - the '# HELP' and '# TYPE' lines scraped from the metrics endpoints
+function format_metrics() {
+    local sorted_metrics=$1
+    local metrics_types_unarranged metrics_types metrics_help
+    local metric metric_pfx metric_help
+    local last_pfx=""
+    # Gather list of metric names
+    metrics_types_unarranged=$(awk '/# TYPE/{print $3}' <<< "$sorted_metrics")
+    # Put Antrea-specific metrics at the beginning, push 3rd parties after
+    metrics_types=$(grep antrea <<< "$metrics_types_unarranged")$'\n'$(grep -v antrea <<< "$metrics_types_unarranged")
+    # Gather metrics descriptions
+    metrics_help=$(grep '# HELP' <<< "$sorted_metrics" | sed 's/\[.*\] //i')
+    echo 'Below is a list of metrics, provided by the components and by 3rd parties.'
+    # $metrics_types is left unquoted on purpose: word splitting yields one metric name per iteration.
+    for metric in $metrics_types; do
+        metric_pfx=$(sed 's/_/ /g' <<< "$metric" | awk '{print $1}')
+        if [ "$metric_pfx" == 'antrea' ]; then
+            # For Antrea metrics, add Agent, Controller to title
+            metric_pfx=$(sed 's/_/ /g' <<< "$metric" | awk '{print $1" "$2}')
+        fi
+        if [ "$last_pfx" != "$metric_pfx" ]; then
+            echo
+            # Output 3rd party metrics title
+            if [[ "$last_pfx" =~ ^antrea.* ]] && [[ ! "$metric_pfx" =~ ^antrea.* ]]; then
+                echo "## Common Metrics Provided by Infrastructure"
+            fi
+            # Output metrics title
+            echo "## $(sed -e "s/\b\(.\)/\u\1/g" <<< "$metric_pfx") Metrics"
+            last_pfx=$metric_pfx
+        fi
+        # Some metrics are exposed without a description: tolerate an empty match.
+        metric_help=$(grep " $metric " <<< "$metrics_help" | sed "s/.*$metric //") || true
+        echo "**$metric:** $metric_help"
+    done
+}
+
+if [ "$1" == "-h" ] || [ "$1" == "--help" ]; then
+    echo 'Usage: make-metrics-doc.sh [-h|--help|<metrics_doc>]'
+    exit 0
+fi
+metrics_doc=$1
+if [ "$metrics_doc" != "" ] && [ ! -f "$metrics_doc" ]; then
+    echo "Metrics document not found at $metrics_doc"
+    exit 1
+fi
+
+# Register the cleanup only once the arguments are validated, so that the --help and error paths above
+# do not attempt to destroy a cluster. The INT trap turns Ctrl-C into a regular exit, so that the cleanup
+# runs exactly once through the EXIT trap.
+trap exit_handler EXIT
+trap 'exit 130' INT
+
+# Initialize a Kind Antrea cluster
+"$THIS_DIR/../ci/kind/kind-setup.sh" create kind --prometheus --num-workers 0
+
+# Wait for Antrea components to be ready, allow Antrea inits to complete
+kubectl -n kube-system wait --for=condition=ready --timeout=120s pod -l app=antrea
+sleep 30
+
+# Extract Antrea credentials
+certfile=$(mktemp /tmp/cacert.XXXXXX.ca)
+secret_name=$(kubectl get serviceaccounts -n monitoring prometheus -o jsonpath="{.secrets[*].name}")
+kubectl get secrets -n monitoring "$secret_name" -o jsonpath="{.data.ca\.crt}" | base64 -d > "$certfile"
+token=$(kubectl get secrets -n monitoring "$secret_name" --template "{{.data.token}}" | base64 -d)
+
+# Find agent, controller pods
+controller_pod=$(kubectl get pod -n kube-system | awk '/antrea-controller/{print $1}')
+agent_pod=$(kubectl get pod -n kube-system | awk '/antrea-agent/{print $1}' | head -n1)
+
+agent_metrics_url=$(get_metrics_url "$agent_pod")
+controller_metrics_url=$(get_metrics_url "$controller_pod")
+
+# Retrieve agent and controller metrics
+agent_metrics=$(curl -fsk -H "Authorization: Bearer $token" --cacert "$certfile" "$agent_metrics_url" | grep '^#')
+controller_metrics=$(curl -fsk -H "Authorization: Bearer $token" --cacert "$certfile" "$controller_metrics_url" | grep '^#')
+
+# Sort metrics, eliminate duplicates e.g. apiserver etc
+sorted_metrics=$(sort -u <<< "${agent_metrics}"$'\n'"${controller_metrics}")
+
+# Format metrics
+formatted_metrics=$(format_metrics "$sorted_metrics")
+
+if [ "$metrics_doc" == "" ]; then
+    fmt -w 80 -s <<< "$formatted_metrics"
+else
+    sed -i '/^Below is a list of metrics, provided by the components and by 3rd parties.$/,$d' "$metrics_doc"
+    fmt -w 80 -s <<< "$formatted_metrics" >> "$metrics_doc"
+fi