Skip to content

Commit

Permalink
tmpnet: Enable collection of logs and metrics (#2820)
Browse files Browse the repository at this point in the history
  • Loading branch information
marun authored Mar 18, 2024
1 parent 6249bab commit e88e565
Show file tree
Hide file tree
Showing 13 changed files with 568 additions and 24 deletions.
93 changes: 89 additions & 4 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ concurrency:

env:
go_version: '~1.21.8'
tmpnet_data_path: ~/.tmpnet/networks
grafana_url: https://grafana-experimental.avax-dev.network/d/kBQpRdWnk/avalanche-main-dashboard?orgId=1&refresh=10s&var-filter=is_ephemeral_node%7C%3D%7Cfalse&var-filter=gh_repo%7C%3D%7Cava-labs%2Favalanchego&var-filter=gh_run_id%7C%3D%7C${{ github.run_id }}&var-filter=gh_run_attempt%7C%3D%7C${{ github.run_attempt }}

jobs:
Unit:
Expand Down Expand Up @@ -67,15 +67,44 @@ jobs:
- name: Build AvalancheGo Binary
shell: bash
run: ./scripts/build.sh -r
- name: Start prometheus
shell: bash
run: bash -x ./scripts/run_prometheus.sh
env:
PROMETHEUS_ID: ${{ secrets.PROMETHEUS_ID }}
PROMETHEUS_PASSWORD: ${{ secrets.PROMETHEUS_PASSWORD }}
- name: Start promtail
shell: bash
run: bash -x ./scripts/run_promtail.sh
env:
LOKI_ID: ${{ secrets.LOKI_ID }}
LOKI_PASSWORD: ${{ secrets.LOKI_PASSWORD }}
- name: Notify of metrics availability
shell: bash
run: .github/workflows/notify-metrics-availability.sh
env:
GRAFANA_URL: ${{ env.grafana_url }}
GH_JOB_ID: ${{ github.job }}
FILTER_BY_OWNER: avalanchego-e2e
- name: Run e2e tests
shell: bash
run: E2E_SERIAL=1 ./scripts/tests.e2e.sh
env:
GH_REPO: ${{ github.repository }}
GH_WORKFLOW: ${{ github.workflow }}
GH_RUN_ID: ${{ github.run_id }}
GH_RUN_NUMBER: ${{ github.run_number }}
GH_RUN_ATTEMPT: ${{ github.run_attempt }}
GH_JOB_ID: ${{ github.job }}
- name: Upload tmpnet network dir
uses: actions/upload-artifact@v4
if: always()
with:
name: e2e-tmpnet-data
path: ${{ env.tmpnet_data_path }}
path: |
~/.tmpnet/networks
~/.tmpnet/prometheus/prometheus.log
~/.tmpnet/promtail/promtail.log
if-no-files-found: error
e2e_existing_network:
runs-on: ubuntu-latest
Expand All @@ -88,15 +117,43 @@ jobs:
- name: Build AvalancheGo Binary
shell: bash
run: ./scripts/build.sh -r
- name: Start prometheus
shell: bash
run: bash -x ./scripts/run_prometheus.sh
env:
PROMETHEUS_ID: ${{ secrets.PROMETHEUS_ID }}
PROMETHEUS_PASSWORD: ${{ secrets.PROMETHEUS_PASSWORD }}
- name: Start promtail
shell: bash
run: bash -x ./scripts/run_promtail.sh
env:
LOKI_ID: ${{ secrets.LOKI_ID }}
LOKI_PASSWORD: ${{ secrets.LOKI_PASSWORD }}
- name: Notify of metrics availability
shell: bash
run: .github/workflows/notify-metrics-availability.sh
env:
GRAFANA_URL: ${{ env.grafana_url }}
GH_JOB_ID: ${{ github.job }}
- name: Run e2e tests with existing network
shell: bash
run: E2E_SERIAL=1 ./scripts/tests.e2e.existing.sh
env:
GH_REPO: ${{ github.repository }}
GH_WORKFLOW: ${{ github.workflow }}
GH_RUN_ID: ${{ github.run_id }}
GH_RUN_NUMBER: ${{ github.run_number }}
GH_RUN_ATTEMPT: ${{ github.run_attempt }}
GH_JOB_ID: ${{ github.job }}
- name: Upload tmpnet network dir
uses: actions/upload-artifact@v4
if: always()
with:
name: e2e-existing-network-tmpnet-data
path: ${{ env.tmpnet_data_path }}
path: |
~/.tmpnet/networks
~/.tmpnet/prometheus/prometheus.log
~/.tmpnet/promtail/promtail.log
if-no-files-found: error
Upgrade:
runs-on: ubuntu-latest
Expand All @@ -109,15 +166,43 @@ jobs:
- name: Build AvalancheGo Binary
shell: bash
run: ./scripts/build.sh
- name: Start prometheus
shell: bash
run: bash -x ./scripts/run_prometheus.sh
env:
PROMETHEUS_ID: ${{ secrets.PROMETHEUS_ID }}
PROMETHEUS_PASSWORD: ${{ secrets.PROMETHEUS_PASSWORD }}
- name: Start promtail
shell: bash
run: bash -x ./scripts/run_promtail.sh
env:
LOKI_ID: ${{ secrets.LOKI_ID }}
LOKI_PASSWORD: ${{ secrets.LOKI_PASSWORD }}
- name: Notify of metrics availability
shell: bash
run: .github/workflows/notify-metrics-availability.sh
env:
GRAFANA_URL: ${{ env.grafana_url }}
GH_JOB_ID: ${{ github.job }}
- name: Run e2e tests
shell: bash
run: ./scripts/tests.upgrade.sh
env:
GH_REPO: ${{ github.repository }}
GH_WORKFLOW: ${{ github.workflow }}
GH_RUN_ID: ${{ github.run_id }}
GH_RUN_NUMBER: ${{ github.run_number }}
GH_RUN_ATTEMPT: ${{ github.run_attempt }}
GH_JOB_ID: ${{ github.job }}
- name: Upload tmpnet network dir
uses: actions/upload-artifact@v4
if: always()
with:
name: upgrade-tmpnet-data
path: ${{ env.tmpnet_data_path }}
path: |
~/.tmpnet/networks
~/.tmpnet/prometheus/prometheus.log
~/.tmpnet/promtail/promtail.log
if-no-files-found: error
Lint:
runs-on: ubuntu-latest
Expand Down
19 changes: 19 additions & 0 deletions .github/workflows/notify-metrics-availability.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
#!/usr/bin/env bash

set -euo pipefail

# Timestamps are in seconds
from_timestamp="$(date '+%s')"
monitoring_period=900 # 15 minutes
to_timestamp="$((from_timestamp + monitoring_period))"

# Grafana expects microseconds, so pad timestamps with 3 zeros
metrics_url="${GRAFANA_URL}&var-filter=gh_job_id%7C%3D%7C${GH_JOB_ID}&from=${from_timestamp}000&to=${to_timestamp}000"

# Optionally ensure that the link displays metrics only for the shared
# network rather than mixing it with the results for private networks.
if [[ -n "${FILTER_BY_OWNER:-}" ]]; then
metrics_url="${metrics_url}&var-filter=network_owner%7C%3D%7C${FILTER_BY_OWNER}"
fi

echo "::notice links::metrics ${metrics_url}"
120 changes: 120 additions & 0 deletions scripts/run_prometheus.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
#!/usr/bin/env bash

set -euo pipefail

# Starts a prometheus instance in agent-mode, forwarding to a central
# instance. Intended to enable metrics collection from temporary networks running
# locally and in CI.
#
# The prometheus instance will remain running in the background and will forward
# metrics to the central instance for all tmpnet networks.
#
# To stop it:
#
# $ kill -9 `cat ~/.tmpnet/prometheus/run.pid` && rm ~/.tmpnet/prometheus/run.pid
#

# e.g.,
# PROMETHEUS_ID=<id> PROMETHEUS_PASSWORD=<password> ./scripts/run_prometheus.sh
if ! [[ "$0" =~ scripts/run_prometheus.sh ]]; then
echo "must be run from repository root"
exit 255
fi

PROMETHEUS_WORKING_DIR="${HOME}/.tmpnet/prometheus"
PIDFILE="${PROMETHEUS_WORKING_DIR}"/run.pid

# First check if an agent-mode prometheus is already running. A single instance can collect
# metrics from all local temporary networks.
if pgrep --pidfile="${PIDFILE}" -f 'prometheus.*enable-feature=agent' &> /dev/null; then
echo "prometheus is already running locally with --enable-feature=agent"
exit 0
fi

PROMETHEUS_URL="${PROMETHEUS_URL:-https://prometheus-experimental.avax-dev.network}"
if [[ -z "${PROMETHEUS_URL}" ]]; then
echo "Please provide a value for PROMETHEUS_URL"
exit 1
fi

PROMETHEUS_ID="${PROMETHEUS_ID:-}"
if [[ -z "${PROMETHEUS_ID}" ]]; then
echo "Please provide a value for PROMETHEUS_ID"
exit 1
fi

PROMETHEUS_PASSWORD="${PROMETHEUS_PASSWORD:-}"
if [[ -z "${PROMETHEUS_PASSWORD}" ]]; then
echo "Plase provide a value for PROMETHEUS_PASSWORD"
exit 1
fi

# This was the LTS version when this script was written. Probably not
# much reason to update it unless something breaks since the usage
# here is only to collect metrics from temporary networks.
VERSION="2.45.3"

# Ensure the prometheus command is locally available
CMD=prometheus
if ! command -v "${CMD}" &> /dev/null; then
# Try to use a local version
CMD="${PWD}/bin/prometheus"
if ! command -v "${CMD}" &> /dev/null; then
echo "prometheus not found, attempting to install..."

# Determine the arch
if which sw_vers &> /dev/null; then
echo "on macos, only amd64 binaries are available so rosetta is required on apple silicon machines."
echo "to avoid using rosetta, install via homebrew: brew install prometheus"
DIST=darwin
else
ARCH="$(uname -i)"
if [[ "${ARCH}" != "x86_64" ]]; then
echo "on linux, only amd64 binaries are available. manual installation of prometheus is required."
exit 1
else
DIST="linux"
fi
fi

# Install the specified release
PROMETHEUS_FILE="prometheus-${VERSION}.${DIST}-amd64"
URL="https://github.com/prometheus/prometheus/releases/download/v${VERSION}/${PROMETHEUS_FILE}.tar.gz"
curl -s -L "${URL}" | tar zxv -C /tmp > /dev/null
mkdir -p "$(dirname "${CMD}")"
cp /tmp/"${PROMETHEUS_FILE}/prometheus" "${CMD}"
fi
fi

# Configure prometheus
FILE_SD_PATH="${PROMETHEUS_WORKING_DIR}/file_sd_configs"
mkdir -p "${FILE_SD_PATH}"

echo "writing configuration..."
cat >"${PROMETHEUS_WORKING_DIR}"/prometheus.yaml <<EOL
# my global config
global:
# Make sure this value takes into account the network-shutdown-delay in tests/fixture/e2e/env.go
scrape_interval: 10s # Default is every 1 minute.
evaluation_interval: 10s # The default is every 1 minute.
scrape_timeout: 5s # The default is every 10s
scrape_configs:
- job_name: "avalanchego"
metrics_path: "/ext/metrics"
file_sd_configs:
- files:
- '${FILE_SD_PATH}/*.json'
remote_write:
- url: "${PROMETHEUS_URL}/api/v1/write"
basic_auth:
username: "${PROMETHEUS_ID}"
password: "${PROMETHEUS_PASSWORD}"
EOL

echo "starting prometheus..."
cd "${PROMETHEUS_WORKING_DIR}"
nohup "${CMD}" --config.file=prometheus.yaml --web.listen-address=localhost:0 --enable-feature=agent > prometheus.log 2>&1 &
echo $! > "${PIDFILE}"
echo "running with pid $(cat "${PIDFILE}")"
Loading

0 comments on commit e88e565

Please sign in to comment.