From 68f919a3cfe34e1150ef67c721b8cade00b3bca1 Mon Sep 17 00:00:00 2001 From: Maru Newby Date: Thu, 25 Jul 2024 10:08:02 -0700 Subject: [PATCH] [ci] Remove defunct network outage sim workflow --- .github/actionlint.yml | 1 - .github/workflows/cleanup-net-outage-sim.sh | 13 --- .github/workflows/net-outage-sim.yml | 32 ------- .github/workflows/run-net-outage-sim.sh | 99 --------------------- 4 files changed, 145 deletions(-) delete mode 100755 .github/workflows/cleanup-net-outage-sim.sh delete mode 100644 .github/workflows/net-outage-sim.yml delete mode 100755 .github/workflows/run-net-outage-sim.sh diff --git a/.github/actionlint.yml b/.github/actionlint.yml index 2e6d753c2282..62bee747e6f0 100644 --- a/.github/actionlint.yml +++ b/.github/actionlint.yml @@ -2,4 +2,3 @@ self-hosted-runner: labels: - custom-arm64-focal - custom-arm64-jammy - - net-outage-sim diff --git a/.github/workflows/cleanup-net-outage-sim.sh b/.github/workflows/cleanup-net-outage-sim.sh deleted file mode 100755 index bcb9862f9efb..000000000000 --- a/.github/workflows/cleanup-net-outage-sim.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/usr/bin/env bash - -set -euo pipefail - -### -# cleanup removes the docker instance and the network -echo "Cleaning up..." -# shellcheck disable=SC2046 -docker rm $(sudo docker stop $(sudo docker ps -a -q --filter ancestor=avaplatform/avalanchego:latest --format="{{.ID}}")) #if the filter returns nothing the command fails, so ignore errors -docker network rm controlled-net -rm /opt/mainnet-db-daily* 2>/dev/null -rm -rf /var/lib/avalanchego 2>/dev/null -echo "Done cleaning up" diff --git a/.github/workflows/net-outage-sim.yml b/.github/workflows/net-outage-sim.yml deleted file mode 100644 index af6af82fb4c1..000000000000 --- a/.github/workflows/net-outage-sim.yml +++ /dev/null @@ -1,32 +0,0 @@ -name: network-outage-simulation - -on: - workflow_dispatch: - schedule: - # * is a special character in YAML so you have to quote this string - # Run every day at 7 AM. (The database backup is created around 5 AM.) - - cron: "0 7 * * *" - -jobs: - run_sim: - runs-on: [self-hosted, linux, x64, net-outage-sim] - - steps: - - name: Checkout - uses: actions/checkout@v4 - - - name: Cleanup docker (avoid conflicts with previous runs) - shell: bash - run: .github/workflows/cleanup-net-outage-sim.sh - - - name: Download avalanchego:latest - run: docker pull avaplatform/avalanchego:latest - - - name: Run the internet outage simulation - shell: bash - run: .github/workflows/run-net-outage-sim.sh - - - name: Cleanup again - if: always() # Always clean up - shell: bash - run: .github/workflows/cleanup-net-outage-sim.sh diff --git a/.github/workflows/run-net-outage-sim.sh b/.github/workflows/run-net-outage-sim.sh deleted file mode 100755 index 809d290e1d16..000000000000 --- a/.github/workflows/run-net-outage-sim.sh +++ /dev/null @@ -1,99 +0,0 @@ -#!/usr/bin/env bash - -set -euo pipefail - -SUCCESS=1 - -# Polls AvalancheGo until it's healthy. When it is, -# sets SUCCESS to 0 and returns. If AvalancheGo -# doesn't become healthy within 3 hours, sets -# SUCCESS to 1 and returns. -wait_until_healthy () { - # timeout: if after 3 hours it is not healthy, return - stop=$(date -d "+ 3 hour" +%s) - # store the response code here - response=0 - # while the endpoint doesn't return 200 - while [ "$response" -ne 200 ] - do - echo "Checking if local node is healthy..." - # Ignore error in case of ephemeral failure to hit node's API - response=$(curl --write-out '%{http_code}' --silent --output /dev/null localhost:9650/ext/health) - echo "got status code $response from health endpoint" - # check that 3 hours haven't passed - now=$(date +%s) - if [ "$now" -ge "$stop" ]; - then - # timeout: exit - SUCCESS=1 - return - fi - # no timeout yet, wait 30s until retry - sleep 30 - done - # response returned 200, therefore exit - echo "Node became healthy" - SUCCESS=0 -} - -#remove any existing database files -echo "removing existing database files..." -rm /opt/mainnet-db-daily* 2>/dev/null || true # Do || true to ignore error if files dont exist yet -rm -rf /var/lib/avalanchego 2>/dev/null || true # Do || true to ignore error if files dont exist yet -echo "done existing database files" - -#download latest mainnet DB backup -FILENAME="mainnet-db-daily-" -DATE=$(date +'%m-%d-%Y') -DB_FILE="$FILENAME$DATE" -echo "Copying database file $DB_FILE from S3 to local..." -aws s3 cp s3://avalanche-db-daily/ /opt/ --no-progress --recursive --exclude "*" --include "$DB_FILE*" -echo "Done downloading database" - -# extract DB -echo "Extracting database..." -mkdir -p /var/lib/avalanchego/db -tar -zxf /opt/"$DB_FILE"*-tar.gz -C /var/lib/avalanchego/db -echo "Done extracting database" - -echo "Creating Docker network..." -docker network create controlled-net - -echo "Starting Docker container..." -containerID=$(docker run --name="net_outage_simulation" --memory="12g" --memory-reservation="11g" --cpus="6.0" --net=controlled-net -p 9650:9650 -p 9651:9651 -v /var/lib/avalanchego/db:/db -d avaplatform/avalanchego:latest /avalanchego/build/avalanchego --db-dir /db --http-host=0.0.0.0) - -echo "Waiting 30 seconds for node to start..." -sleep 30 -echo "Waiting until healthy..." -wait_until_healthy -if [ $SUCCESS -eq 1 ]; -then - echo "Timed out waiting for node to become healthy; exiting." - exit 1 -fi - -# To simulate internet outage, we will disable the docker network connection -echo "Disconnecting node from internet..." -docker network disconnect controlled-net "$containerID" -echo "Sleeping 60 minutes..." -sleep 3600 -echo "Reconnecting node to internet..." -docker network connect controlled-net "$containerID" -echo "Reconnected to internet. Waiting until healthy..." - -# now repeatedly check the node's health until it returns healthy -start=$(date +%s) -SUCCESS=-1 -wait_until_healthy -if [ $SUCCESS -eq 1 ]; -then - echo "Timed out waiting for node to become healthy after outage; exiting." - exit 1 -fi - -# The node returned healthy, print how long it took -end=$(date +%s) - -DELAY=$((end - start)) -echo "Node became healthy again after complete outage after $DELAY seconds." -echo "Test completed"