From 68f919a3cfe34e1150ef67c721b8cade00b3bca1 Mon Sep 17 00:00:00 2001
From: Maru Newby <maru.newby@avalabs.org>
Date: Thu, 25 Jul 2024 10:08:02 -0700
Subject: [PATCH] [ci] Remove defunct network outage sim workflow

---
 .github/actionlint.yml                      |  1 -
 .github/workflows/cleanup-net-outage-sim.sh | 13 ---
 .github/workflows/net-outage-sim.yml        | 32 -------
 .github/workflows/run-net-outage-sim.sh     | 99 ---------------------
 4 files changed, 145 deletions(-)
 delete mode 100755 .github/workflows/cleanup-net-outage-sim.sh
 delete mode 100644 .github/workflows/net-outage-sim.yml
 delete mode 100755 .github/workflows/run-net-outage-sim.sh

diff --git a/.github/actionlint.yml b/.github/actionlint.yml
index 2e6d753c2282..62bee747e6f0 100644
--- a/.github/actionlint.yml
+++ b/.github/actionlint.yml
@@ -2,4 +2,3 @@ self-hosted-runner:
   labels:
     - custom-arm64-focal
     - custom-arm64-jammy
-    - net-outage-sim
diff --git a/.github/workflows/cleanup-net-outage-sim.sh b/.github/workflows/cleanup-net-outage-sim.sh
deleted file mode 100755
index bcb9862f9efb..000000000000
--- a/.github/workflows/cleanup-net-outage-sim.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/usr/bin/env bash
-
-set -euo pipefail
-
-###
-# cleanup removes the docker instance and the network
-echo "Cleaning up..."
-# shellcheck disable=SC2046
-docker rm $(sudo docker stop $(sudo docker ps -a -q --filter ancestor=avaplatform/avalanchego:latest --format="{{.ID}}"))  #if the filter returns nothing the command fails, so ignore errors
-docker network rm controlled-net
-rm /opt/mainnet-db-daily* 2>/dev/null
-rm -rf /var/lib/avalanchego 2>/dev/null
-echo "Done cleaning up"
diff --git a/.github/workflows/net-outage-sim.yml b/.github/workflows/net-outage-sim.yml
deleted file mode 100644
index af6af82fb4c1..000000000000
--- a/.github/workflows/net-outage-sim.yml
+++ /dev/null
@@ -1,32 +0,0 @@
-name: network-outage-simulation
-
-on:
-  workflow_dispatch:
-  schedule:
-    # * is a special character in YAML so you have to quote this string
-    # Run every day at 7 AM. (The database backup is created around 5 AM.)
-    - cron: "0 7 * * *"
-
-jobs:
-  run_sim:
-    runs-on: [self-hosted, linux, x64, net-outage-sim]
-
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v4
-
-      - name: Cleanup docker (avoid conflicts with previous runs)
-        shell: bash
-        run: .github/workflows/cleanup-net-outage-sim.sh
-
-      - name: Download avalanchego:latest
-        run: docker pull avaplatform/avalanchego:latest
-
-      - name: Run the internet outage simulation
-        shell: bash
-        run: .github/workflows/run-net-outage-sim.sh
-
-      - name: Cleanup again
-        if: always() # Always clean up
-        shell: bash
-        run: .github/workflows/cleanup-net-outage-sim.sh
diff --git a/.github/workflows/run-net-outage-sim.sh b/.github/workflows/run-net-outage-sim.sh
deleted file mode 100755
index 809d290e1d16..000000000000
--- a/.github/workflows/run-net-outage-sim.sh
+++ /dev/null
@@ -1,99 +0,0 @@
-#!/usr/bin/env bash
-
-set -euo pipefail
-
-SUCCESS=1
-
-# Polls AvalancheGo until it's healthy. When it is,
-# sets SUCCESS to 0 and returns. If AvalancheGo
-# doesn't become healthy within 3 hours, sets
-# SUCCESS to 1 and returns.
-wait_until_healthy () {
-  # timeout: if after 3 hours it is not healthy, return
-  stop=$(date -d "+ 3 hour" +%s)
-  # store the response code here
-  response=0
-  # while the endpoint doesn't return 200
-  while [ "$response" -ne 200 ]
-  do
-    echo "Checking if local node is healthy..."
-    # Ignore error in case of ephemeral failure to hit node's API
-    response=$(curl --write-out '%{http_code}' --silent --output /dev/null localhost:9650/ext/health)
-    echo "got status code $response from health endpoint"
-    # check that 3 hours haven't passed
-    now=$(date +%s)
-    if [ "$now" -ge "$stop" ];
-    then
-      # timeout: exit
-      SUCCESS=1
-      return
-    fi
-    # no timeout yet, wait 30s until retry
-    sleep 30
-  done
-  # response returned 200, therefore exit
-  echo "Node became healthy"
-  SUCCESS=0
-}
-
-#remove any existing database files
-echo "removing existing database files..."
-rm /opt/mainnet-db-daily* 2>/dev/null || true # Do || true to ignore error if files dont exist yet
-rm -rf /var/lib/avalanchego 2>/dev/null || true # Do || true to ignore error if files dont exist yet
-echo "done existing database files"
-
-#download latest mainnet DB backup
-FILENAME="mainnet-db-daily-"
-DATE=$(date +'%m-%d-%Y')
-DB_FILE="$FILENAME$DATE"
-echo "Copying database file $DB_FILE from S3 to local..."
-aws s3 cp s3://avalanche-db-daily/ /opt/ --no-progress --recursive --exclude "*" --include "$DB_FILE*"
-echo "Done downloading database"
-
-# extract DB
-echo "Extracting database..."
-mkdir -p /var/lib/avalanchego/db
-tar -zxf /opt/"$DB_FILE"*-tar.gz -C /var/lib/avalanchego/db
-echo "Done extracting database"
-
-echo "Creating Docker network..."
-docker network create controlled-net
-
-echo "Starting Docker container..."
-containerID=$(docker run --name="net_outage_simulation" --memory="12g" --memory-reservation="11g" --cpus="6.0" --net=controlled-net -p 9650:9650 -p 9651:9651 -v /var/lib/avalanchego/db:/db -d avaplatform/avalanchego:latest /avalanchego/build/avalanchego --db-dir /db --http-host=0.0.0.0)
-
-echo "Waiting 30 seconds for node to start..."
-sleep 30
-echo "Waiting until healthy..."
-wait_until_healthy
-if [ $SUCCESS -eq 1 ];
-then
-  echo "Timed out waiting for node to become healthy; exiting."
-  exit 1
-fi
-
-# To simulate internet outage, we will disable the docker network connection
-echo "Disconnecting node from internet..."
-docker network disconnect controlled-net "$containerID"
-echo "Sleeping 60 minutes..."
-sleep 3600
-echo "Reconnecting node to internet..."
-docker network connect controlled-net "$containerID"
-echo "Reconnected to internet. Waiting until healthy..."
-
-# now repeatedly check the node's health until it returns healthy
-start=$(date +%s)
-SUCCESS=-1
-wait_until_healthy
-if [ $SUCCESS -eq 1 ];
-then
-  echo "Timed out waiting for node to become healthy after outage; exiting."
-  exit 1
-fi
-
-# The node returned healthy, print how long it took
-end=$(date +%s)
-
-DELAY=$((end - start))
-echo "Node became healthy again after complete outage after $DELAY seconds."
-echo "Test completed"