From 42507821892447502ec4a3be9d174ec3df855857 Mon Sep 17 00:00:00 2001
From: ludamad
Date: Fri, 20 Dec 2024 16:07:14 -0500
Subject: [PATCH] fix(ci): tester/builder start race conditions (#10893)

- lock around builder install
- reuse code in tester/builder
---
Review notes (this section is not part of the commit message):
- ci3/retry: the RETRY_DISABLED branch now evals "$*" (the joined command
  line), the same expression the retry loop below it uses. It previously
  evaluated a lone "$", which is not a command, so the requested command
  never ran on that path.
- ci3/retry: the failure message interpolates $* rather than $@ inside the
  quoted string; the printed text is unchanged.
- The blob id on the ci3/retry "index" line predates these two edits.

 .github/ensure-builder/action.yml           |  2 +-
 .github/ensure-builder/chron                | 11 +++++++
 .github/ensure-builder/{wrapper => install} |  8 ++---
 .github/ensure-builder/run                  | 34 +++++++++++++--------
 .github/ensure-tester/run                   | 30 +++++++++++-------
 .github/ensure-tester/wrapper               | 33 --------------------
 .github/workflows/ci.yml                    |  2 +-
 ci3/retry                                   | 19 ++++++++++++
 8 files changed, 75 insertions(+), 64 deletions(-)
 create mode 100755 .github/ensure-builder/chron
 rename .github/ensure-builder/{wrapper => install} (89%)
 delete mode 100755 .github/ensure-tester/wrapper
 create mode 100755 ci3/retry

diff --git a/.github/ensure-builder/action.yml b/.github/ensure-builder/action.yml
index d1f39fff679..6ff89d78471 100644
--- a/.github/ensure-builder/action.yml
+++ b/.github/ensure-builder/action.yml
@@ -31,7 +31,7 @@ runs:
         if [[ $TYPE == builder-x86 ]]; then
           # 128-core x86 instance types, aws chooses for us based on capacity
           echo "instance_type=m6a.32xlarge m6i.32xlarge m6in.32xlarge m7a.32xlarge r6a.32xlarge r6i.32xlarge r6in.32xlarge" >> $GITHUB_OUTPUT
-          echo "ami_id=ami-052a1e16394277fdf" >> $GITHUB_OUTPUT
+          echo "ami_id=ami-044f8e99cd65daf76" >> $GITHUB_OUTPUT
           echo "runner_concurrency=20" >> $GITHUB_OUTPUT
           echo "runner_label=$USERNAME-x86" >> $GITHUB_OUTPUT
           echo "ttl=40" >> $GITHUB_OUTPUT
diff --git a/.github/ensure-builder/chron b/.github/ensure-builder/chron
new file mode 100755
index 00000000000..591ce2fd870
--- /dev/null
+++ b/.github/ensure-builder/chron
@@ -0,0 +1,11 @@
+set -eu
+# One-time config only on builder.
+if ! [ -f ~/maybe_exit_spot.sh ] ; then
+  cp scripts/ci/maybe_exit_spot.sh ~/maybe_exit_spot.sh
+  # Run maybe_exit_spot.sh every minute
+  chmod +x ~/maybe_exit_spot.sh
+  echo "* * * * * ~/maybe_exit_spot.sh" | crontab -
+  echo "Configured instance exit cron job."
+else
+  echo "Chron jobs already configured."
+fi
\ No newline at end of file
diff --git a/.github/ensure-builder/wrapper b/.github/ensure-builder/install
similarity index 89%
rename from .github/ensure-builder/wrapper
rename to .github/ensure-builder/install
index a3d2dcaf994..336b20d68d0 100755
--- a/.github/ensure-builder/wrapper
+++ b/.github/ensure-builder/install
@@ -32,12 +32,10 @@ if ! command -v jq >/dev/null; then
   sudo mv ./jq /usr/bin/jq
 fi
 function install_parallel() {
-  sudo apt update && sudo apt install parallel
+  sudo apt update && sudo systemctl stop unattended-upgrades && sudo apt install parallel
 }
 export -f install_parallel
 if ! command -v parallel >/dev/null; then
-  DENOISE=1 ci3/denoise install_parallel
+  DENOISE=1 ci3/denoise ci3/retry install_parallel
 fi
-[ -x /usr/local/bin/earthly ] || ci3/dump_fail ci3/earthly_install
-# Run the test.
-ci3/aws_handle_evict "set -eu; $2"
\ No newline at end of file
+[ -x /usr/local/bin/earthly ] || ci3/dump_fail ci3/earthly_install
\ No newline at end of file
diff --git a/.github/ensure-builder/run b/.github/ensure-builder/run
index 4a85aba0751..fe1b3ff8d0b 100755
--- a/.github/ensure-builder/run
+++ b/.github/ensure-builder/run
@@ -1,26 +1,34 @@
 #!/usr/bin/env bash
+set -u
 exit_code=254
 ttl=$1
 scripts/run_on_builder "
   set -eu;
   sudo shutdown -P $ttl;
-  if ! [ -d ~/run-$RUN_ID ]; then
-    mkdir -p ~/run-$RUN_ID;
-    cd ~/run-$RUN_ID;
-    git init >/dev/null 2>&1;
-    git remote add origin https://github.com/aztecprotocol/aztec-packages >/dev/null 2>&1;
-    git fetch --depth 1 origin $GIT_COMMIT >/dev/null 2>&1;
-    git checkout FETCH_HEAD >/dev/null 2>&1;
-  fi;
-  cd ~/run-$RUN_ID;
-  .github/ensure-builder/wrapper $DOCKERHUB_PASSWORD '$INPUT';
+  function clone {
+    if ! [ -d ~/run-$RUN_ID ]; then
+      mkdir -p ~/run-$RUN_ID;
+      cd ~/run-$RUN_ID;
+      git init >/dev/null 2>&1;
+      git remote add origin https://github.com/aztecprotocol/aztec-packages >/dev/null 2>&1;
+      git fetch --depth 1 origin $GIT_COMMIT >/dev/null 2>&1;
+      git checkout FETCH_HEAD >/dev/null 2>&1;
+    fi;
+  }
+  export RUN_ID GIT_COMMIT
+  export -f clone
+  flock /var/lock/clone.lock bash -c clone
+  cd ~/run-$RUN_ID
+  flock /var/lock/install.lock .github/ensure-builder/chron
+  flock /var/lock/install.lock .github/ensure-builder/install $DOCKERHUB_PASSWORD
+  ci3/aws_handle_evict 'set -eu; $INPUT'
 "
 exit_code=$?
 if [ $exit_code = 255 ]; then
-  echo "Treating ssh termination as spot eviction.";
+  echo "Treating ssh termination as spot eviction."
   exit_code=155
-fi;
-echo "exit_code=$exit_code" >> $GITHUB_OUTPUT;
+fi
+echo "exit_code=$exit_code" >> $GITHUB_OUTPUT
 if [ $exit_code = 155 ]; then
   echo "Spot eviction detected - retrying with on-demand."
 fi
\ No newline at end of file
diff --git a/.github/ensure-tester/run b/.github/ensure-tester/run
index dc1e8a75f21..0bf096e1517 100755
--- a/.github/ensure-tester/run
+++ b/.github/ensure-tester/run
@@ -1,19 +1,27 @@
 #!/usr/bin/env bash
+set -u
 exit_code=254
 ttl=$1
 scripts/run_on_tester "
   set -eu;
   sudo shutdown -P $ttl;
-  if ! [ -d ~/run-$RUN_ID ]; then
-    mkdir -p ~/run-$RUN_ID;
-    cd ~/run-$RUN_ID;
-    git init >/dev/null 2>&1;
-    git remote add origin https://github.com/aztecprotocol/aztec-packages >/dev/null 2>&1;
-    git fetch --depth 1 origin $GIT_COMMIT >/dev/null 2>&1;
-    git checkout FETCH_HEAD >/dev/null 2>&1;
-  fi;
-  cd ~/run-$RUN_ID;
-  .github/ensure-tester/wrapper $DOCKERHUB_PASSWORD '$INPUT';
+  function clone {
+    if ! [ -d ~/run-$RUN_ID ]; then
+      mkdir -p ~/run-$RUN_ID;
+      cd ~/run-$RUN_ID;
+      git init >/dev/null 2>&1;
+      git remote add origin https://github.com/aztecprotocol/aztec-packages >/dev/null 2>&1;
+      git fetch --depth 1 origin $GIT_COMMIT >/dev/null 2>&1;
+      git checkout FETCH_HEAD >/dev/null 2>&1;
+    fi;
+  }
+  export RUN_ID GIT_COMMIT
+  export -f clone
+  flock /var/lock/clone.lock bash -c clone
+  cd ~/run-$RUN_ID
+  # reuse script from ensure-builder, but don't set up chron
+  flock /var/lock/install.lock .github/ensure-builder/install $DOCKERHUB_PASSWORD
+  ci3/aws_handle_evict 'set -eu; $INPUT'
 "
 exit_code=$?
 if [ $exit_code = 255 ]; then
@@ -23,4 +31,4 @@ fi
 echo "exit_code=$exit_code" >> $GITHUB_OUTPUT
 if [ $exit_code = 155 ]; then
   echo "Spot eviction detected - retrying with on-demand."
-fi
+fi
\ No newline at end of file
diff --git a/.github/ensure-tester/wrapper b/.github/ensure-tester/wrapper
deleted file mode 100755
index b65b45e2dfa..00000000000
--- a/.github/ensure-tester/wrapper
+++ /dev/null
@@ -1,33 +0,0 @@
-#!/usr/bin/env bash
-set -eu
-# One-time config.
-mkdir -p ~/.ssh
-echo $BUILD_INSTANCE_SSH_KEY | base64 --decode > ~/.ssh/build_instance_key
-chmod 600 ~/.ssh/build_instance_key
-set +x
-# Ensure docker is active.
-echo "Waiting for Docker service to become active..."
-i=0
-set +x
-while ! systemctl is-active --quiet docker; do
-  sleep 2
-  if [ $(( i++ )) -gt 60 ]; then
-    echo "Docker service not found! Report this."
-    exit 1
-  fi
-done
-echo $1 | ci3/dump_fail docker login -u aztecprotocolci --password-stdin
-ci3/dump_fail wget https://github.com/mikefarah/yq/releases/download/v4.44.3/yq_linux_$(ci3/arch) -O ./yq 2>&1 >/dev/null
-chmod +x ./yq
-sudo mv ./yq /usr/bin/yq
-ci3/dump_fail wget https://github.com/jqlang/jq/releases/download/jq-1.7.1/jq-linux-$(ci3/arch) -O ./jq
-chmod +x ./jq
-sudo mv ./jq /usr/bin/jq
-ci3/dump_fail ci3/earthly_install
-function install_parallel() {
-  sudo apt update && sudo apt install parallel
-}
-export -f install_parallel
-DENOISE=1 ci3/denoise install_parallel
-# Run command.
-ci3/aws_handle_evict "set -eu; $2"
\ No newline at end of file
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index c80e4086045..6b60b35f102 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -153,7 +153,7 @@ jobs:
     steps:
       - uses: actions/checkout@v4
         with: { ref: "${{ env.GIT_COMMIT }}" }
-      - name: "CI (l1-contracts, avm-transpiler, noir-projects, yarn-project)"
+      - name: "Bootstrap (l1-contracts, avm-transpiler, noir-projects, yarn-project)"
         uses: ./.github/ensure-builder
         timeout-minutes: 40
         with:
diff --git a/ci3/retry b/ci3/retry
new file mode 100755
index 00000000000..43351f9cef3
--- /dev/null
+++ b/ci3/retry
@@ -0,0 +1,19 @@
+#!/bin/bash
+set -u # not -e
+[ "${BUILD_SYSTEM_DEBUG:-}" = 1 ] && set -x
+
+if [ -n "${RETRY_DISABLED:-}" ]; then
+  set -e
+  eval "$*"
+  exit
+fi
+
+ATTEMPTS=3
+# Retries up to 3 times with 5 second intervals
+for i in $(seq 1 $ATTEMPTS); do
+  eval "$*" && exit
+  [ "$i" != "$ATTEMPTS" ] && sleep 5
+done
+
+>&2 echo "$* failed after $ATTEMPTS attempts"
+exit 1
\ No newline at end of file