Skip to content

Commit

Permalink
fix(ci): tester/builder start race conditions (#10893)
Browse files Browse the repository at this point in the history
- lock around builder install
- reuse code in tester/builder
  • Loading branch information
ludamad authored Dec 20, 2024
1 parent 57439a7 commit 4250782
Show file tree
Hide file tree
Showing 8 changed files with 75 additions and 64 deletions.
2 changes: 1 addition & 1 deletion .github/ensure-builder/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ runs:
if [[ $TYPE == builder-x86 ]]; then
# 128-core x86 instance types, aws chooses for us based on capacity
echo "instance_type=m6a.32xlarge m6i.32xlarge m6in.32xlarge m7a.32xlarge r6a.32xlarge r6i.32xlarge r6in.32xlarge" >> $GITHUB_OUTPUT
echo "ami_id=ami-052a1e16394277fdf" >> $GITHUB_OUTPUT
echo "ami_id=ami-044f8e99cd65daf76" >> $GITHUB_OUTPUT
echo "runner_concurrency=20" >> $GITHUB_OUTPUT
echo "runner_label=$USERNAME-x86" >> $GITHUB_OUTPUT
echo "ttl=40" >> $GITHUB_OUTPUT
Expand Down
11 changes: 11 additions & 0 deletions .github/ensure-builder/chron
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
#!/usr/bin/env bash
# One-time cron configuration for a builder instance.
#
# Copies scripts/ci/maybe_exit_spot.sh into the home directory and installs a
# crontab that runs it every minute. The source path is relative, so this must
# be started from the repository root.
#
# NOTE: `crontab -` replaces the user's whole crontab with the single entry
# below; any other entries are dropped.
set -eu

exit_script=~/maybe_exit_spot.sh
# The tilde stays literal here; it is expanded when cron runs the entry.
cron_entry="* * * * * ~/maybe_exit_spot.sh"

# Treat the instance as configured only when the script is in place AND the
# cron entry is actually installed. Checking for the copied file alone would
# report "already configured" forever if an earlier run died between the copy
# and the crontab install.
if [ -x "$exit_script" ] && crontab -l 2>/dev/null | grep -qxF -- "$cron_entry"; then
  echo "Chron jobs already configured."
else
  cp scripts/ci/maybe_exit_spot.sh "$exit_script"
  chmod +x "$exit_script"
  # Run maybe_exit_spot.sh every minute
  echo "$cron_entry" | crontab -
  echo "Configured instance exit cron job."
fi
Original file line number Diff line number Diff line change
Expand Up @@ -32,12 +32,10 @@ if ! command -v jq >/dev/null; then
sudo mv ./jq /usr/bin/jq
fi
function install_parallel() {
sudo apt update && sudo apt install parallel
sudo apt update && sudo systemctl stop unattended-upgrades && sudo apt install parallel
}
export -f install_parallel
if ! command -v parallel >/dev/null; then
DENOISE=1 ci3/denoise install_parallel
DENOISE=1 ci3/denoise ci3/retry install_parallel
fi
[ -x /usr/local/bin/earthly ] || ci3/dump_fail ci3/earthly_install
# Run the test.
ci3/aws_handle_evict "set -eu; $2"
[ -x /usr/local/bin/earthly ] || ci3/dump_fail ci3/earthly_install
34 changes: 21 additions & 13 deletions .github/ensure-builder/run
Original file line number Diff line number Diff line change
@@ -1,26 +1,34 @@
#!/usr/bin/env bash
set -u
exit_code=254
ttl=$1
scripts/run_on_builder "
set -eu;
sudo shutdown -P $ttl;
if ! [ -d ~/run-$RUN_ID ]; then
mkdir -p ~/run-$RUN_ID;
cd ~/run-$RUN_ID;
git init >/dev/null 2>&1;
git remote add origin https://github.com/aztecprotocol/aztec-packages >/dev/null 2>&1;
git fetch --depth 1 origin $GIT_COMMIT >/dev/null 2>&1;
git checkout FETCH_HEAD >/dev/null 2>&1;
fi;
cd ~/run-$RUN_ID;
.github/ensure-builder/wrapper $DOCKERHUB_PASSWORD '$INPUT';
function clone {
if ! [ -d ~/run-$RUN_ID ]; then
mkdir -p ~/run-$RUN_ID;
cd ~/run-$RUN_ID;
git init >/dev/null 2>&1;
git remote add origin https://github.com/aztecprotocol/aztec-packages >/dev/null 2>&1;
git fetch --depth 1 origin $GIT_COMMIT >/dev/null 2>&1;
git checkout FETCH_HEAD >/dev/null 2>&1;
fi;
}
export RUN_ID GIT_COMMIT
export -f clone
flock /var/lock/clone.lock bash -c clone
cd ~/run-$RUN_ID
flock /var/lock/install.lock .github/ensure-builder/chron
flock /var/lock/install.lock .github/ensure-builder/install $DOCKERHUB_PASSWORD
ci3/aws_handle_evict 'set -eu; $INPUT'
"
exit_code=$?
if [ $exit_code = 255 ]; then
echo "Treating ssh termination as spot eviction.";
echo "Treating ssh termination as spot eviction."
exit_code=155
fi;
echo "exit_code=$exit_code" >> $GITHUB_OUTPUT;
fi
echo "exit_code=$exit_code" >> $GITHUB_OUTPUT
if [ $exit_code = 155 ]; then
echo "Spot eviction detected - retrying with on-demand."
fi
30 changes: 19 additions & 11 deletions .github/ensure-tester/run
Original file line number Diff line number Diff line change
@@ -1,19 +1,27 @@
#!/usr/bin/env bash
set -u
exit_code=254
ttl=$1
scripts/run_on_tester "
set -eu;
sudo shutdown -P $ttl;
if ! [ -d ~/run-$RUN_ID ]; then
mkdir -p ~/run-$RUN_ID;
cd ~/run-$RUN_ID;
git init >/dev/null 2>&1;
git remote add origin https://github.com/aztecprotocol/aztec-packages >/dev/null 2>&1;
git fetch --depth 1 origin $GIT_COMMIT >/dev/null 2>&1;
git checkout FETCH_HEAD >/dev/null 2>&1;
fi;
cd ~/run-$RUN_ID;
.github/ensure-tester/wrapper $DOCKERHUB_PASSWORD '$INPUT';
function clone {
if ! [ -d ~/run-$RUN_ID ]; then
mkdir -p ~/run-$RUN_ID;
cd ~/run-$RUN_ID;
git init >/dev/null 2>&1;
git remote add origin https://github.com/aztecprotocol/aztec-packages >/dev/null 2>&1;
git fetch --depth 1 origin $GIT_COMMIT >/dev/null 2>&1;
git checkout FETCH_HEAD >/dev/null 2>&1;
fi;
}
export RUN_ID GIT_COMMIT
export -f clone
flock /var/lock/clone.lock bash -c clone
cd ~/run-$RUN_ID
# reuse script from ensure-builder, but don't set up chron
flock /var/lock/install.lock .github/ensure-builder/install $DOCKERHUB_PASSWORD
ci3/aws_handle_evict 'set -eu; $INPUT'
"
exit_code=$?
if [ $exit_code = 255 ]; then
Expand All @@ -23,4 +31,4 @@ fi
echo "exit_code=$exit_code" >> $GITHUB_OUTPUT
if [ $exit_code = 155 ]; then
echo "Spot eviction detected - retrying with on-demand."
fi
fi
33 changes: 0 additions & 33 deletions .github/ensure-tester/wrapper

This file was deleted.

2 changes: 1 addition & 1 deletion .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,7 @@ jobs:
steps:
- uses: actions/checkout@v4
with: { ref: "${{ env.GIT_COMMIT }}" }
- name: "CI (l1-contracts, avm-transpiler, noir-projects, yarn-project)"
- name: "Bootstrap (l1-contracts, avm-transpiler, noir-projects, yarn-project)"
uses: ./.github/ensure-builder
timeout-minutes: 40
with:
Expand Down
19 changes: 19 additions & 0 deletions ci3/retry
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
#!/bin/bash
# Usage: retry <command> [args...]
#
# Joins all arguments into one string, evaluates it with `eval`, and repeats
# until it succeeds or the attempts are used up.
#
# Environment:
#   RETRY_DISABLED      if non-empty, run the command exactly once under `set -e`
#   RETRY_ATTEMPTS      maximum number of attempts (default: 3)
#   RETRY_DELAY         seconds to sleep between attempts (default: 5)
#   BUILD_SYSTEM_DEBUG  if 1, trace execution with `set -x`
#
# Exit status: 0 as soon as one attempt succeeds, 1 after every attempt has
# failed (a message goes to stderr). With RETRY_DISABLED set, the status of the
# command itself.
set -u # not -e
[ "${BUILD_SYSTEM_DEBUG:-}" = 1 ] && set -x

retry() {
  local attempts="${RETRY_ATTEMPTS:-3}"
  local delay="${RETRY_DELAY:-5}"
  local i

  if [ -n "${RETRY_DISABLED:-}" ]; then
    # Single attempt; -e makes a multi-command string stop at its first failure.
    set -e
    eval "$*"
    return
  fi

  for ((i = 1; i <= attempts; i++)); do
    eval "$*" && return 0
    # No point in waiting once the last attempt has failed.
    [ "$i" != "$attempts" ] && sleep "$delay"
  done

  >&2 echo "$* failed after $attempts attempts"
  return 1
}

# Without a command there is nothing to evaluate: fall through and finish with
# status 0, which is what evaluating an empty string would give.
if [ "$#" -gt 0 ]; then
  retry "$@"
  exit
fi

0 comments on commit 4250782

Please sign in to comment.