Skip to content

Commit

Permalink
chore(ci): fix master, better spot copy times (#6374)
Browse files Browse the repository at this point in the history
- Copying to same EBS disk (/var/lib/docker) takes 20 seconds instead of
1:20
- Master had bad git hash in some situations
  • Loading branch information
ludamad authored May 13, 2024
1 parent ded28b7 commit fee7649
Show file tree
Hide file tree
Showing 6 changed files with 76 additions and 79 deletions.
10 changes: 8 additions & 2 deletions .github/ensure-tester-with-images/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -60,8 +60,14 @@ runs:
export BUILDER_SPOT_IP=${{ env.BUILDER_SPOT_IP }}
export BUILDER_SPOT_KEY=~/.ssh/build_instance_key
scripts/run_on_builder "
flock ${{ env.IMAGE_KEY }}.lock bash -c '! [ -f ${{ env.IMAGE_KEY }}.brotli ] && docker save ${{ inputs.builder_images_to_copy }} | brotli -2 > ${{ env.IMAGE_KEY }}.brotli'
cat ${{ env.IMAGE_KEY }}.brotli
sudo mkdir -p /var/lib/docker/tmp
sudo flock /var/lib/docker/tmp/${{ env.IMAGE_KEY }}.lock bash -c '
if ! [ -f /var/lib/docker/tmp/${{ env.IMAGE_KEY }}.brotli ] ; then
docker save aztecprotocol/aztec:${{ env.IMAGE_KEY }} aztecprotocol/end-to-end:${{ env.IMAGE_KEY }} | brotli -2 > /var/lib/docker/tmp/${{ env.IMAGE_KEY }}.brotli.tmp
mv /var/lib/docker/tmp/${{ env.IMAGE_KEY }}.brotli.tmp /var/lib/docker/tmp/${{ env.IMAGE_KEY }}.brotli
fi'
sudo cat /var/lib/docker/tmp/${{ env.IMAGE_KEY }}.brotli
" | brotli --decompress | docker load
- name: Test
Expand Down
41 changes: 17 additions & 24 deletions .github/spot-runner-action/dist/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -337,8 +337,9 @@ class Ec2Instance {
const fleet = yield client.createFleet(createFleetRequest).promise();
if (fleet.Errors && fleet.Errors.length > 0) {
for (const error of fleet.Errors) {
if (error.ErrorCode === "RequestLimitExceeded") {
return "RequestLimitExceeded";
if (error.ErrorCode === "RequestLimitExceeded" ||
error.ErrorCode === "InsufficientInstanceCapacity") {
return error.ErrorCode;
}
}
core.error(JSON.stringify(fleet.Errors, null, 2));
Expand Down Expand Up @@ -732,29 +733,21 @@ function requestAndWaitForSpot(config) {
// 6 * 10000ms = 1 minute per strategy, unless we hit RequestLimitExceeded, then we do exponential backoff
// TODO make longer lived spot request?
for (let i = 0; i < 6; i++) {
try {
// Start instance
instanceId =
yield ec2Client.requestMachine(
// we fallback to on-demand
ec2Strategy.toLocaleLowerCase() === "none");
// let's exit, only loop on InsufficientInstanceCapacity
if (instanceId !== "RequestLimitExceeded") {
break;
}
// Start instance
const instanceIdOrError = yield ec2Client.requestMachine(
// we fallback to on-demand
ec2Strategy.toLocaleLowerCase() === "none");
// let's exit, only loop on InsufficientInstanceCapacity
if (instanceIdOrError === "RequestLimitExceeded" ||
instanceIdOrError === "InsufficientInstanceCapacity") {
core.info("Failed to create instance due to " +
instanceIdOrError +
" , waiting 10 seconds and trying again.");
backoff += 1;
}
catch (error) {
// TODO is this still the relevant error?
if ((error === null || error === void 0 ? void 0 : error.code) &&
error.code === "InsufficientInstanceCapacity" &&
ec2SpotStrategies.length > 0 &&
ec2Strategy.toLocaleLowerCase() != "none") {
core.info("Failed to create instance due to 'InsufficientInstanceCapacity', waiting 10 seconds and trying again.");
// we loop after 10 seconds
}
else {
throw error;
}
else {
instanceId = instanceIdOrError;
break;
}
// wait 10 seconds
yield new Promise((r) => setTimeout(r, 10000 * Math.pow(2, backoff)));
Expand Down
7 changes: 5 additions & 2 deletions .github/spot-runner-action/src/ec2.ts
Original file line number Diff line number Diff line change
Expand Up @@ -256,8 +256,11 @@ export class Ec2Instance {
const fleet = await client.createFleet(createFleetRequest).promise();
if (fleet.Errors && fleet.Errors.length > 0) {
for (const error of fleet.Errors) {
if (error.ErrorCode === "RequestLimitExceeded") {
return "RequestLimitExceeded";
if (
error.ErrorCode === "RequestLimitExceeded" ||
error.ErrorCode === "InsufficientInstanceCapacity"
) {
return error.ErrorCode;
}
}
core.error(JSON.stringify(fleet.Errors, null, 2));
Expand Down
50 changes: 21 additions & 29 deletions .github/spot-runner-action/src/main.ts
Original file line number Diff line number Diff line change
Expand Up @@ -61,37 +61,29 @@ async function requestAndWaitForSpot(config: ActionConfig): Promise<string> {

let instanceId = "";
for (const ec2Strategy of ec2SpotStrategies) {
let backoff = 1;
let backoff = 0;
core.info(`Starting instance with ${ec2Strategy} strategy`);
// up to 6 attempts per strategy; after RequestLimitExceeded or InsufficientInstanceCapacity we wait 10000ms * 2^backoff (exponential backoff) before retrying
// TODO make longer lived spot request?
for (let i = 0; i < 6; i++) {
try {
// Start instance
instanceId =
await ec2Client.requestMachine(
// we fallback to on-demand
ec2Strategy.toLocaleLowerCase() === "none"
);
// let's exit, only loop on InsufficientInstanceCapacity
if (instanceId !== "RequestLimitExceeded") {
break;
}
} catch (error) {
// TODO is this still the relevant error?
if (
error?.code &&
error.code === "InsufficientInstanceCapacity" &&
ec2SpotStrategies.length > 0 &&
ec2Strategy.toLocaleLowerCase() != "none"
) {
core.info(
"Failed to create instance due to 'InsufficientInstanceCapacity', waiting 10 seconds and trying again."
);
// we loop after 10 seconds
} else {
throw error;
}
// Start instance
const instanceIdOrError =
await ec2Client.requestMachine(
// we fallback to on-demand
ec2Strategy.toLocaleLowerCase() === "none"
);
// retry only on RequestLimitExceeded or InsufficientInstanceCapacity; any other result ends the loop
if (
instanceIdOrError === "RequestLimitExceeded" ||
instanceIdOrError === "InsufficientInstanceCapacity"
) {
backoff += 1;
core.info(
"Failed to create instance due to " +
instanceIdOrError +
" , waiting " + 10000 * 2 ** backoff + " seconds and trying again."
);
} else {
instanceId = instanceIdOrError;
break;
}
// wait 10 seconds * 2^backoff (exponential backoff) before the next attempt
await new Promise((r) => setTimeout(r, 10000 * 2 ** backoff));
Expand Down
45 changes: 23 additions & 22 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ env:
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
BUILD_INSTANCE_SSH_KEY: ${{ secrets.BUILD_INSTANCE_SSH_KEY }}
GIT_COMMIT: ${{ github.event.pull_request.head.sha || github.sha }}
# kludge until we move away from runners
WAIT_FOR_RUNNERS: false

Expand All @@ -42,7 +43,7 @@ jobs:
bench_list: ${{ steps.bench_list.outputs.list }}
steps:
- uses: actions/checkout@v4
with: { ref: "${{ github.event.pull_request.head.sha }}" }
with: { ref: "${{ env.GIT_COMMIT }}" }
- uses: ./.github/ci-setup-action
with:
concurrency_key: build-${{ inputs.username || github.actor }}-x86
Expand All @@ -69,7 +70,7 @@ jobs:
test: ${{ fromJson( needs.build.outputs.e2e_list )}}
steps:
- uses: actions/checkout@v4
with: { ref: "${{ github.event.pull_request.head.sha }}" }
with: { ref: "${{ env.GIT_COMMIT }}" }
- uses: ./.github/ci-setup-action
- name: Setup and Test
timeout-minutes: 40
Expand All @@ -79,7 +80,7 @@ jobs:
builder_type: builder-x86
# these are copied to the tester and expected by the earthly command below
# if they fail to copy, it will try to build them on the tester and fail
builder_images_to_copy: aztecprotocol/aztec:${{ github.event.pull_request.head.sha }} aztecprotocol/end-to-end:${{ github.event.pull_request.head.sha }}
builder_images_to_copy: aztecprotocol/aztec:${{ env.GIT_COMMIT }} aztecprotocol/end-to-end:${{ env.GIT_COMMIT }}
# command to produce the images in case they don't exist
builder_command: scripts/earthly-ci ./yarn-project+export-e2e-test-images
run: |
Expand All @@ -98,7 +99,7 @@ jobs:
test: ${{ fromJson( needs.build.outputs.bench_list )}}
steps:
- uses: actions/checkout@v4
with: { ref: "${{ github.event.pull_request.head.sha }}" }
with: { ref: "${{ env.GIT_COMMIT }}" }
- uses: ./.github/ci-setup-action
- name: Setup and Test
uses: ./.github/ensure-tester-with-images
Expand All @@ -108,7 +109,7 @@ jobs:
builder_type: builder-x86
# these are copied to the tester and expected by the earthly command below
# if they fail to copy, it will try to build them on the tester and fail
builder_images_to_copy: aztecprotocol/aztec:${{ github.event.pull_request.head.sha }} aztecprotocol/end-to-end:${{ github.event.pull_request.head.sha }}
builder_images_to_copy: aztecprotocol/aztec:${{ env.GIT_COMMIT }} aztecprotocol/end-to-end:${{ env.GIT_COMMIT }}
# command to produce the images in case they don't exist
builder_command: cd yarn-project/end-to-end/ && ../../scripts/earthly-ci +${{ matrix.test }}
run: |
Expand All @@ -128,7 +129,7 @@ jobs:
# # - uses: actions/checkout@v4
# # with:
# # fetch-depth: 100 # Downloading base benchmark from master requires access to history
# # ref: "${{ github.event.pull_request.head.sha }}"
# # ref: "${{ env.GIT_COMMIT }}"
# # - uses: ./.github/ci-setup-action
# # with:
# # concurrency_key: build-${{ inputs.username || github.actor }}-x86
Expand All @@ -153,7 +154,7 @@ jobs:
runs-on: ${{ inputs.username || github.actor }}-x86
steps:
- uses: actions/checkout@v4
with: { ref: "${{ github.event.pull_request.head.sha }}" }
with: { ref: "${{ env.GIT_COMMIT }}" }
- uses: ./.github/ci-setup-action
with:
concurrency_key: noir-format-${{ inputs.username || github.actor }}-x86
Expand All @@ -173,7 +174,7 @@ jobs:
runs-on: ${{ inputs.username || github.actor }}-x86
steps:
- uses: actions/checkout@v4
with: { ref: "${{ github.event.pull_request.head.sha }}" }
with: { ref: "${{ env.GIT_COMMIT }}" }
# Only allow one memory-hungry prover test to use this runner
- uses: ./.github/ci-setup-action
with:
Expand All @@ -190,7 +191,7 @@ jobs:
runs-on: ${{ github.actor }}-x86
steps:
- uses: actions/checkout@v4
with: { ref: "${{ github.event.pull_request.head.sha }}" }
with: { ref: "${{ env.GIT_COMMIT }}" }
- uses: ./.github/ci-setup-action
with:
concurrency_key: bb-js-test-${{ github.actor }}-x86
Expand All @@ -204,7 +205,7 @@ jobs:
runs-on: ${{ inputs.username || github.actor }}-x86
steps:
- uses: actions/checkout@v4
with: { ref: "${{ github.event.pull_request.head.sha }}" }
with: { ref: "${{ env.GIT_COMMIT }}" }
- uses: ./.github/ci-setup-action
with:
concurrency_key: noir-${{ inputs.username || github.actor }}-x86
Expand All @@ -216,7 +217,7 @@ jobs:
runs-on: ${{ inputs.username || github.actor }}-x86
steps:
- uses: actions/checkout@v4
with: { ref: "${{ github.event.pull_request.head.sha }}" }
with: { ref: "${{ env.GIT_COMMIT }}" }
- uses: ./.github/ci-setup-action
with:
concurrency_key: noir-packages-${{ inputs.username || github.actor }}-x86
Expand All @@ -228,7 +229,7 @@ jobs:
runs-on: ${{ inputs.username || github.actor }}-x86
steps:
- uses: actions/checkout@v4
with: { ref: "${{ github.event.pull_request.head.sha }}" }
with: { ref: "${{ env.GIT_COMMIT }}" }
- uses: ./.github/ci-setup-action
with:
concurrency_key: noir-projects-${{ inputs.username || github.actor }}-x86
Expand All @@ -241,7 +242,7 @@ jobs:
runs-on: ${{ github.actor }}-x86
steps:
- uses: actions/checkout@v4
with: { ref: "${{ github.event.pull_request.head.sha }}" }
with: { ref: "${{ env.GIT_COMMIT }}" }
# Only allow one memory-hungry prover test to use this runner
- uses: ./.github/ci-setup-action
with:
Expand All @@ -256,7 +257,7 @@ jobs:
runs-on: ${{ github.actor }}-x86
steps:
- uses: actions/checkout@v4
with: { ref: "${{ github.event.pull_request.head.sha }}" }
with: { ref: "${{ env.GIT_COMMIT }}" }
# Only allow one memory-hungry prover test to use this runner
- uses: ./.github/ci-setup-action
with:
Expand All @@ -270,7 +271,7 @@ jobs:
runs-on: ${{ github.actor }}-x86
steps:
- uses: actions/checkout@v4
with: { ref: "${{ github.event.pull_request.head.sha }}" }
with: { ref: "${{ env.GIT_COMMIT }}" }
- uses: ./.github/ci-setup-action
with:
concurrency_key: prover-client-test-${{ github.actor }}-x86
Expand All @@ -283,7 +284,7 @@ jobs:
runs-on: ${{ github.actor }}-x86
steps:
- uses: actions/checkout@v4
with: { ref: "${{ github.event.pull_request.head.sha }}" }
with: { ref: "${{ env.GIT_COMMIT }}" }
- uses: ./.github/ci-setup-action
with:
concurrency_key: build-acir-tests-${{ github.actor }}-x86
Expand All @@ -296,7 +297,7 @@ jobs:
runs-on: ${{ github.actor }}-x86
steps:
- uses: actions/checkout@v4
with: { ref: "${{ github.event.pull_request.head.sha }}" }
with: { ref: "${{ env.GIT_COMMIT }}" }
- uses: ./.github/ci-setup-action
with:
concurrency_key: barretenberg-acir-tests-bb-${{ github.actor }}-x86
Expand All @@ -310,7 +311,7 @@ jobs:
runs-on: ${{ github.actor }}-x86
steps:
- uses: actions/checkout@v4
with: { ref: "${{ github.event.pull_request.head.sha }}" }
with: { ref: "${{ env.GIT_COMMIT }}" }
- uses: ./.github/ci-setup-action
with:
concurrency_key: barretenberg-acir-tests-sol-${{ github.actor }}-x86
Expand All @@ -324,7 +325,7 @@ jobs:
runs-on: ${{ github.actor }}-x86
steps:
- uses: actions/checkout@v4
with: { ref: "${{ github.event.pull_request.head.sha }}" }
with: { ref: "${{ env.GIT_COMMIT }}" }
- uses: ./.github/ci-setup-action
with:
concurrency_key: barretenberg-acir-tests-bb-js-${{ github.actor }}-x86
Expand All @@ -339,7 +340,7 @@ jobs:
if: github.event.number
steps:
- uses: actions/checkout@v4
with: { ref: "${{ github.event.pull_request.head.sha }}" }
with: { ref: "${{ env.GIT_COMMIT }}" }
- uses: ./.github/ci-setup-action
with:
concurrency_key: docs-preview-${{ inputs.username || github.actor }}-x86
Expand All @@ -352,7 +353,7 @@ jobs:
needs: setup
steps:
- uses: actions/checkout@v4
with: { ref: "${{ github.event.pull_request.head.sha }}" }
with: { ref: "${{ env.GIT_COMMIT }}" }
- uses: ./.github/ci-setup-action
- name: Build Bench Binaries
uses: ./.github/ensure-builder
Expand All @@ -378,7 +379,7 @@ jobs:
pull-requests: write
steps:
- uses: actions/checkout@v4
with: { ref: "${{ github.event.pull_request.head.sha }}" }
with: { ref: "${{ env.GIT_COMMIT }}" }
# Only allow one memory-hungry prover test to use this runner
- uses: ./.github/ci-setup-action
with:
Expand Down
2 changes: 2 additions & 0 deletions scripts/ci/attach_ebs_cache.sh
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,8 @@ fi
mkdir -p /var/lib/docker
mount $BLKDEVICE /var/lib/docker
service docker restart
# clear our images temp folder
rm -rf /var/lib/docker/tmp
# important: everything (except earthly ls) should go through earthly-ci
scripts/earthly-ci bootstrap
touch /home/ubuntu/.setup-complete

0 comments on commit fee7649

Please sign in to comment.