From fee764922b1dee6062f9dd7f9776dee6186740b7 Mon Sep 17 00:00:00 2001 From: ludamad Date: Mon, 13 May 2024 16:47:02 -0400 Subject: [PATCH] chore(ci): fix master, better spot copy times (#6374) - Copying to same EBS disk (/var/lib/docker) takes 20 seconds instead of 1:20 - Master had bad git hash in some situations --- .github/ensure-tester-with-images/action.yml | 10 +++- .github/spot-runner-action/dist/index.js | 41 +++++++--------- .github/spot-runner-action/src/ec2.ts | 7 ++- .github/spot-runner-action/src/main.ts | 50 ++++++++------------ .github/workflows/ci.yml | 45 +++++++++--------- scripts/ci/attach_ebs_cache.sh | 2 + 6 files changed, 76 insertions(+), 79 deletions(-) diff --git a/.github/ensure-tester-with-images/action.yml b/.github/ensure-tester-with-images/action.yml index 23c773caf1e..010acf9119b 100644 --- a/.github/ensure-tester-with-images/action.yml +++ b/.github/ensure-tester-with-images/action.yml @@ -60,8 +60,14 @@ runs: export BUILDER_SPOT_IP=${{ env.BUILDER_SPOT_IP }} export BUILDER_SPOT_KEY=~/.ssh/build_instance_key scripts/run_on_builder " - flock ${{ env.IMAGE_KEY }}.lock bash -c '! [ -f ${{ env.IMAGE_KEY }}.brotli ] && docker save ${{ inputs.builder_images_to_copy }} | brotli -2 > ${{ env.IMAGE_KEY }}.brotli' - cat ${{ env.IMAGE_KEY }}.brotli + sudo mkdir -p /var/lib/docker/tmp + + sudo flock /var/lib/docker/tmp/${{ env.IMAGE_KEY }}.lock bash -c ' + if ! [ -f /var/lib/docker/tmp/${{ env.IMAGE_KEY }}.brotli ] ; then + docker save aztecprotocol/aztec:${{ env.IMAGE_KEY }} aztecprotocol/end-to-end:${{ env.IMAGE_KEY }} | brotli -2 > /var/lib/docker/tmp/${{ env.IMAGE_KEY }}.brotli.tmp + mv /var/lib/docker/tmp/${{ env.IMAGE_KEY }}.brotli.tmp /var/lib/docker/tmp/${{ env.IMAGE_KEY }}.brotli + fi' + sudo cat /var/lib/docker/tmp/${{ env.IMAGE_KEY }}.brotli " | brotli --decompress | docker load - name: Test diff --git a/.github/spot-runner-action/dist/index.js b/.github/spot-runner-action/dist/index.js index 3404bc2f382..666658bd34c 100644 --- a/.github/spot-runner-action/dist/index.js +++ b/.github/spot-runner-action/dist/index.js @@ -337,8 +337,9 @@ class Ec2Instance { const fleet = yield client.createFleet(createFleetRequest).promise(); if (fleet.Errors && fleet.Errors.length > 0) { for (const error of fleet.Errors) { - if (error.ErrorCode === "RequestLimitExceeded") { - return "RequestLimitExceeded"; + if (error.ErrorCode === "RequestLimitExceeded" || + error.ErrorCode === "InsufficientInstanceCapacity") { + return error.ErrorCode; } } core.error(JSON.stringify(fleet.Errors, null, 2)); @@ -732,29 +733,21 @@ function requestAndWaitForSpot(config) { // 6 * 10000ms = 1 minute per strategy, unless we hit RequestLimitExceeded, then we do exponential backoff // TODO make longer lived spot request? for (let i = 0; i < 6; i++) { - try { - // Start instance - instanceId = - yield ec2Client.requestMachine( - // we fallback to on-demand - ec2Strategy.toLocaleLowerCase() === "none"); - // let's exit, only loop on InsufficientInstanceCapacity - if (instanceId !== "RequestLimitExceeded") { - break; - } + // Start instance + const instanceIdOrError = yield ec2Client.requestMachine( + // we fallback to on-demand + ec2Strategy.toLocaleLowerCase() === "none"); + // let's exit, only loop on InsufficientInstanceCapacity + if (instanceIdOrError === "RequestLimitExceeded" || + instanceIdOrError === "InsufficientInstanceCapacity") { + core.info("Failed to create instance due to " + + instanceIdOrError + + " , waiting 10 seconds and trying again."); + backoff += 1; } - catch (error) { - // TODO is this still the relevant error? - if ((error === null || error === void 0 ? void 0 : error.code) && - error.code === "InsufficientInstanceCapacity" && - ec2SpotStrategies.length > 0 && - ec2Strategy.toLocaleLowerCase() != "none") { - core.info("Failed to create instance due to 'InsufficientInstanceCapacity', waiting 10 seconds and trying again."); - // we loop after 10 seconds - } - else { - throw error; - } + else { + instanceId = instanceIdOrError; + break; } // wait 10 seconds yield new Promise((r) => setTimeout(r, 10000 * Math.pow(2, backoff))); diff --git a/.github/spot-runner-action/src/ec2.ts b/.github/spot-runner-action/src/ec2.ts index a00ca587b6c..41c91bcdd4d 100644 --- a/.github/spot-runner-action/src/ec2.ts +++ b/.github/spot-runner-action/src/ec2.ts @@ -256,8 +256,11 @@ export class Ec2Instance { const fleet = await client.createFleet(createFleetRequest).promise(); if (fleet.Errors && fleet.Errors.length > 0) { for (const error of fleet.Errors) { - if (error.ErrorCode === "RequestLimitExceeded") { - return "RequestLimitExceeded"; + if ( + error.ErrorCode === "RequestLimitExceeded" || + error.ErrorCode === "InsufficientInstanceCapacity" + ) { + return error.ErrorCode; } } core.error(JSON.stringify(fleet.Errors, null, 2)); diff --git a/.github/spot-runner-action/src/main.ts b/.github/spot-runner-action/src/main.ts index b5db5bb376c..01397bcfd50 100644 --- a/.github/spot-runner-action/src/main.ts +++ b/.github/spot-runner-action/src/main.ts @@ -61,37 +61,29 @@ async function requestAndWaitForSpot(config: ActionConfig): Promise { let instanceId = ""; for (const ec2Strategy of ec2SpotStrategies) { - let backoff = 1; + let backoff = 0; core.info(`Starting instance with ${ec2Strategy} strategy`); - // 6 * 10000ms = 1 minute per strategy, unless we hit RequestLimitExceeded, then we do exponential backoff - // TODO make longer lived spot request? for (let i = 0; i < 6; i++) { - try { - // Start instance - instanceId = - await ec2Client.requestMachine( - // we fallback to on-demand - ec2Strategy.toLocaleLowerCase() === "none" - ); - // let's exit, only loop on InsufficientInstanceCapacity - if (instanceId !== "RequestLimitExceeded") { - break; - } - } catch (error) { - // TODO is this still the relevant error? - if ( - error?.code && - error.code === "InsufficientInstanceCapacity" && - ec2SpotStrategies.length > 0 && - ec2Strategy.toLocaleLowerCase() != "none" - ) { - core.info( - "Failed to create instance due to 'InsufficientInstanceCapacity', waiting 10 seconds and trying again." - ); - // we loop after 10 seconds - } else { - throw error; - } + // Start instance + const instanceIdOrError = + await ec2Client.requestMachine( + // we fallback to on-demand + ec2Strategy.toLocaleLowerCase() === "none" + ); + // let's exit, only loop on InsufficientInstanceCapacity + if ( + instanceIdOrError === "RequestLimitExceeded" || + instanceIdOrError === "InsufficientInstanceCapacity" + ) { + backoff += 1; + core.info( + "Failed to create instance due to " + + instanceIdOrError + + " , waiting " + 10000 * 2 ** backoff + " seconds and trying again." + ); + } else { + instanceId = instanceIdOrError; + break; } // wait 10 seconds await new Promise((r) => setTimeout(r, 10000 * 2 ** backoff)); diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 57c0e430110..70315ae2792 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -24,6 +24,7 @@ env: AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} BUILD_INSTANCE_SSH_KEY: ${{ secrets.BUILD_INSTANCE_SSH_KEY }} + GIT_COMMIT: ${{ github.event.pull_request.head.sha || github.sha }} # kludge until we move away from runners WAIT_FOR_RUNNERS: false @@ -42,7 +43,7 @@ jobs: bench_list: ${{ steps.bench_list.outputs.list }} steps: - uses: actions/checkout@v4 - with: { ref: "${{ github.event.pull_request.head.sha }}" } + with: { ref: "${{ env.GIT_COMMIT }}" } - uses: ./.github/ci-setup-action with: concurrency_key: build-${{ inputs.username || github.actor }}-x86 @@ -69,7 +70,7 @@ jobs: test: ${{ fromJson( needs.build.outputs.e2e_list )}} steps: - uses: actions/checkout@v4 - with: { ref: "${{ github.event.pull_request.head.sha }}" } + with: { ref: "${{ env.GIT_COMMIT }}" } - uses: ./.github/ci-setup-action - name: Setup and Test timeout-minutes: 40 @@ -79,7 +80,7 @@ jobs: builder_type: builder-x86 # these are copied to the tester and expected by the earthly command below # if they fail to copy, it will try to build them on the tester and fail - builder_images_to_copy: aztecprotocol/aztec:${{ github.event.pull_request.head.sha }} aztecprotocol/end-to-end:${{ github.event.pull_request.head.sha }} + builder_images_to_copy: aztecprotocol/aztec:${{ env.GIT_COMMIT }} aztecprotocol/end-to-end:${{ env.GIT_COMMIT }} # command to produce the images in case they don't exist builder_command: scripts/earthly-ci ./yarn-project+export-e2e-test-images run: | @@ -98,7 +99,7 @@ jobs: test: ${{ fromJson( needs.build.outputs.bench_list )}} steps: - uses: actions/checkout@v4 - with: { ref: "${{ github.event.pull_request.head.sha }}" } + with: { ref: "${{ env.GIT_COMMIT }}" } - uses: ./.github/ci-setup-action - name: Setup and Test uses: ./.github/ensure-tester-with-images @@ -108,7 +109,7 @@ jobs: builder_type: builder-x86 # these are copied to the tester and expected by the earthly command below # if they fail to copy, it will try to build them on the tester and fail - builder_images_to_copy: aztecprotocol/aztec:${{ github.event.pull_request.head.sha }} aztecprotocol/end-to-end:${{ github.event.pull_request.head.sha }} + builder_images_to_copy: aztecprotocol/aztec:${{ env.GIT_COMMIT }} aztecprotocol/end-to-end:${{ env.GIT_COMMIT }} # command to produce the images in case they don't exist builder_command: cd yarn-project/end-to-end/ && ../../scripts/earthly-ci +${{ matrix.test }} run: | @@ -128,7 +129,7 @@ jobs: # # - uses: actions/checkout@v4 # # with: # # fetch-depth: 100 # Downloading base benchmark from master requires access to history - # # ref: "${{ github.event.pull_request.head.sha }}" + # # ref: "${{ env.GIT_COMMIT }}" # # - uses: ./.github/ci-setup-action # # with: # # concurrency_key: build-${{ inputs.username || github.actor }}-x86 @@ -153,7 +154,7 @@ jobs: runs-on: ${{ inputs.username || github.actor }}-x86 steps: - uses: actions/checkout@v4 - with: { ref: "${{ github.event.pull_request.head.sha }}" } + with: { ref: "${{ env.GIT_COMMIT }}" } - uses: ./.github/ci-setup-action with: concurrency_key: noir-format-${{ inputs.username || github.actor }}-x86 @@ -173,7 +174,7 @@ jobs: runs-on: ${{ inputs.username || github.actor }}-x86 steps: - uses: actions/checkout@v4 - with: { ref: "${{ github.event.pull_request.head.sha }}" } + with: { ref: "${{ env.GIT_COMMIT }}" } # Only allow one memory-hunger prover test to use this runner - uses: ./.github/ci-setup-action with: @@ -190,7 +191,7 @@ jobs: runs-on: ${{ github.actor }}-x86 steps: - uses: actions/checkout@v4 - with: { ref: "${{ github.event.pull_request.head.sha }}" } + with: { ref: "${{ env.GIT_COMMIT }}" } - uses: ./.github/ci-setup-action with: concurrency_key: bb-js-test-${{ github.actor }}-x86 @@ -204,7 +205,7 @@ jobs: runs-on: ${{ inputs.username || github.actor }}-x86 steps: - uses: actions/checkout@v4 - with: { ref: "${{ github.event.pull_request.head.sha }}" } + with: { ref: "${{ env.GIT_COMMIT }}" } - uses: ./.github/ci-setup-action with: concurrency_key: noir-${{ inputs.username || github.actor }}-x86 @@ -216,7 +217,7 @@ jobs: runs-on: ${{ inputs.username || github.actor }}-x86 steps: - uses: actions/checkout@v4 - with: { ref: "${{ github.event.pull_request.head.sha }}" } + with: { ref: "${{ env.GIT_COMMIT }}" } - uses: ./.github/ci-setup-action with: concurrency_key: noir-packages-${{ inputs.username || github.actor }}-x86 @@ -228,7 +229,7 @@ jobs: runs-on: ${{ inputs.username || github.actor }}-x86 steps: - uses: actions/checkout@v4 - with: { ref: "${{ github.event.pull_request.head.sha }}" } + with: { ref: "${{ env.GIT_COMMIT }}" } - uses: ./.github/ci-setup-action with: concurrency_key: noir-projects-${{ inputs.username || github.actor }}-x86 @@ -241,7 +242,7 @@ jobs: runs-on: ${{ github.actor }}-x86 steps: - uses: actions/checkout@v4 - with: { ref: "${{ github.event.pull_request.head.sha }}" } + with: { ref: "${{ env.GIT_COMMIT }}" } # Only allow one memory-hunger prover test to use this runner - uses: ./.github/ci-setup-action with: @@ -256,7 +257,7 @@ jobs: runs-on: ${{ github.actor }}-x86 steps: - uses: actions/checkout@v4 - with: { ref: "${{ github.event.pull_request.head.sha }}" } + with: { ref: "${{ env.GIT_COMMIT }}" } # Only allow one memory-hunger prover test to use this runner - uses: ./.github/ci-setup-action with: @@ -270,7 +271,7 @@ jobs: runs-on: ${{ github.actor }}-x86 steps: - uses: actions/checkout@v4 - with: { ref: "${{ github.event.pull_request.head.sha }}" } + with: { ref: "${{ env.GIT_COMMIT }}" } - uses: ./.github/ci-setup-action with: concurrency_key: prover-client-test-${{ github.actor }}-x86 @@ -283,7 +284,7 @@ jobs: runs-on: ${{ github.actor }}-x86 steps: - uses: actions/checkout@v4 - with: { ref: "${{ github.event.pull_request.head.sha }}" } + with: { ref: "${{ env.GIT_COMMIT }}" } - uses: ./.github/ci-setup-action with: concurrency_key: build-acir-tests-${{ github.actor }}-x86 @@ -296,7 +297,7 @@ jobs: runs-on: ${{ github.actor }}-x86 steps: - uses: actions/checkout@v4 - with: { ref: "${{ github.event.pull_request.head.sha }}" } + with: { ref: "${{ env.GIT_COMMIT }}" } - uses: ./.github/ci-setup-action with: concurrency_key: barretenberg-acir-tests-bb-${{ github.actor }}-x86 @@ -310,7 +311,7 @@ jobs: runs-on: ${{ github.actor }}-x86 steps: - uses: actions/checkout@v4 - with: { ref: "${{ github.event.pull_request.head.sha }}" } + with: { ref: "${{ env.GIT_COMMIT }}" } - uses: ./.github/ci-setup-action with: concurrency_key: barretenberg-acir-tests-sol-${{ github.actor }}-x86 @@ -324,7 +325,7 @@ jobs: runs-on: ${{ github.actor }}-x86 steps: - uses: actions/checkout@v4 - with: { ref: "${{ github.event.pull_request.head.sha }}" } + with: { ref: "${{ env.GIT_COMMIT }}" } - uses: ./.github/ci-setup-action with: concurrency_key: barretenberg-acir-tests-bb-js-${{ github.actor }}-x86 @@ -339,7 +340,7 @@ jobs: if: github.event.number steps: - uses: actions/checkout@v4 - with: { ref: "${{ github.event.pull_request.head.sha }}" } + with: { ref: "${{ env.GIT_COMMIT }}" } - uses: ./.github/ci-setup-action with: concurrency_key: docs-preview-${{ inputs.username || github.actor }}-x86 @@ -352,7 +353,7 @@ jobs: needs: setup steps: - uses: actions/checkout@v4 - with: { ref: "${{ github.event.pull_request.head.sha }}" } + with: { ref: "${{ env.GIT_COMMIT }}" } - uses: ./.github/ci-setup-action - name: Build Bench Binaries uses: ./.github/ensure-builder @@ -378,7 +379,7 @@ jobs: pull-requests: write steps: - uses: actions/checkout@v4 - with: { ref: "${{ github.event.pull_request.head.sha }}" } + with: { ref: "${{ env.GIT_COMMIT }}" } # Only allow one memory-hunger prover test to use this runner - uses: ./.github/ci-setup-action with: diff --git a/scripts/ci/attach_ebs_cache.sh b/scripts/ci/attach_ebs_cache.sh index b97e5920281..640341814d9 100755 --- a/scripts/ci/attach_ebs_cache.sh +++ b/scripts/ci/attach_ebs_cache.sh @@ -153,6 +153,8 @@ fi mkdir -p /var/lib/docker mount $BLKDEVICE /var/lib/docker service docker restart +# clear our images temp folder +rm -rf /var/lib/docker/tmp # important: everything (except earthly ls) should go through earthly-ci scripts/earthly-ci bootstrap touch /home/ubuntu/.setup-complete \ No newline at end of file