From fee764922b1dee6062f9dd7f9776dee6186740b7 Mon Sep 17 00:00:00 2001
From: ludamad <adam.domurad@gmail.com>
Date: Mon, 13 May 2024 16:47:02 -0400
Subject: [PATCH] chore(ci): fix master, better spot copy times (#6374)

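- Staging the saved image archive on the same EBS disk as Docker's data
(/var/lib/docker/tmp) makes the copy take 20 seconds instead of 1:20
- Master had a bad git hash in some situations: the workflows referenced
github.event.pull_request.head.sha directly; they now use GIT_COMMIT,
which falls back to github.sha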
---
 .github/ensure-tester-with-images/action.yml | 10 +++-
 .github/spot-runner-action/dist/index.js     | 41 +++++++---------
 .github/spot-runner-action/src/ec2.ts        |  7 ++-
 .github/spot-runner-action/src/main.ts       | 50 ++++++++------------
 .github/workflows/ci.yml                     | 45 +++++++++---------
 scripts/ci/attach_ebs_cache.sh               |  2 +
 6 files changed, 76 insertions(+), 79 deletions(-)

diff --git a/.github/ensure-tester-with-images/action.yml b/.github/ensure-tester-with-images/action.yml
index 23c773caf1e..010acf9119b 100644
--- a/.github/ensure-tester-with-images/action.yml
+++ b/.github/ensure-tester-with-images/action.yml
@@ -60,8 +60,14 @@ runs:
           export BUILDER_SPOT_IP=${{ env.BUILDER_SPOT_IP }}
           export BUILDER_SPOT_KEY=~/.ssh/build_instance_key
           scripts/run_on_builder "
-            flock ${{ env.IMAGE_KEY }}.lock bash -c '! [ -f ${{ env.IMAGE_KEY }}.brotli ] && docker save ${{ inputs.builder_images_to_copy }} | brotli -2 > ${{ env.IMAGE_KEY }}.brotli'
-            cat ${{ env.IMAGE_KEY }}.brotli
+            sudo mkdir -p /var/lib/docker/tmp
+
+            sudo flock /var/lib/docker/tmp/${{ env.IMAGE_KEY }}.lock bash -c '
+              if ! [ -f /var/lib/docker/tmp/${{ env.IMAGE_KEY }}.brotli ] ; then
+                docker save aztecprotocol/aztec:${{ env.IMAGE_KEY }} aztecprotocol/end-to-end:${{ env.IMAGE_KEY }} | brotli -2 > /var/lib/docker/tmp/${{ env.IMAGE_KEY }}.brotli.tmp
+                mv /var/lib/docker/tmp/${{ env.IMAGE_KEY }}.brotli.tmp /var/lib/docker/tmp/${{ env.IMAGE_KEY }}.brotli
+              fi'
+            sudo cat /var/lib/docker/tmp/${{ env.IMAGE_KEY }}.brotli
           " | brotli --decompress | docker load
 
     - name: Test
diff --git a/.github/spot-runner-action/dist/index.js b/.github/spot-runner-action/dist/index.js
index 3404bc2f382..666658bd34c 100644
--- a/.github/spot-runner-action/dist/index.js
+++ b/.github/spot-runner-action/dist/index.js
@@ -337,8 +337,9 @@ class Ec2Instance {
             const fleet = yield client.createFleet(createFleetRequest).promise();
             if (fleet.Errors && fleet.Errors.length > 0) {
                 for (const error of fleet.Errors) {
-                    if (error.ErrorCode === "RequestLimitExceeded") {
-                        return "RequestLimitExceeded";
+                    if (error.ErrorCode === "RequestLimitExceeded" ||
+                        error.ErrorCode === "InsufficientInstanceCapacity") {
+                        return error.ErrorCode;
                     }
                 }
                 core.error(JSON.stringify(fleet.Errors, null, 2));
@@ -732,29 +733,21 @@ function requestAndWaitForSpot(config) {
             // 6 * 10000ms = 1 minute per strategy, unless we hit RequestLimitExceeded, then we do exponential backoff
             // TODO make longer lived spot request?
             for (let i = 0; i < 6; i++) {
-                try {
-                    // Start instance
-                    instanceId =
-                        yield ec2Client.requestMachine(
-                        // we fallback to on-demand
-                        ec2Strategy.toLocaleLowerCase() === "none");
-                    // let's exit, only loop on InsufficientInstanceCapacity
-                    if (instanceId !== "RequestLimitExceeded") {
-                        break;
-                    }
+                // Start instance
+                const instanceIdOrError = yield ec2Client.requestMachine(
+                // we fallback to on-demand
+                ec2Strategy.toLocaleLowerCase() === "none");
+                // let's exit, only loop on InsufficientInstanceCapacity
+                if (instanceIdOrError === "RequestLimitExceeded" ||
+                    instanceIdOrError === "InsufficientInstanceCapacity") {
+                    core.info("Failed to create instance due to " +
+                        instanceIdOrError +
+                        " , waiting 10 seconds and trying again.");
+                    backoff += 1;
                 }
-                catch (error) {
-                    // TODO is this still the relevant error?
-                    if ((error === null || error === void 0 ? void 0 : error.code) &&
-                        error.code === "InsufficientInstanceCapacity" &&
-                        ec2SpotStrategies.length > 0 &&
-                        ec2Strategy.toLocaleLowerCase() != "none") {
-                        core.info("Failed to create instance due to 'InsufficientInstanceCapacity', waiting 10 seconds and trying again.");
-                        // we loop after 10 seconds
-                    }
-                    else {
-                        throw error;
-                    }
+                else {
+                    instanceId = instanceIdOrError;
+                    break;
                 }
                 // wait 10 seconds
                 yield new Promise((r) => setTimeout(r, 10000 * Math.pow(2, backoff)));
diff --git a/.github/spot-runner-action/src/ec2.ts b/.github/spot-runner-action/src/ec2.ts
index a00ca587b6c..41c91bcdd4d 100644
--- a/.github/spot-runner-action/src/ec2.ts
+++ b/.github/spot-runner-action/src/ec2.ts
@@ -256,8 +256,11 @@ export class Ec2Instance {
     const fleet = await client.createFleet(createFleetRequest).promise();
     if (fleet.Errors && fleet.Errors.length > 0) {
       for (const error of fleet.Errors) {
-        if (error.ErrorCode === "RequestLimitExceeded") {
-          return "RequestLimitExceeded";
+        if (
+          error.ErrorCode === "RequestLimitExceeded" ||
+          error.ErrorCode === "InsufficientInstanceCapacity"
+        ) {
+          return error.ErrorCode;
         }
       }
       core.error(JSON.stringify(fleet.Errors, null, 2));
diff --git a/.github/spot-runner-action/src/main.ts b/.github/spot-runner-action/src/main.ts
index b5db5bb376c..01397bcfd50 100644
--- a/.github/spot-runner-action/src/main.ts
+++ b/.github/spot-runner-action/src/main.ts
@@ -61,37 +61,29 @@ async function requestAndWaitForSpot(config: ActionConfig): Promise<string> {
 
   let instanceId = "";
   for (const ec2Strategy of ec2SpotStrategies) {
-    let backoff = 1;
+    let backoff = 0;
     core.info(`Starting instance with ${ec2Strategy} strategy`);
-    // 6 * 10000ms = 1 minute per strategy, unless we hit RequestLimitExceeded, then we do exponential backoff
-    // TODO make longer lived spot request?
     for (let i = 0; i < 6; i++) {
-      try {
-        // Start instance
-        instanceId =
-          await ec2Client.requestMachine(
-            // we fallback to on-demand
-            ec2Strategy.toLocaleLowerCase() === "none"
-          );
-        // let's exit, only loop on InsufficientInstanceCapacity
-        if (instanceId !== "RequestLimitExceeded") {
-          break;
-        }
-      } catch (error) {
-        // TODO is this still the relevant error?
-        if (
-          error?.code &&
-          error.code === "InsufficientInstanceCapacity" &&
-          ec2SpotStrategies.length > 0 &&
-          ec2Strategy.toLocaleLowerCase() != "none"
-        ) {
-          core.info(
-            "Failed to create instance due to 'InsufficientInstanceCapacity', waiting 10 seconds and trying again."
-          );
-          // we loop after 10 seconds
-        } else {
-          throw error;
-        }
+      // Start instance
+      const instanceIdOrError =
+        await ec2Client.requestMachine(
+          // we fallback to on-demand
+          ec2Strategy.toLocaleLowerCase() === "none"
+        );
+      // let's exit, only loop on InsufficientInstanceCapacity
+      if (
+        instanceIdOrError === "RequestLimitExceeded" ||
+        instanceIdOrError === "InsufficientInstanceCapacity"
+      ) {
+        backoff += 1;
+        core.info(
+          "Failed to create instance due to " +
+            instanceIdOrError +
+            " , waiting " + 10000 * 2 ** backoff + " seconds and trying again."
+        );
+      } else {
+        instanceId = instanceIdOrError;
+        break;
       }
       // wait 10 seconds
       await new Promise((r) => setTimeout(r, 10000 * 2 ** backoff));
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 57c0e430110..70315ae2792 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -24,6 +24,7 @@ env:
   AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
   AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
   BUILD_INSTANCE_SSH_KEY: ${{ secrets.BUILD_INSTANCE_SSH_KEY }}
+  GIT_COMMIT: ${{ github.event.pull_request.head.sha || github.sha }}
   # kludge until we move away from runners
   WAIT_FOR_RUNNERS: false
 
@@ -42,7 +43,7 @@ jobs:
       bench_list: ${{ steps.bench_list.outputs.list }}
     steps:
       - uses: actions/checkout@v4
-        with: { ref: "${{ github.event.pull_request.head.sha }}" }
+        with: { ref: "${{ env.GIT_COMMIT }}" }
       - uses: ./.github/ci-setup-action
         with:
           concurrency_key: build-${{ inputs.username || github.actor }}-x86
@@ -69,7 +70,7 @@ jobs:
         test: ${{ fromJson( needs.build.outputs.e2e_list )}}
     steps:
       - uses: actions/checkout@v4
-        with: { ref: "${{ github.event.pull_request.head.sha }}" }
+        with: { ref: "${{ env.GIT_COMMIT }}" }
       - uses: ./.github/ci-setup-action
       - name: Setup and Test
         timeout-minutes: 40
@@ -79,7 +80,7 @@ jobs:
           builder_type: builder-x86
           # these are copied to the tester and expected by the earthly command below
           # if they fail to copy, it will try to build them on the tester and fail
-          builder_images_to_copy: aztecprotocol/aztec:${{ github.event.pull_request.head.sha }} aztecprotocol/end-to-end:${{ github.event.pull_request.head.sha }}
+          builder_images_to_copy: aztecprotocol/aztec:${{ env.GIT_COMMIT }} aztecprotocol/end-to-end:${{ env.GIT_COMMIT }}
           # command to produce the images in case they don't exist
           builder_command: scripts/earthly-ci ./yarn-project+export-e2e-test-images
           run: |
@@ -98,7 +99,7 @@ jobs:
         test: ${{ fromJson( needs.build.outputs.bench_list )}}
     steps:
       - uses: actions/checkout@v4
-        with: { ref: "${{ github.event.pull_request.head.sha }}" }
+        with: { ref: "${{ env.GIT_COMMIT }}" }
       - uses: ./.github/ci-setup-action
       - name: Setup and Test
         uses: ./.github/ensure-tester-with-images
@@ -108,7 +109,7 @@ jobs:
           builder_type: builder-x86
           # these are copied to the tester and expected by the earthly command below
           # if they fail to copy, it will try to build them on the tester and fail
-          builder_images_to_copy: aztecprotocol/aztec:${{ github.event.pull_request.head.sha }} aztecprotocol/end-to-end:${{ github.event.pull_request.head.sha }}
+          builder_images_to_copy: aztecprotocol/aztec:${{ env.GIT_COMMIT }} aztecprotocol/end-to-end:${{ env.GIT_COMMIT }}
           # command to produce the images in case they don't exist
           builder_command: cd yarn-project/end-to-end/ && ../../scripts/earthly-ci +${{ matrix.test }}
           run: |
@@ -128,7 +129,7 @@ jobs:
   # #     - uses: actions/checkout@v4
   # #       with:
   # #         fetch-depth: 100 # Downloading base benchmark from master requires access to history
-  # #         ref: "${{ github.event.pull_request.head.sha }}"
+  # #         ref: "${{ env.GIT_COMMIT }}"
   # #     - uses: ./.github/ci-setup-action
   # #       with:
   # #         concurrency_key: build-${{ inputs.username || github.actor }}-x86
@@ -153,7 +154,7 @@ jobs:
     runs-on: ${{ inputs.username || github.actor }}-x86
     steps:
       - uses: actions/checkout@v4
-        with: { ref: "${{ github.event.pull_request.head.sha }}" }
+        with: { ref: "${{ env.GIT_COMMIT }}" }
       - uses: ./.github/ci-setup-action
         with:
           concurrency_key: noir-format-${{ inputs.username || github.actor }}-x86
@@ -173,7 +174,7 @@ jobs:
     runs-on: ${{ inputs.username || github.actor }}-x86
     steps:
       - uses: actions/checkout@v4
-        with: { ref: "${{ github.event.pull_request.head.sha }}" }
+        with: { ref: "${{ env.GIT_COMMIT }}" }
       # Only allow one memory-hunger prover test to use this runner
       - uses: ./.github/ci-setup-action
         with:
@@ -190,7 +191,7 @@ jobs:
     runs-on: ${{ github.actor }}-x86
     steps:
       - uses: actions/checkout@v4
-        with: { ref: "${{ github.event.pull_request.head.sha }}" }
+        with: { ref: "${{ env.GIT_COMMIT }}" }
       - uses: ./.github/ci-setup-action
         with:
           concurrency_key: bb-js-test-${{ github.actor }}-x86
@@ -204,7 +205,7 @@ jobs:
     runs-on: ${{ inputs.username || github.actor }}-x86
     steps:
       - uses: actions/checkout@v4
-        with: { ref: "${{ github.event.pull_request.head.sha }}" }
+        with: { ref: "${{ env.GIT_COMMIT }}" }
       - uses: ./.github/ci-setup-action
         with:
           concurrency_key: noir-${{ inputs.username || github.actor }}-x86
@@ -216,7 +217,7 @@ jobs:
     runs-on: ${{ inputs.username || github.actor }}-x86
     steps:
       - uses: actions/checkout@v4
-        with: { ref: "${{ github.event.pull_request.head.sha }}" }
+        with: { ref: "${{ env.GIT_COMMIT }}" }
       - uses: ./.github/ci-setup-action
         with:
           concurrency_key: noir-packages-${{ inputs.username || github.actor }}-x86
@@ -228,7 +229,7 @@ jobs:
     runs-on: ${{ inputs.username || github.actor }}-x86
     steps:
       - uses: actions/checkout@v4
-        with: { ref: "${{ github.event.pull_request.head.sha }}" }
+        with: { ref: "${{ env.GIT_COMMIT }}" }
       - uses: ./.github/ci-setup-action
         with:
           concurrency_key: noir-projects-${{ inputs.username || github.actor }}-x86
@@ -241,7 +242,7 @@ jobs:
     runs-on: ${{ github.actor }}-x86
     steps:
       - uses: actions/checkout@v4
-        with: { ref: "${{ github.event.pull_request.head.sha }}" }
+        with: { ref: "${{ env.GIT_COMMIT }}" }
       # Only allow one memory-hunger prover test to use this runner
       - uses: ./.github/ci-setup-action
         with:
@@ -256,7 +257,7 @@ jobs:
     runs-on: ${{ github.actor }}-x86
     steps:
       - uses: actions/checkout@v4
-        with: { ref: "${{ github.event.pull_request.head.sha }}" }
+        with: { ref: "${{ env.GIT_COMMIT }}" }
       # Only allow one memory-hunger prover test to use this runner
       - uses: ./.github/ci-setup-action
         with:
@@ -270,7 +271,7 @@ jobs:
     runs-on: ${{ github.actor }}-x86
     steps:
       - uses: actions/checkout@v4
-        with: { ref: "${{ github.event.pull_request.head.sha }}" }
+        with: { ref: "${{ env.GIT_COMMIT }}" }
       - uses: ./.github/ci-setup-action
         with:
           concurrency_key: prover-client-test-${{ github.actor }}-x86
@@ -283,7 +284,7 @@ jobs:
     runs-on: ${{ github.actor }}-x86
     steps:
       - uses: actions/checkout@v4
-        with: { ref: "${{ github.event.pull_request.head.sha }}" }
+        with: { ref: "${{ env.GIT_COMMIT }}" }
       - uses: ./.github/ci-setup-action
         with:
           concurrency_key: build-acir-tests-${{ github.actor }}-x86
@@ -296,7 +297,7 @@ jobs:
     runs-on: ${{ github.actor }}-x86
     steps:
       - uses: actions/checkout@v4
-        with: { ref: "${{ github.event.pull_request.head.sha }}" }
+        with: { ref: "${{ env.GIT_COMMIT }}" }
       - uses: ./.github/ci-setup-action
         with:
           concurrency_key: barretenberg-acir-tests-bb-${{ github.actor }}-x86
@@ -310,7 +311,7 @@ jobs:
     runs-on: ${{ github.actor }}-x86
     steps:
       - uses: actions/checkout@v4
-        with: { ref: "${{ github.event.pull_request.head.sha }}" }
+        with: { ref: "${{ env.GIT_COMMIT }}" }
       - uses: ./.github/ci-setup-action
         with:
           concurrency_key: barretenberg-acir-tests-sol-${{ github.actor }}-x86
@@ -324,7 +325,7 @@ jobs:
     runs-on: ${{ github.actor }}-x86
     steps:
       - uses: actions/checkout@v4
-        with: { ref: "${{ github.event.pull_request.head.sha }}" }
+        with: { ref: "${{ env.GIT_COMMIT }}" }
       - uses: ./.github/ci-setup-action
         with:
           concurrency_key: barretenberg-acir-tests-bb-js-${{ github.actor }}-x86
@@ -339,7 +340,7 @@ jobs:
     if: github.event.number
     steps:
       - uses: actions/checkout@v4
-        with: { ref: "${{ github.event.pull_request.head.sha }}" }
+        with: { ref: "${{ env.GIT_COMMIT }}" }
       - uses: ./.github/ci-setup-action
         with:
           concurrency_key: docs-preview-${{ inputs.username || github.actor }}-x86
@@ -352,7 +353,7 @@ jobs:
     needs: setup
     steps:
       - uses: actions/checkout@v4
-        with: { ref: "${{ github.event.pull_request.head.sha }}" }
+        with: { ref: "${{ env.GIT_COMMIT }}" }
       - uses: ./.github/ci-setup-action
       - name: Build Bench Binaries
         uses: ./.github/ensure-builder
@@ -378,7 +379,7 @@ jobs:
       pull-requests: write
     steps:
       - uses: actions/checkout@v4
-        with: { ref: "${{ github.event.pull_request.head.sha }}" }
+        with: { ref: "${{ env.GIT_COMMIT }}" }
       # Only allow one memory-hunger prover test to use this runner
       - uses: ./.github/ci-setup-action
         with:
diff --git a/scripts/ci/attach_ebs_cache.sh b/scripts/ci/attach_ebs_cache.sh
index b97e5920281..640341814d9 100755
--- a/scripts/ci/attach_ebs_cache.sh
+++ b/scripts/ci/attach_ebs_cache.sh
@@ -153,6 +153,8 @@ fi
 mkdir -p /var/lib/docker
 mount $BLKDEVICE /var/lib/docker
 service docker restart
+# clear our images temp folder
+rm -rf /var/lib/docker/tmp
 # important: everything (except earthly ls) should go through earthly-ci
 scripts/earthly-ci bootstrap
 touch /home/ubuntu/.setup-complete
\ No newline at end of file