feat: aggregate and summarize metrics with rust (#1154)
* chore: move metrics_unify to openvm-prof

* feat(prof): aggregate and summary metrics with rust

* feat: add diffs for aggregate/summary

* fix

* fix: handle prev formatting mismatch

* hacky add metadata

* chore: copy less files
jonathanpwang authored Jan 2, 2025
1 parent cb6c5a9 commit 1425c1d
Showing 17 changed files with 15,015 additions and 717 deletions.
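This commit moves the Python metric_unify tooling into a Rust CLI, openvm-prof (crates/prof): CI now downloads the per-benchmark metric JSON files plus their main-branch baselines and lets the Rust binary compute the aggregates, diffs, and markdown summary. As a rough sketch of how the updated benchmarks.yml drives it (the flags mirror the workflow step in the diff below; the file names, the <sha> placeholder, and the results link are illustrative, not taken verbatim from the workflow):

```bash
# Install the new CLI from the workspace (benchmarks.yml does this from crates/prof).
cargo install --force --profile=dev --path crates/prof

# Aggregate this run's metric JSONs, diff them against the main-branch baselines,
# and write the combined summary markdown. Both path flags take comma-separated lists.
openvm-prof --json-paths "fibonacci-<sha>.json,regex-<sha>.json" \
    --prev-json-paths "main-fibonacci.json,main-regex.json" \
    summary \
    --benchmark-results-link "https://github.com/OWNER/REPO/blob/benchmark-results/benchmarks-pr/<pr-number>/individual" \
    --summary-md-path summary.md
```

The workflow then expects one markdown report per input JSON (fibonacci-<sha>.md, and so on), annotates each with add_metadata, and pushes them together with summary.md to the benchmark-results branch.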
93 changes: 4 additions & 89 deletions .github/workflows/benchmark-call.yml
@@ -4,16 +4,9 @@ on:
workflow_dispatch:
inputs:
benchmark_name:
type: choice
type: string
required: true
description: The name of the benchmark to run
options:
- verify_fibair
- fibonacci
- revm_transfer
- regex
- base64_json
- fib_e2e
instance_type:
type: string
required: false
@@ -104,9 +97,7 @@ on:
description: Whether to run the e2e benchmark

env:
S3_PATH: s3://openvm-public-data-sandbox-us-east-1/benchmark/github/results
S3_METRICS_PATH: s3://openvm-public-data-sandbox-us-east-1/benchmark/github/metrics
PUBLIC_S3_PATH: s3://openvm-public-data-sandbox-us-east-1/benchmark/github/flamegraphs
FEATURE_FLAGS: "bench-metrics,parallel,nightly-features"
CMD_ARGS: ""
INPUT_ARGS: ""
@@ -128,7 +119,6 @@ jobs:
##########################################################################
# Environment setup #
##########################################################################

- uses: actions/checkout@v4
with:
ref: ${{ github.event.pull_request.head.sha || github.sha }}
@@ -211,7 +201,7 @@ jobs:
python3 ${{ steps.set-working-dir.outputs.relative_path }}/ci/scripts/bench.py $BIN_NAME $CMD_ARGS $INPUT_ARGS
##########################################################################
# Generate result .md files and flamegraphs, store them in S3 #
# Store metric json file to S3 #
##########################################################################
- name: Upload metric json and compute diff with previous to generate markdown
run: |
@@ -221,9 +211,6 @@
s5cmd cp $METRIC_PATH ${{ env.S3_METRICS_PATH }}/${METRIC_NAME}-${current_sha}.json
source ci/scripts/utils.sh
generate_markdown $METRIC_PATH $METRIC_NAME ${{ env.S3_METRICS_PATH }} "."
# - name: Install inferno-flamegraph
# run: cargo install inferno

@@ -235,82 +222,10 @@
# echo "UPLOAD_FLAMEGRAPHS=1" >> $GITHUB_ENV
# fi

- name: Add benchmark metadata and upload markdown
id: add_metadata
run: |
source ci/scripts/utils.sh
add_metadata results.md ${{ inputs.max_segment_length }} ${{ inputs.instance_type }} ${{ inputs.memory_allocator }} ${{ github.repository }} ${{ github.run_id }}
s3_md_file="${METRIC_NAME}-${current_sha}.md"
s5cmd cp results.md "${{ env.S3_PATH }}/${s3_md_file}"
##########################################################################
# Update S3 with individual results upon a push event #
# Update s3 for latest main metrics upon a push event #
##########################################################################
- name: Update latest main result in s3
if: github.event_name == 'push' && github.ref == 'refs/heads/main'
run: |
s5cmd cp "${{ env.S3_PATH }}/${METRIC_NAME}-${{ env.current_sha }}.md" "${{ env.S3_PATH }}/main-${METRIC_NAME}.md"
if [[ -f $METRIC_PATH ]]; then
s5cmd cp $METRIC_PATH "${{ env.S3_METRICS_PATH }}/main-${METRIC_NAME}.json"
fi
##########################################################################
# Update benchmark-results with individual results #
##########################################################################
- uses: actions/checkout@v4
with:
ref: benchmark-results

- name: Set up git
run: |
git config --global user.email "github-actions[bot]@users.noreply.github.com"
git config --global user.name "github-actions[bot]"
- name: Set github pages path for dispatch
run: |
BENCHMARK_RESULTS_PATH="benchmarks-dispatch/${{ github.head_ref || github.ref }}"
echo "BENCHMARK_RESULTS_PATH=${BENCHMARK_RESULTS_PATH}" >> $GITHUB_ENV
- name: Set github pages path for PR
if: github.event_name == 'pull_request'
run: |
BENCHMARK_RESULTS_PATH="benchmarks-pr/${{ github.event.pull_request.number }}/individual"
echo "BENCHMARK_RESULTS_PATH=${BENCHMARK_RESULTS_PATH}" >> $GITHUB_ENV
- name: Set github pages path for push
if: github.event_name == 'push' && github.ref == 'refs/heads/main'
run: |
BENCHMARK_RESULTS_PATH="benchmarks/individual"
echo "BENCHMARK_RESULTS_PATH=${BENCHMARK_RESULTS_PATH}" >> $GITHUB_ENV
- name: Update PR github pages with new bench results
if: github.event.pull_request.head.repo.fork == false # forks do not have write access
run: |
mkdir -p ${BENCHMARK_RESULTS_PATH}
s3_md_file="${METRIC_NAME}-${current_sha}.md"
s5cmd cp "${{ env.S3_PATH }}/${s3_md_file}" ${BENCHMARK_RESULTS_PATH}/${s3_md_file}
git add ${BENCHMARK_RESULTS_PATH}/${s3_md_file}
git commit --allow-empty -m "Update benchmark result at ${BENCHMARK_RESULTS_PATH}/${s3_md_file}"
MAX_RETRIES=10
RETRY_DELAY=5
ATTEMPT=0
SUCCESS=false
while [ $ATTEMPT -lt $MAX_RETRIES ]; do
echo "Attempt $((ATTEMPT + 1)) to push of $MAX_RETRIES..."
git fetch origin benchmark-results
git merge origin/benchmark-results --no-edit
if git push origin benchmark-results; then
SUCCESS=true
break
else
echo "Push failed. Retrying in $RETRY_DELAY seconds..."
sleep $RETRY_DELAY
ATTEMPT=$((ATTEMPT + 1))
fi
done
if [ "$SUCCESS" = false ]; then
echo "PUSH_FAILED"
exit 1
fi
s5cmd cp $METRIC_PATH "${{ env.S3_METRICS_PATH }}/main-${METRIC_NAME}.json"
149 changes: 104 additions & 45 deletions .github/workflows/benchmarks.yml
@@ -14,6 +14,7 @@ on:
- "benchmarks/**"
- ".github/workflows/benchmark-call.yml"
- ".github/workflows/benchmarks.yml"
workflow_dispatch:

concurrency:
group: benchmark-${{ github.event.pull_request.number || github.sha }}
@@ -25,6 +26,8 @@ env:
CURRENT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
REPO: ${{ github.event.pull_request.head.repo.full_name || github.repository }}
CARGO_NET_GIT_FETCH_WITH_CLI: "true"
S3_METRICS_PATH: s3://openvm-public-data-sandbox-us-east-1/benchmark/github/metrics
S3_MD_PATH: s3://openvm-public-data-sandbox-us-east-1/benchmark/github/results

permissions:
contents: write
@@ -138,6 +141,15 @@ jobs:
ref: ${{ env.CURRENT_SHA }}
repository: ${{ env.REPO }}

- name: Install openvm-prof
working-directory: crates/prof
run: cargo install --force --profile=dev --path .

- name: Set github pages path for dispatch
run: |
BENCHMARK_RESULTS_PATH="benchmarks-dispatch/${{ github.head_ref || github.ref }}"
echo "BENCHMARK_RESULTS_PATH=${BENCHMARK_RESULTS_PATH}" >> $GITHUB_ENV
- name: Set github pages path for PR
if: github.event_name == 'pull_request'
run: |
@@ -150,59 +162,84 @@ jobs:
BENCHMARK_RESULTS_PATH="benchmarks"
echo "BENCHMARK_RESULTS_PATH=${BENCHMARK_RESULTS_PATH}" >> $GITHUB_ENV
- name: Load all metadata files from S3
- name: Download all metric json files from S3
run: |
current_sha=$(git rev-parse HEAD)
md_files=$(echo '${{ needs.create-matrix.outputs.matrix }}' | jq -r '
json_files=$(echo '${{ needs.create-matrix.outputs.matrix }}' | jq -r '
.[] |
select(.e2e_bench != true) |
"\(.id)-"' |
sed "s/$/${current_sha}.md/" |
sort)
md_file_list=$(echo -n "$md_files" | paste -sd "," -)
"\(.id)-${{ env.CURRENT_SHA }}.json"')
json_file_list=$(echo -n "$json_files" | paste -sd "," -)
echo $json_file_list
e2e_md_files=$(echo '${{ needs.create-matrix.outputs.matrix }}' | jq -r '
prev_json_files=$(echo '${{ needs.create-matrix.outputs.matrix }}' | jq -r '
.[] |
select(.e2e_bench == true) |
"\(.id)-"' |
sed "s/$/${current_sha}.md/" |
sort)
e2e_md_file_list=$(echo -n "$e2e_md_files" | paste -sd "," -)
while read md_file; do
if [ -z "$md_file" ]; then
"main-\(.id).json"')
prev_json_file_list=$(echo -n "$prev_json_files" | paste -sd "," -)
echo $prev_json_file_list
(echo "$json_files"; echo "$prev_json_files") | while read json_file; do
if [ -z "$json_file" ]; then
continue
fi
echo "Downloading results for benchmark: $md_file"
s5cmd cp "s3://openvm-public-data-sandbox-us-east-1/benchmark/github/results/${md_file}" "${md_file}"
done <<< "$md_files"
E2E_FILE_LIST=""
if [[ -n "$e2e_md_files" ]]; then
while read e2e_md_file; do
echo "Downloading results for benchmark: $e2e_md_file"
s5cmd cp "s3://openvm-public-data-sandbox-us-east-1/benchmark/github/results/${e2e_md_file}" "${e2e_md_file}"
done <<< "$e2e_md_files"
E2E_FILE_LIST="${e2e_md_file_list}"
fi
echo "Downloading metrics for benchmark: $json_file"
if ! s5cmd cp "${{ env.S3_METRICS_PATH }}/${json_file}" "${json_file}"; then
echo "Warning: Failed to download ${json_file}, skipping..."
continue
fi
done
echo "Benchmarks: ${md_file_list}"
echo "E2E Benchmarks: ${E2E_FILE_LIST}"
openvm-prof --json-paths "${json_file_list}" \
--prev-json-paths "${prev_json_file_list}" \
summary \
--benchmark-results-link "https://github.com/${{ github.repository }}/blob/benchmark-results/${BENCHMARK_RESULTS_PATH}" \
--summary-md-path summary.md
python3 ci/scripts/metric_unify/summarize.py "${md_file_list}" \
--e2e-md-files "${E2E_FILE_LIST}" \
--aggregation-json ci/scripts/metric_unify/aggregation.json \
--benchmark-results-link "https://github.com/${{ github.repository }}/blob/benchmark-results/${BENCHMARK_RESULTS_PATH}"
COMMIT_URL=https://github.com/${{ github.repository }}/commit/${CURRENT_SHA}
BENCHMARK_WORKFLOW_URL=https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}
TMP_DIR=/tmp/benchmark-results/${{ env.CURRENT_SHA }}
echo "TMP_DIR=${TMP_DIR}" >> $GITHUB_ENV
mkdir -p ${TMP_DIR}
# add metadata to markdown files
source ci/scripts/utils.sh
# Parse matrix data into associative arrays
while IFS= read -r line; do
# Convert .json to .md
md_file="${line%.json}.md"
id=$(echo '${{ needs.create-matrix.outputs.matrix }}' | jq -r --arg file "$line" '.[] |
select(.id == ($file | split("-")[0])) |
{
max_segment_length: .max_segment_length,
instance_type: .instance_type,
memory_allocator: .memory_allocator
}')
if [ ! -z "$id" ]; then
max_segment_length=$(echo "$id" | jq -r '.max_segment_length')
instance_type=$(echo "$id" | jq -r '.instance_type')
memory_allocator=$(echo "$id" | jq -r '.memory_allocator')
# Call add_metadata for each file with its corresponding data
add_metadata \
"$md_file" \
"$max_segment_length" \
"$instance_type" \
"$memory_allocator" \
"$COMMIT_URL" \
"$BENCHMARK_WORKFLOW_URL"
fi
cp "$md_file" "${TMP_DIR}/"
done <<< "$json_files"
echo "" >> summary.md
echo "Commit: https://github.com/${{ github.repository }}/commit/${CURRENT_SHA}" >> summary.md
echo "Commit: $COMMIT_URL" >> summary.md
echo "" >> summary.md
echo "[Benchmark Workflow](https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }})" >> summary.md
mkdir -p /tmp/benchmark-results/
cp summary.md /tmp/benchmark-results/
echo "[Benchmark Workflow]($BENCHMARK_WORKFLOW_URL)" >> summary.md
cp summary.md ${TMP_DIR}/
##########################################################################
# Update benchmark-results with summary upon a PR event #
# Update benchmark-results branch with summary upon a PR event #
##########################################################################
- uses: actions/checkout@v4
if: github.event_name == 'pull_request' || (github.event_name == 'push' && github.ref == 'refs/heads/main')
@@ -219,14 +256,36 @@ jobs:
if: (github.event_name == 'pull_request' && github.event.pull_request.head.repo.fork == false) || (github.event_name == 'push' && github.ref == 'refs/heads/main')
run: |
mkdir -p ${BENCHMARK_RESULTS_PATH}
cp /tmp/benchmark-results/summary.md ${BENCHMARK_RESULTS_PATH}/summary.md
git add ${BENCHMARK_RESULTS_PATH}/summary.md
cp ${TMP_DIR}/*.md ${BENCHMARK_RESULTS_PATH}/
git add ${BENCHMARK_RESULTS_PATH}
if [[ "${{ github.event_name }}" == "push" ]]; then
cp /tmp/benchmark-results/summary.md index.md
git add index.md
fi
git commit --allow-empty -m "Update summarized benchmark result at ${BENCHMARK_RESULTS_PATH}/summary.md"
git push --force
git commit --allow-empty -m "Update benchmark results at ${{ env.CURRENT_SHA }}"
MAX_RETRIES=10
RETRY_DELAY=5
ATTEMPT=0
SUCCESS=false
while [ $ATTEMPT -lt $MAX_RETRIES ]; do
echo "Attempt $((ATTEMPT + 1)) to push of $MAX_RETRIES..."
git fetch origin benchmark-results
git merge origin/benchmark-results --no-edit
if git push origin benchmark-results; then
SUCCESS=true
break
else
echo "Push failed. Retrying in $RETRY_DELAY seconds..."
sleep $RETRY_DELAY
ATTEMPT=$((ATTEMPT + 1))
fi
done
if [ "$SUCCESS" = false ]; then
echo "PUSH_FAILED"
exit 1
fi
##########################################################################
# Update PR comment upon a pull request event #
@@ -262,7 +321,7 @@ jobs:
with:
script: |
const fs = require('fs')
const newBenchmark = fs.readFileSync('/tmp/benchmark-results/summary.md', { encoding: 'utf8', flag: 'r' })
const newBenchmark = fs.readFileSync('${{ env.TMP_DIR }}/summary.md', { encoding: 'utf8', flag: 'r' })
github.rest.issues.createComment({
issue_number: context.issue.number,
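A note on the add_metadata helper invoked in the benchmarks.yml step above: it is defined in ci/scripts/utils.sh, whose diff is not shown in this view. Purely to illustrate the call signature used by the workflow (markdown file, max segment length, instance type, memory allocator, commit URL, workflow URL), a minimal hypothetical stand-in might look like the following; the real helper may format or order things differently:

```bash
#!/usr/bin/env bash
# Hypothetical stand-in for add_metadata from ci/scripts/utils.sh (its diff is not shown above).
# Appends run metadata to an existing per-benchmark markdown report.
add_metadata() {
    local md_file=$1 max_segment_length=$2 instance_type=$3 memory_allocator=$4 commit_url=$5 workflow_url=$6
    {
        echo ""
        echo "| Metadata | Value |"
        echo "| --- | --- |"
        echo "| max_segment_length | ${max_segment_length} |"
        echo "| instance_type | ${instance_type} |"
        echo "| memory_allocator | ${memory_allocator} |"
        echo ""
        echo "Commit: ${commit_url}"
        echo ""
        echo "[Benchmark Workflow](${workflow_url})"
    } >> "${md_file}"
}

# Example call (values are made up; the workflow passes them from the benchmark matrix):
# add_metadata results.md 4194204 64cpu-linux-arm64 mimalloc \
#     "https://github.com/OWNER/REPO/commit/abc123" \
#     "https://github.com/OWNER/REPO/actions/runs/123456789"
```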