From ba9239d1b32c304539d5a42904ba58dd224aead2 Mon Sep 17 00:00:00 2001
From: Pavlos Rontidis <pavlos.rontidis@gmail.com>
Date: Thu, 14 Sep 2023 10:04:08 +0200
Subject: [PATCH] WIP

---
 .github/workflows/workload_checks.yml         | 352 ++++++++++++++++++
 workload-checks/README.md                     |   9 +
 .../cases/http_text_to_http_json/README.md    |   5 +
 .../http_text_to_http_json/experiment.yaml    |  21 ++
 .../http_text_to_http_json/lading/lading.yaml |  16 +
 .../http_text_to_http_json/vector/vector.toml |  12 +
 workload-checks/typical/machine.yaml          |   7 +
 7 files changed, 422 insertions(+)
 create mode 100644 .github/workflows/workload_checks.yml
 create mode 100644 workload-checks/README.md
 create mode 100644 workload-checks/typical/cases/http_text_to_http_json/README.md
 create mode 100644 workload-checks/typical/cases/http_text_to_http_json/experiment.yaml
 create mode 100644 workload-checks/typical/cases/http_text_to_http_json/lading/lading.yaml
 create mode 100644 workload-checks/typical/cases/http_text_to_http_json/vector/vector.toml
 create mode 100644 workload-checks/typical/machine.yaml

diff --git a/.github/workflows/workload_checks.yml b/.github/workflows/workload_checks.yml
new file mode 100644
index 00000000000000..ffc8e084f8c421
--- /dev/null
+++ b/.github/workflows/workload_checks.yml
@@ -0,0 +1,352 @@
+# Workload Checks
+#
+# Runs Vector Workload Checks.
+#
+# Runs on:
+#  - a daily schedule (midnight UTC), manual dispatch, or a call from another workflow
+
+# This workflow runs the collection of our workload checks, using the repo HEAD SHA,
+# which depends on when the workflow is invoked.
+#
+# The goal is to establish a baseline of check results for a variety of cases
+# and visualize trends for important Vector use cases.
+#
+# The HEAD SHA is also used to tag the Vector Docker image.
+
+name: Workload Checks
+
+on:
+  schedule:
+    - cron: "0 0 * * *"  # every day at 00:00 UTC
+  workflow_dispatch:
+  workflow_call:
+
+env:
+  SINGLE_MACHINE_PERFORMANCE_API: ${{ secrets.SINGLE_MACHINE_PERFORMANCE_API }}
+
+jobs:
+  compute-metadata:
+    name: Compute metadata
+    runs-on: ubuntu-22.04
+    # Entry-point job: it has no upstream `needs`; all other jobs consume its outputs.
+    outputs:
+      target-sha: ${{ steps.git-metadata.outputs.TARGET_SHA }}
+
+      # below are used in the experiment/analyze jobs
+      cpus: ${{ steps.system.outputs.CPUS }}
+      memory: ${{ steps.system.outputs.MEMORY }}
+      vector-cpus: ${{ steps.system.outputs.VECTOR_CPUS }}
+
+      replicas: ${{ steps.experimental-meta.outputs.REPLICAS }}
+      warmup-seconds: ${{ steps.experimental-meta.outputs.WARMUP_SECONDS }}
+      total-samples: ${{ steps.experimental-meta.outputs.TOTAL_SAMPLES }}
+      p-value: ${{ steps.experimental-meta.outputs.P_VALUE }}
+      smp-version: ${{ steps.experimental-meta.outputs.SMP_CRATE_VERSION }}
+      lading-version: ${{ steps.experimental-meta.outputs.LADING_VERSION }}
+
+    steps:
+      - uses: actions/checkout@v3
+        with:
+          fetch-depth: 1000  # history depth available to `git merge-base` below
+
+      - name: Get git metadata
+        id: git-metadata
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: |
+          # Target commit: the merge base of master and the checked-out HEAD.
+          export TARGET_SHA="$(git merge-base master HEAD)"
+          echo "TARGET_SHA=${TARGET_SHA}" >> "${GITHUB_OUTPUT}"
+          echo "target sha is: ${TARGET_SHA}"
+
+          if [ -z "${TARGET_SHA}" ]; then
+            echo "TARGET_SHA not found, exiting."
+            exit 1
+          fi
+
+      - name: Setup experimental metadata
+        id: experimental-meta
+        run: |
+          WARMUP_SECONDS="45"
+          REPLICAS="10"
+          TOTAL_SAMPLES="600"
+          P_VALUE="0.1"
+          SMP_CRATE_VERSION="0.10.0"
+          LADING_VERSION="0.18.0"
+
+          printf 'warmup seconds: %s\n' "${WARMUP_SECONDS}"
+          printf 'replicas: %s\n' "${REPLICAS}"
+          printf 'total samples: %s\n' "${TOTAL_SAMPLES}"
+          printf 'regression p-value: %s\n' "${P_VALUE}"
+          printf 'smp crate version: %s\n' "${SMP_CRATE_VERSION}"
+          printf 'lading version: %s\n' "${LADING_VERSION}"
+
+          echo "WARMUP_SECONDS=${WARMUP_SECONDS}" >> "${GITHUB_OUTPUT}"
+          echo "REPLICAS=${REPLICAS}" >> "${GITHUB_OUTPUT}"
+          echo "TOTAL_SAMPLES=${TOTAL_SAMPLES}" >> "${GITHUB_OUTPUT}"
+          echo "P_VALUE=${P_VALUE}" >> "${GITHUB_OUTPUT}"
+          echo "SMP_CRATE_VERSION=${SMP_CRATE_VERSION}" >> "${GITHUB_OUTPUT}"
+          echo "LADING_VERSION=${LADING_VERSION}" >> "${GITHUB_OUTPUT}"
+
+      - name: Setup system details
+        id: system
+        run: |
+          CPUS="7"
+          MEMORY="30g"
+          VECTOR_CPUS="4"
+
+          printf 'cpus total: %s\n' "${CPUS}"
+          printf 'memory total: %s\n' "${MEMORY}"
+          printf 'vector cpus: %s\n' "${VECTOR_CPUS}"
+
+          echo "CPUS=${CPUS}" >> "${GITHUB_OUTPUT}"
+          echo "MEMORY=${MEMORY}" >> "${GITHUB_OUTPUT}"
+          echo "VECTOR_CPUS=${VECTOR_CPUS}" >> "${GITHUB_OUTPUT}"
+
+  ##
+  ## BUILD
+  ##
+
+  build-target:
+    name: Build target Vector container
+    runs-on: [linux, ubuntu-20.04-4core]
+    needs:
+      - compute-metadata
+    steps:
+      - uses: colpal/actions-clean@v1
+
+      - uses: actions/checkout@v3
+        with:
+          ref: ${{ needs.compute-metadata.outputs.target-sha }}
+          path: target-vector  # the only checkout in this job
+
+      - name: Set up Docker Buildx
+        id: buildx
+        uses: docker/setup-buildx-action@v3.0.0
+
+      - name: Build 'vector' target image
+        uses: docker/build-push-action@v5.0.0
+        with:
+          context: target-vector/
+          cache-from: type=gha
+          cache-to: type=gha,mode=max
+          file: target-vector/regression/Dockerfile  # resolved from the workspace, not from `context`
+          builder: ${{ steps.buildx.outputs.name }}
+          outputs: type=docker,dest=${{ runner.temp }}/target-image.tar
+          tags: |
+            vector:${{ needs.compute-metadata.outputs.target-sha }}
+
+      - name: Upload image as artifact
+        uses: actions/upload-artifact@v3
+        with:
+          name: target-image  # downloaded again by upload-target-image-to-ecr
+          path: "${{ runner.temp }}/target-image.tar"
+
+  confirm-valid-credentials:
+    name: Confirm AWS credentials are minimally valid
+    runs-on: ubuntu-22.04
+    # compute-metadata supplies the SMP release version used in the download below.
+    needs: [compute-metadata]
+    steps:
+      - name: Configure AWS Credentials
+        uses: aws-actions/configure-aws-credentials@v4.0.0
+        with:
+          aws-region: us-west-2
+          aws-access-key-id: ${{ secrets.SINGLE_MACHINE_PERFORMANCE_BOT_ACCESS_KEY_ID }}
+          aws-secret-access-key: ${{ secrets.SINGLE_MACHINE_PERFORMANCE_BOT_SECRET_ACCESS_KEY }}
+
+      - name: Download SMP binary
+        run: |
+          aws s3 cp s3://smp-cli-releases/v${{ needs.compute-metadata.outputs.smp-version }}/x86_64-unknown-linux-gnu/smp ${{ runner.temp }}/bin/smp
+
+  ##
+  ## SUBMIT
+  ##
+
+  upload-target-image-to-ecr:
+    name: Upload target images to ECR
+    runs-on: ubuntu-22.04
+    needs: [compute-metadata, confirm-valid-credentials, build-target]
+    steps:
+      # The `target-image` artifact is uploaded by the build-target job.
+      - name: Download target image
+        uses: actions/download-artifact@v3
+        with:
+          name: target-image
+
+      - name: Load target image
+        run: |
+          docker load --input target-image.tar
+
+      - name: Configure AWS Credentials
+        uses: aws-actions/configure-aws-credentials@v4.0.0
+        with:
+          aws-region: us-west-2
+          aws-access-key-id: ${{ secrets.SINGLE_MACHINE_PERFORMANCE_BOT_ACCESS_KEY_ID }}
+          aws-secret-access-key: ${{ secrets.SINGLE_MACHINE_PERFORMANCE_BOT_SECRET_ACCESS_KEY }}
+
+      - name: Login to Amazon ECR
+        uses: aws-actions/amazon-ecr-login@v1
+        id: login-ecr
+
+      - name: Docker Login to ECR
+        uses: docker/login-action@v2
+        with:
+          registry: ${{ steps.login-ecr.outputs.registry }}
+
+      - name: Tag & push target image
+        env:
+          REMOTE_IMAGE: ${{ steps.login-ecr.outputs.registry }}/${{ secrets.SINGLE_MACHINE_PERFORMANCE_TEAM_ID }}-vector:${{ needs.compute-metadata.outputs.target-sha }}
+        run: |
+          docker tag vector:${{ needs.compute-metadata.outputs.target-sha }} "${REMOTE_IMAGE}"
+          docker push "${REMOTE_IMAGE}"
+
+  submit-job:
+    name: Submit workload checks job
+    runs-on: ubuntu-22.04
+    needs: [compute-metadata, upload-target-image-to-ecr]
+    steps:
+      # Posts a `pending` commit status on the target SHA, linking back to this run.
+      - name: Check status, in-progress
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          STATUS_ENDPOINT: /repos/${{ github.repository }}/statuses/${{ needs.compute-metadata.outputs.target-sha }}
+          RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+        run: |
+          gh api "${STATUS_ENDPOINT}" \
+            --method POST \
+            -H "Accept: application/vnd.github+json" \
+            -f state='pending' \
+            -f description='Experiments submitted to the Regression Detection cluster.' \
+            -f context='Regression Detection Suite / submission' \
+            -f target_url="${RUN_URL}"
+
+      - uses: actions/checkout@v3  # repository checkout for the steps below
+
+      - name: Configure AWS Credentials
+        uses: aws-actions/configure-aws-credentials@v4.0.0
+        with:
+          aws-region: us-west-2
+          aws-access-key-id: ${{ secrets.SINGLE_MACHINE_PERFORMANCE_BOT_ACCESS_KEY_ID }}
+          aws-secret-access-key: ${{ secrets.SINGLE_MACHINE_PERFORMANCE_BOT_SECRET_ACCESS_KEY }}
+
+      - name: Login to Amazon ECR
+        uses: aws-actions/amazon-ecr-login@v1
+        id: login-ecr
+
+      - name: Download SMP binary
+        run: |
+          aws s3 cp s3://smp-cli-releases/v${{ needs.compute-metadata.outputs.smp-version }}/x86_64-unknown-linux-gnu/smp ${{ runner.temp }}/bin/smp
+
+      - name: Submit job
+        env:
+          # Verbose smp logging; the AWS credential-profile logger is held at error.
+          RUST_LOG: "debug,aws_config::profile::credentials=error"
+          RUST_BACKTRACE: "1"
+          # Experiment parameters come from the compute-metadata job outputs.
+          LADING_VERSION: ${{ needs.compute-metadata.outputs.lading-version }}
+          TOTAL_SAMPLES: ${{ needs.compute-metadata.outputs.total-samples }}
+          WARMUP_SECONDS: ${{ needs.compute-metadata.outputs.warmup-seconds }}
+          REPLICAS: ${{ needs.compute-metadata.outputs.replicas }}
+          TARGET_SHA: ${{ needs.compute-metadata.outputs.target-sha }}
+          # Same image reference that upload-target-image-to-ecr tags and pushes.
+          TARGET_IMAGE: ${{ steps.login-ecr.outputs.registry }}/${{ secrets.SINGLE_MACHINE_PERFORMANCE_TEAM_ID }}-vector:${{ needs.compute-metadata.outputs.target-sha }}
+          SMP_TEAM_ID: ${{ secrets.SINGLE_MACHINE_PERFORMANCE_TEAM_ID }}
+          SMP_BIN: ${{ runner.temp }}/bin/smp
+          SUBMISSION_METADATA: ${{ runner.temp }}/submission-metadata
+        run: |
+          # Uses the binary fetched by "Download SMP binary" and the credentials set
+          # up by "Configure AWS Credentials", like the other smp steps in this job.
+          set -euo pipefail
+
+          chmod +x "${SMP_BIN}"
+
+          # Date tag attached to the submission.
+          CURRENT_DATE=$(date --utc '+%Y_%m_%d')
+
+          # NOTE(review): --target-command and --target-environment-variables (DD_*)
+          # look Datadog-Agent specific and are left unchanged here; confirm them
+          # against the Vector image built from regression/Dockerfile.
+          "${SMP_BIN}" --team-id "${SMP_TEAM_ID}" \
+            job submit-workload \
+              --lading-version "${LADING_VERSION}" \
+              --total-samples "${TOTAL_SAMPLES}" \
+              --warmup-seconds "${WARMUP_SECONDS}" \
+              --replicas "${REPLICAS}" \
+              --target-image "${TARGET_IMAGE}" \
+              --target-sha "${TARGET_SHA}" \
+              --target-config-dir "${GITHUB_WORKSPACE}/workload-checks" \
+              --target-name vector \
+              --target-command "/bin/entrypoint.sh" \
+              --target-environment-variables "DD_HOSTNAME=smp-workload-checks,DD_DD_URL=http://127.0.0.1:9092,DD_API_KEY=00000001" \
+              --tags smp_status=nightly,client_team="vector",tag_date="${CURRENT_DATE}" \
+              --submission-metadata "${SUBMISSION_METADATA}"
+
+      - uses: actions/upload-artifact@v3
+        with:
+          path: ${{ runner.temp }}/submission-metadata
+          name: vector-submission-metadata
+
+      - name: Await job
+        timeout-minutes: 120
+        env:
+          RUST_LOG: info
+          SMP_BIN: ${{ runner.temp }}/bin/smp
+        run: |
+          chmod +x "${SMP_BIN}"
+          # Wait on the submitted job; delay and timeout are passed to smp as flags.
+          "${SMP_BIN}" --team-id ${{ secrets.SINGLE_MACHINE_PERFORMANCE_TEAM_ID }} job status \
+            --wait \
+            --wait-delay-seconds 60 \
+            --wait-timeout-minutes 90 \
+            --submission-metadata ${{ runner.temp }}/submission-metadata
+
+      - name: Handle cancellation if necessary
+        if: cancelled()
+        env:
+          RUST_LOG: info
+        run: |
+          chmod +x ${{ runner.temp }}/bin/smp
+          ${{ runner.temp }}/bin/smp --team-id ${{ secrets.SINGLE_MACHINE_PERFORMANCE_TEAM_ID }} \
+            job cancel --submission-metadata ${{ runner.temp }}/submission-metadata
+
+      - name: Check status, cancelled
+        if: cancelled()
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: |
+          gh api \
+            /repos/${{ github.repository }}/statuses/${{ needs.compute-metadata.outputs.target-sha }} \
+            --method POST \
+            -H "Accept: application/vnd.github+json" \
+            -f state="failure" \
+            -f description="Experiments submitted to the Regression Detection cluster cancelled." \
+            -f context="Regression Detection Suite / submission" \
+            -f target_url=${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+
+      - name: Check status, success
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: |
+          gh api \
+            /repos/${{ github.repository }}/statuses/${{ needs.compute-metadata.outputs.target-sha }} \
+            --method POST \
+            -H "Accept: application/vnd.github+json" \
+            -f state="success" \
+            -f description="Experiments submitted to the Regression Detection cluster successfully." \
+            -f context="Regression Detection Suite / submission" \
+            -f target_url=${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+
+      - name: Check status, failure
+        if: ${{ failure() }}  # only when an earlier step of this job failed
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: |
+          gh api \
+            --method POST \
+            -H "Accept: application/vnd.github+json" \
+            /repos/${{ github.repository }}/statuses/${{ needs.compute-metadata.outputs.target-sha }} \
+            -f state='failure' \
+            -f description='Experiments submitted to the Regression Detection Suite failed.' \
+            -f context='Regression Detection Suite / submission' \
+            -f target_url=${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
diff --git a/workload-checks/README.md b/workload-checks/README.md
new file mode 100644
index 00000000000000..eee2c80bec72da
--- /dev/null
+++ b/workload-checks/README.md
@@ -0,0 +1,9 @@
+# Workload Checks
+
+The `smp` tool performs a nightly run of 'checks' to determine whether Vector is fit for purpose.
+These checks help us answer questions about CPU usage, memory consumption, throughput, etc.
+Running them consistently builds up a historical dataset, which can be viewed [here](https://app.datadoghq.com/dashboard/wj9-9ds-q49?refresh_mode=sliding&from_ts=1694089061369&to_ts=1694693861369&live=true).
+
+## Adding an Experiment
+
+You can read more about the workload requirements [here](https://github.com/DataDog/datadog-agent/blob/main/test/workload-checks/README.md).
diff --git a/workload-checks/typical/cases/http_text_to_http_json/README.md b/workload-checks/typical/cases/http_text_to_http_json/README.md
new file mode 100644
index 00000000000000..ec257135c5cf6c
--- /dev/null
+++ b/workload-checks/typical/cases/http_text_to_http_json/README.md
@@ -0,0 +1,5 @@
+# HTTP Text To HTTP JSON
+
+## Purpose
+
+Simulates a simple Vector deployment with one HTTP server source and one HTTP sink. It was added as a proof of concept for the SMP workload checks.
diff --git a/workload-checks/typical/cases/http_text_to_http_json/experiment.yaml b/workload-checks/typical/cases/http_text_to_http_json/experiment.yaml
new file mode 100644
index 00000000000000..446808a9fc6306
--- /dev/null
+++ b/workload-checks/typical/cases/http_text_to_http_json/experiment.yaml
@@ -0,0 +1,21 @@
+description: >
+  Simulates a simple Vector use with one HTTP server source and one HTTP sink.
+  This was added as a proof of concept for the SMP workload checks.
+teams: []
+
+labels: {}
+
+checks:
+  - name: memory_usage
+    description: "Memory usage"
+    bounds:
+      series: rss_bytes
+      # The machine has 12Gb free.
+      upper_bound: 3.5Gb
+
+  - name: cpu_utilization
+    description: "CPU utilization"
+    bounds:
+      series: cpu_percentage
+      # The machine has 8 cores available.
+      upper_bound: 400
diff --git a/workload-checks/typical/cases/http_text_to_http_json/lading/lading.yaml b/workload-checks/typical/cases/http_text_to_http_json/lading/lading.yaml
new file mode 100644
index 00000000000000..24b27c1e26abb2
--- /dev/null
+++ b/workload-checks/typical/cases/http_text_to_http_json/lading/lading.yaml
@@ -0,0 +1,16 @@
+generator:
+  - http:
+      target_uri: 'http://localhost:8282/'  # port of Vector's http_server source
+      method:
+        post:
+          variant: 'apache_common'
+          maximum_prebuild_cache_size_bytes: '256 Mb'
+      headers: {}
+      bytes_per_second: '500 Mb'
+      parallel_connections: 10
+      seed: [2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53,
+             59, 61, 67, 71, 73, 79, 83, 89, 97, 101, 103, 107, 109, 113, 127, 131]
+
+blackhole:
+  - http:
+      binding_addr: '0.0.0.0:8080'  # port targeted by Vector's http sink
diff --git a/workload-checks/typical/cases/http_text_to_http_json/vector/vector.toml b/workload-checks/typical/cases/http_text_to_http_json/vector/vector.toml
new file mode 100644
index 00000000000000..04510055960362
--- /dev/null
+++ b/workload-checks/typical/cases/http_text_to_http_json/vector/vector.toml
@@ -0,0 +1,12 @@
+data_dir = '/var/lib/vector'
+
+[sources.logs]
+address = '0.0.0.0:8282'  # lading's HTTP generator posts to this port
+type = 'http_server'
+decoding.codec = 'bytes'
+
+[sinks.http_sink]
+inputs = ['logs']
+type = 'http'
+uri = 'http://localhost:8080'  # lading's HTTP blackhole listens on this port
+encoding.codec = 'json'
diff --git a/workload-checks/typical/machine.yaml b/workload-checks/typical/machine.yaml
new file mode 100644
index 00000000000000..f16dab521f2fbb
--- /dev/null
+++ b/workload-checks/typical/machine.yaml
@@ -0,0 +1,7 @@
+name: typical
+cpu: 8
+memory: '12Gb'
+description: >
+  An ‘average’ customer server on which the agent runs alongside user
+  software. This is equivalent to an AWS c5.2xlarge with 4Gb of system memory
+  held back for system processes.