From ba9239d1b32c304539d5a42904ba58dd224aead2 Mon Sep 17 00:00:00 2001
From: Pavlos Rontidis
Date: Thu, 14 Sep 2023 10:04:08 +0200
Subject: [PATCH] WIP

---
 .github/workflows/workload_checks.yml         | 352 ++++++++++++++++++
 workload-checks/README.md                     |   9 +
 .../cases/http_text_to_http_json/README.md    |   5 +
 .../http_text_to_http_json/experiment.yaml    |  21 ++
 .../http_text_to_http_json/lading/lading.yaml |  16 +
 .../http_text_to_http_json/vector/vector.toml |  12 +
 workload-checks/typical/machine.yaml          |   7 +
 7 files changed, 422 insertions(+)
 create mode 100644 .github/workflows/workload_checks.yml
 create mode 100644 workload-checks/README.md
 create mode 100644 workload-checks/typical/cases/http_text_to_http_json/README.md
 create mode 100644 workload-checks/typical/cases/http_text_to_http_json/experiment.yaml
 create mode 100644 workload-checks/typical/cases/http_text_to_http_json/lading/lading.yaml
 create mode 100644 workload-checks/typical/cases/http_text_to_http_json/vector/vector.toml
 create mode 100644 workload-checks/typical/machine.yaml

diff --git a/.github/workflows/workload_checks.yml b/.github/workflows/workload_checks.yml
new file mode 100644
index 00000000000000..ffc8e084f8c421
--- /dev/null
+++ b/.github/workflows/workload_checks.yml
@@ -0,0 +1,352 @@
+# Workload Checks
+#
+# Runs Vector Workload Checks.
+#
+# Runs on:
+#  - scheduled daily at UTC midnight; also via workflow_call / workflow_dispatch
+
+# This workflow runs the collection of our workload checks, using the repo HEAD SHA,
+# which depends on when the workflow is invoked.
+#
+# The goal is to establish a baseline of check results for a variety of cases
+# and visualize trends for important Vector use cases.
+#
+# The HEAD SHA is also used to tag the Vector Docker image.
+
+name: Workload Checks
+
+on:
+  workflow_call:
+  workflow_dispatch:
+  schedule:
+    - cron: '0 0 * * *'
+
+env:
+  SINGLE_MACHINE_PERFORMANCE_API: ${{ secrets.SINGLE_MACHINE_PERFORMANCE_API }}
+
+jobs:
+  compute-metadata:
+    name: Compute metadata
+    runs-on: ubuntu-22.04
+    # Entry point of the workflow; every other job depends on this one.
+    outputs:
+      target-sha: ${{ steps.git-metadata.outputs.TARGET_SHA }}
+
+      # below are used in the experiment/analyze jobs
+      cpus: ${{ steps.system.outputs.CPUS }}
+      memory: ${{ steps.system.outputs.MEMORY }}
+      vector-cpus: ${{ steps.system.outputs.VECTOR_CPUS }}
+
+      replicas: ${{ steps.experimental-meta.outputs.REPLICAS }}
+      warmup-seconds: ${{ steps.experimental-meta.outputs.WARMUP_SECONDS }}
+      total-samples: ${{ steps.experimental-meta.outputs.TOTAL_SAMPLES }}
+      p-value: ${{ steps.experimental-meta.outputs.P_VALUE }}
+      smp-version: ${{ steps.experimental-meta.outputs.SMP_CRATE_VERSION }}
+      lading-version: ${{ steps.experimental-meta.outputs.LADING_VERSION }}
+
+    steps:
+      - uses: actions/checkout@v3
+        with:
+          fetch-depth: 1000
+
+      - name: Get git metadata
+        id: git-metadata
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: |
+          export TARGET_SHA=$(git merge-base master HEAD)
+          echo "TARGET_SHA=${TARGET_SHA}" >> $GITHUB_OUTPUT
+
+          echo "target sha is: ${TARGET_SHA}"
+
+          if [ "${TARGET_SHA}" = "" ] ; then
+            echo "TARGET_SHA not found, exiting."
+            exit 1
+          fi
+
+      - name: Setup experimental metadata
+        id: experimental-meta
+        run: |
+          export WARMUP_SECONDS="45"
+          export REPLICAS="10"
+          export TOTAL_SAMPLES="600"
+          export P_VALUE="0.1"
+          export SMP_CRATE_VERSION="0.10.0"
+          export LADING_VERSION="0.18.0"
+
+          echo "warmup seconds: ${WARMUP_SECONDS}"
+          echo "replicas: ${REPLICAS}"
+          echo "total samples: ${TOTAL_SAMPLES}"
+          echo "regression p-value: ${P_VALUE}"
+          echo "smp crate version: ${SMP_CRATE_VERSION}"
+          echo "lading version: ${LADING_VERSION}"
+
+          echo "WARMUP_SECONDS=${WARMUP_SECONDS}" >> $GITHUB_OUTPUT
+          echo "REPLICAS=${REPLICAS}" >> $GITHUB_OUTPUT
+          echo "TOTAL_SAMPLES=${TOTAL_SAMPLES}" >> $GITHUB_OUTPUT
+          echo "P_VALUE=${P_VALUE}" >> $GITHUB_OUTPUT
+          echo "SMP_CRATE_VERSION=${SMP_CRATE_VERSION}" >> $GITHUB_OUTPUT
+          echo "LADING_VERSION=${LADING_VERSION}" >> $GITHUB_OUTPUT
+
+      - name: Setup system details
+        id: system
+        run: |
+          export CPUS="7"
+          export MEMORY="30g"
+          export VECTOR_CPUS="4"
+
+          echo "cpus total: ${CPUS}"
+          echo "memory total: ${MEMORY}"
+          echo "vector cpus: ${VECTOR_CPUS}"
+
+          echo "CPUS=${CPUS}" >> $GITHUB_OUTPUT
+          echo "MEMORY=${MEMORY}" >> $GITHUB_OUTPUT
+          echo "VECTOR_CPUS=${VECTOR_CPUS}" >> $GITHUB_OUTPUT
+
+  ##
+  ## BUILD
+  ##
+
+  build-target:
+    name: Build target Vector container
+    runs-on: [linux, ubuntu-20.04-4core]
+    needs:
+      - compute-metadata
+    steps:
+      - uses: colpal/actions-clean@v1
+
+      - uses: actions/checkout@v3
+        with:
+          ref: ${{ needs.compute-metadata.outputs.target-sha }}
+          path: target-vector
+
+      - name: Set up Docker Buildx
+        id: buildx
+        uses: docker/setup-buildx-action@v3.0.0
+
+      - name: Build 'vector' target image
+        uses: docker/build-push-action@v5.0.0
+        with:
+          context: target-vector/
+          cache-from: type=gha
+          cache-to: type=gha,mode=max
+          file: target-vector/regression/Dockerfile
+          builder: ${{ steps.buildx.outputs.name }}
+          outputs: type=docker,dest=${{ runner.temp }}/target-image.tar
+          tags: |
+            vector:${{ needs.compute-metadata.outputs.target-sha }}
+
+      - name: Upload image as artifact
+        uses: actions/upload-artifact@v3
+        with:
+          name: target-image
+          path: "${{ runner.temp }}/target-image.tar"
+
+  confirm-valid-credentials:
+    name: Confirm AWS credentials are minimally valid
+    runs-on: ubuntu-22.04
+    needs:
+      - compute-metadata
+    steps:
+      - name: Configure AWS Credentials
+        uses: aws-actions/configure-aws-credentials@v4.0.0
+        with:
+          aws-access-key-id: ${{ secrets.SINGLE_MACHINE_PERFORMANCE_BOT_ACCESS_KEY_ID }}
+          aws-secret-access-key: ${{ secrets.SINGLE_MACHINE_PERFORMANCE_BOT_SECRET_ACCESS_KEY }}
+          aws-region: us-west-2
+
+      - name: Download SMP binary
+        run: |
+          aws s3 cp s3://smp-cli-releases/v${{ needs.compute-metadata.outputs.smp-version }}/x86_64-unknown-linux-gnu/smp ${{ runner.temp }}/bin/smp
+
+  ##
+  ## SUBMIT
+  ##
+
+  upload-target-image-to-ecr:
+    name: Upload target images to ECR
+    runs-on: ubuntu-22.04
+    needs:
+      - compute-metadata
+      - confirm-valid-credentials
+      - build-target
+    steps:
+      - name: 'Download target image'
+        uses: actions/download-artifact@v3
+        with:
+          name: target-image
+
+      - name: Load target image
+        run: |
+          docker load --input target-image.tar
+
+      - name: Configure AWS Credentials
+        uses: aws-actions/configure-aws-credentials@v4.0.0
+        with:
+          aws-access-key-id: ${{ secrets.SINGLE_MACHINE_PERFORMANCE_BOT_ACCESS_KEY_ID }}
+          aws-secret-access-key: ${{ secrets.SINGLE_MACHINE_PERFORMANCE_BOT_SECRET_ACCESS_KEY }}
+          aws-region: us-west-2
+
+      - name: Login to Amazon ECR
+        id: login-ecr
+        uses: aws-actions/amazon-ecr-login@v1
+
+      - name: Docker Login to ECR
+        uses: docker/login-action@v2
+        with:
+          registry: ${{ steps.login-ecr.outputs.registry }}
+
+      - name: Tag & push target image
+        run: |
+          docker tag vector:${{ needs.compute-metadata.outputs.target-sha }} ${{ steps.login-ecr.outputs.registry }}/${{ secrets.SINGLE_MACHINE_PERFORMANCE_TEAM_ID }}-vector:${{ needs.compute-metadata.outputs.target-sha }}
+          docker push ${{ steps.login-ecr.outputs.registry }}/${{ secrets.SINGLE_MACHINE_PERFORMANCE_TEAM_ID }}-vector:${{ needs.compute-metadata.outputs.target-sha }}
+
+  submit-job:
+    name: Submit workload checks job
+    runs-on: ubuntu-22.04
+    needs:
+      - compute-metadata
+      - upload-target-image-to-ecr
+    steps:
+      - name: Check status, in-progress
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: |
+          gh api \
+            --method POST \
+            -H "Accept: application/vnd.github+json" \
+            /repos/${{ github.repository }}/statuses/${{ needs.compute-metadata.outputs.target-sha }} \
+            -f state='pending' \
+            -f description='Experiments submitted to the Regression Detection cluster.' \
+            -f context='Workload Checks / submission' \
+            -f target_url=${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+
+      - uses: actions/checkout@v3
+
+      - name: Configure AWS Credentials
+        uses: aws-actions/configure-aws-credentials@v4.0.0
+        with:
+          aws-access-key-id: ${{ secrets.SINGLE_MACHINE_PERFORMANCE_BOT_ACCESS_KEY_ID }}
+          aws-secret-access-key: ${{ secrets.SINGLE_MACHINE_PERFORMANCE_BOT_SECRET_ACCESS_KEY }}
+          aws-region: us-west-2
+
+      - name: Login to Amazon ECR
+        id: login-ecr
+        uses: aws-actions/amazon-ecr-login@v1
+
+      - name: Download SMP binary
+        run: |
+          aws s3 cp s3://smp-cli-releases/v${{ needs.compute-metadata.outputs.smp-version }}/x86_64-unknown-linux-gnu/smp ${{ runner.temp }}/bin/smp
+
+      - name: Submit job
+        env:
+          RUST_LOG: info
+        run: |
+          git fetch origin
+
+          # Setup AWS credentials for single-machine-performance AWS account
+          AWS_NAMED_PROFILE="single-machine-performance"
+          SMP_ACCOUNT_ID=$(aws ssm get-parameter --region us-east-1 --name ci.datadog-agent.single-machine-performance-account-id --with-decryption --query "Parameter.Value" --out text)
+          SMP_ECR_URL=${SMP_ACCOUNT_ID}.dkr.ecr.us-west-2.amazonaws.com
+          SMP_AGENT_TEAM_ID=$(aws ssm get-parameter --region us-east-1 --name ci.datadog-agent.single-machine-performance-agent-team-id --with-decryption --query "Parameter.Value" --out text)
+          SMP_API=$(aws ssm get-parameter --region us-east-1 --name ci.datadog-agent.single-machine-performance-api --with-decryption --query "Parameter.Value" --out text)
+          aws configure set aws_access_key_id $(aws ssm get-parameter --region us-east-1 --name ci.datadog-agent.single-machine-performance-bot-access-key-id --with-decryption --query "Parameter.Value" --out text) --profile ${AWS_NAMED_PROFILE}
+          aws configure set aws_secret_access_key $(aws ssm get-parameter --region us-east-1 --name ci.datadog-agent.single-machine-performance-bot-access-key --with-decryption --query "Parameter.Value" --out text) --profile ${AWS_NAMED_PROFILE}
+          aws configure set region us-west-2 --profile ${AWS_NAMED_PROFILE}
+
+          # Download smp binary and prepare it for use
+          aws --profile single-machine-performance s3 cp s3://smp-cli-releases/v${{ needs.compute-metadata.outputs.smp-version }}/x86_64-unknown-linux-gnu/smp smp
+          chmod +x smp
+
+          TARGET_IMAGE=${{ steps.login-ecr.outputs.registry }}/${{ secrets.SINGLE_MACHINE_PERFORMANCE_TEAM_ID }}-vector:${{ needs.compute-metadata.outputs.target-sha }}
+          CURRENT_DATE=$(date --utc '+%Y_%m_%d')
+
+          RUST_LOG="info,aws_config::profile::credentials=error"
+          RUST_LOG_DEBUG="debug,aws_config::profile::credentials=error"
+
+          chmod +x ${{ runner.temp }}/bin/smp
+
+          RUST_BACKTRACE=1 RUST_LOG="${RUST_LOG_DEBUG}" ${{ runner.temp }}/bin/smp \
+            --team-id ${SMP_AGENT_TEAM_ID} --api-base ${SMP_API} --aws-named-profile ${AWS_NAMED_PROFILE} \
+            job submit-workload \
+            --lading-version ${{ needs.compute-metadata.outputs.lading-version }} \
+            --total-samples ${{ needs.compute-metadata.outputs.total-samples }} \
+            --warmup-seconds ${{ needs.compute-metadata.outputs.warmup-seconds }} \
+            --replicas ${{ needs.compute-metadata.outputs.replicas }} \
+            --target-image ${TARGET_IMAGE} \
+            --target-sha ${{ needs.compute-metadata.outputs.target-sha }} \
+            --target-config-dir ${{ github.workspace }}/workload-checks \
+            --target-name vector \
+            --target-command "/bin/entrypoint.sh" \
+            --target-environment-variables "DD_HOSTNAME=smp-workload-checks,DD_DD_URL=http://127.0.0.1:9092,DD_API_KEY=00000001" \
+            --tags smp_status=nightly,client_team="agent",tag_date="${CURRENT_DATE}" \
+            --submission-metadata ${{ runner.temp }}/submission-metadata
+
+      - uses: actions/upload-artifact@v3
+        with:
+          name: vector-submission-metadata
+          path: ${{ runner.temp }}/submission-metadata
+
+      - name: Await job
+        timeout-minutes: 120
+        env:
+          RUST_LOG: info
+        run: |
+          chmod +x ${{ runner.temp }}/bin/smp
+
+          ${{ runner.temp }}/bin/smp --team-id ${{ secrets.SINGLE_MACHINE_PERFORMANCE_TEAM_ID }} \
+            job status \
+            --wait \
+            --wait-delay-seconds 60 \
+            --wait-timeout-minutes 90 \
+            --submission-metadata ${{ runner.temp }}/submission-metadata
+
+      - name: Handle cancellation if necessary
+        if: ${{ cancelled() }}
+        env:
+          RUST_LOG: info
+        run: |
+          chmod +x ${{ runner.temp }}/bin/smp
+          ${{ runner.temp }}/bin/smp --team-id ${{ secrets.SINGLE_MACHINE_PERFORMANCE_TEAM_ID }} job cancel \
+            --submission-metadata ${{ runner.temp }}/submission-metadata
+
+      - name: Check status, cancelled
+        if: ${{ cancelled() }}
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: |
+          gh api \
+            --method POST \
+            -H "Accept: application/vnd.github+json" \
+            /repos/${{ github.repository }}/statuses/${{ needs.compute-metadata.outputs.target-sha }} \
+            -f state='failure' \
+            -f description='Experiments submitted to the Regression Detection cluster cancelled.' \
+            -f context='Workload Checks / submission' \
+            -f target_url=${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+
+      - name: Check status, success
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: |
+          gh api \
+            --method POST \
+            -H "Accept: application/vnd.github+json" \
+            /repos/${{ github.repository }}/statuses/${{ needs.compute-metadata.outputs.target-sha }} \
+            -f state='success' \
+            -f description='Experiments submitted to the Regression Detection cluster successfully.' \
+            -f context='Workload Checks / submission' \
+            -f target_url=${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+
+      - name: Check status, failure
+        if: ${{ failure() }}
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: |
+          gh api \
+            --method POST \
+            -H "Accept: application/vnd.github+json" \
+            /repos/${{ github.repository }}/statuses/${{ needs.compute-metadata.outputs.target-sha }} \
+            -f state='failure' \
+            -f description='Experiments submitted to the Regression Detection Suite failed.' \
+            -f context='Workload Checks / submission' \
+            -f target_url=${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
diff --git a/workload-checks/README.md b/workload-checks/README.md
new file mode 100644
index 00000000000000..eee2c80bec72da
--- /dev/null
+++ b/workload-checks/README.md
@@ -0,0 +1,9 @@
+# Workload Checks
+
+The `smp` tool performs a nightly run of 'checks' to determine if Vector is fit for purpose.
+The 'checks' can help us answer questions about CPU usage, memory consumption, throughput etc.
+By consistently running these checks we establish a historical dataset [here](https://app.datadoghq.com/dashboard/wj9-9ds-q49?refresh_mode=sliding&from_ts=1694089061369&to_ts=1694693861369&live=true).
+
+## Adding an Experiment
+
+You can read more about the workload requirements [here](https://github.com/DataDog/datadog-agent/blob/main/test/workload-checks/README.md).
diff --git a/workload-checks/typical/cases/http_text_to_http_json/README.md b/workload-checks/typical/cases/http_text_to_http_json/README.md
new file mode 100644
index 00000000000000..ec257135c5cf6c
--- /dev/null
+++ b/workload-checks/typical/cases/http_text_to_http_json/README.md
@@ -0,0 +1,5 @@
+# HTTP Text To HTTP JSON
+
+## Purpose
+
+Simulates a simple Vector use with one HTTP server source and one HTTP sink. This was added as a proof of concept for the SMP workload checks.
diff --git a/workload-checks/typical/cases/http_text_to_http_json/experiment.yaml b/workload-checks/typical/cases/http_text_to_http_json/experiment.yaml
new file mode 100644
index 00000000000000..446808a9fc6306
--- /dev/null
+++ b/workload-checks/typical/cases/http_text_to_http_json/experiment.yaml
@@ -0,0 +1,21 @@
+description: >
+  Simulates a simple Vector use with one HTTP server source and one HTTP sink.
+  This was added as a proof of concept for the SMP workload checks.
+teams: []
+
+labels: {}
+
+checks:
+  - name: memory_usage
+    description: "Memory usage"
+    bounds:
+      series: rss_bytes
+      # The machine has 12Gb free.
+      upper_bound: 3.5Gb
+
+  - name: cpu_utilization
+    description: "CPU utilization"
+    bounds:
+      series: cpu_percentage
+      # The machine has 8 cores available.
+      upper_bound: 400
diff --git a/workload-checks/typical/cases/http_text_to_http_json/lading/lading.yaml b/workload-checks/typical/cases/http_text_to_http_json/lading/lading.yaml
new file mode 100644
index 00000000000000..24b27c1e26abb2
--- /dev/null
+++ b/workload-checks/typical/cases/http_text_to_http_json/lading/lading.yaml
@@ -0,0 +1,16 @@
+generator:
+  - http:
+      seed: [2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53,
+             59, 61, 67, 71, 73, 79, 83, 89, 97, 101, 103, 107, 109, 113, 127, 131]
+      headers: {}
+      target_uri: "http://localhost:8282/"
+      bytes_per_second: "500 Mb"
+      parallel_connections: 10
+      method:
+        post:
+          maximum_prebuild_cache_size_bytes: "256 Mb"
+          variant: "apache_common"
+
+blackhole:
+  - http:
+      binding_addr: "0.0.0.0:8080"
diff --git a/workload-checks/typical/cases/http_text_to_http_json/vector/vector.toml b/workload-checks/typical/cases/http_text_to_http_json/vector/vector.toml
new file mode 100644
index 00000000000000..04510055960362
--- /dev/null
+++ b/workload-checks/typical/cases/http_text_to_http_json/vector/vector.toml
@@ -0,0 +1,12 @@
+data_dir = "/var/lib/vector"
+
+[sources.logs]
+type = "http_server"
+address = "0.0.0.0:8282"
+decoding.codec = "bytes"
+
+[sinks.http_sink]
+type = "http"
+uri = "http://localhost:8080"
+inputs = ["logs"]
+encoding.codec = "json"
diff --git a/workload-checks/typical/machine.yaml b/workload-checks/typical/machine.yaml
new file mode 100644
index 00000000000000..f16dab521f2fbb
--- /dev/null
+++ b/workload-checks/typical/machine.yaml
@@ -0,0 +1,7 @@
+description: >
+  An ‘average’ customer server on which Vector runs alongside user
+  software. This is equivalent to an AWS c5.2xlarge with 4Gb of system memory
+  held back for system processes.
+name: typical
+cpu: 8
+memory: 12Gb