diff --git a/.github/actions/spelling/allow.txt b/.github/actions/spelling/allow.txt
index 360f67b173aed..19ed0a26e5543 100644
--- a/.github/actions/spelling/allow.txt
+++ b/.github/actions/spelling/allow.txt
@@ -45,9 +45,10 @@ Comcast
 Consolas
 Coolpad
 DEBHELPER
-DOOV
 Danew
+dkr
 Dockerfiles
+DOOV
 Douban
 Enot
 Evercoss
diff --git a/.github/workflows/workload_checks.yml b/.github/workflows/workload_checks.yml
new file mode 100644
index 0000000000000..3610c78bdfa15
--- /dev/null
+++ b/.github/workflows/workload_checks.yml
@@ -0,0 +1,209 @@
+# Workload Checks Suite
+#
+# Runs Vector Workload Checks.
+#
+# Runs on:
+#  - scheduled UTC midnight Tues-Sat
+#  - on demand, via workflow_dispatch or workflow_call
+
+# This workflow runs the collection of our workload checks, using the latest Vector nightly image,
+# which depends on when the workflow is invoked.
+#
+# The goal is to establish a baseline of check results for a variety of cases
+# and visualize trends for important Vector use cases.
+
+name: Workload Checks Suite
+
+on:
+  workflow_call:
+  workflow_dispatch:
+  schedule:
+    # At midnight UTC Tue-Sat
+    - cron: '0 0 * * 2-6'
+env:
+  SINGLE_MACHINE_PERFORMANCE_API: ${{ secrets.SINGLE_MACHINE_PERFORMANCE_API }}
+
+jobs:
+  # Resolves the commit SHA under test and the experiment parameters consumed by submit-job.
+  compute-metadata:
+    name: Compute metadata
+    runs-on: ubuntu-latest
+    outputs:
+      replicas: ${{ steps.experimental-meta.outputs.REPLICAS }}
+      warmup-seconds: ${{ steps.experimental-meta.outputs.WARMUP_SECONDS }}
+      total-samples: ${{ steps.experimental-meta.outputs.TOTAL_SAMPLES }}
+      smp-version: ${{ steps.experimental-meta.outputs.SMP_CRATE_VERSION }}
+      lading-version: ${{ steps.experimental-meta.outputs.LADING_VERSION }}
+
+      target-sha: ${{ steps.git-metadata.outputs.TARGET_SHA }}
+
+    steps:
+      - uses: actions/checkout@v3
+
+      - name: Get git metadata
+        id: git-metadata
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: |
+          export TARGET_SHA=$(git rev-parse HEAD)
+          echo "TARGET_SHA=${TARGET_SHA}" >> $GITHUB_OUTPUT
+
+          echo "target sha is: ${TARGET_SHA}"
+
+          if [ "${TARGET_SHA}" = "" ] ; then
+            echo "TARGET_SHA not found, exiting."
+            exit 1
+          fi
+
+      - name: Setup experimental metadata
+        id: experimental-meta
+        run: |
+          export WARMUP_SECONDS="45"
+          export REPLICAS="10"
+          export TOTAL_SAMPLES="600"
+          export SMP_CRATE_VERSION="0.10.0"
+          export LADING_VERSION="0.18.0"
+
+          echo "warmup seconds: ${WARMUP_SECONDS}"
+          echo "replicas: ${REPLICAS}"
+          echo "total samples: ${TOTAL_SAMPLES}"
+          echo "smp crate version: ${SMP_CRATE_VERSION}"
+          echo "lading version: ${LADING_VERSION}"
+
+          echo "WARMUP_SECONDS=${WARMUP_SECONDS}" >> $GITHUB_OUTPUT
+          echo "REPLICAS=${REPLICAS}" >> $GITHUB_OUTPUT
+          echo "TOTAL_SAMPLES=${TOTAL_SAMPLES}" >> $GITHUB_OUTPUT
+          echo "SMP_CRATE_VERSION=${SMP_CRATE_VERSION}" >> $GITHUB_OUTPUT
+          echo "LADING_VERSION=${LADING_VERSION}" >> $GITHUB_OUTPUT
+
+  # Submits the experiment with the smp CLI, waits for the result and reports progress as a commit status.
+  submit-job:
+    name: Submit workload checks job
+    runs-on: ubuntu-latest
+    needs:
+      - compute-metadata
+    steps:
+      - name: Check status, in-progress
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: |
+          gh api \
+            --method POST \
+            -H "Accept: application/vnd.github+json" \
+            /repos/${{ github.repository }}/statuses/${{ needs.compute-metadata.outputs.target-sha }} \
+            -f state='pending' \
+            -f description='Experiments submitted to the Workload Checks cluster.' \
+            -f context='Workload Checks Suite / submission' \
+            -f target_url=${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+
+      - uses: actions/checkout@v3
+
+      - name: Configure AWS Credentials
+        uses: aws-actions/configure-aws-credentials@v4.0.0
+        with:
+          aws-access-key-id: ${{ secrets.SINGLE_MACHINE_PERFORMANCE_BOT_ACCESS_KEY_ID }}
+          aws-secret-access-key: ${{ secrets.SINGLE_MACHINE_PERFORMANCE_BOT_SECRET_ACCESS_KEY }}
+          aws-region: us-west-2
+
+      - name: Login to Amazon ECR
+        id: login-ecr
+        uses: aws-actions/amazon-ecr-login@v1
+
+      - name: Download SMP binary
+        run: |
+          aws s3 cp s3://smp-cli-releases/v${{ needs.compute-metadata.outputs.smp-version }}/x86_64-unknown-linux-gnu/smp ${{ runner.temp }}/bin/smp
+
+      - name: Submit job
+        env:
+          RUST_LOG: info
+        run: |
+          CURRENT_DATE=$(date --utc '+%Y_%m_%d')
+          RUST_LOG_DEBUG="debug,aws_config::profile::credentials=error"
+
+          chmod +x ${{ runner.temp }}/bin/smp
+          RUST_BACKTRACE=1 RUST_LOG="${RUST_LOG_DEBUG}" ${{ runner.temp }}/bin/smp \
+            --team-id ${{ secrets.SINGLE_MACHINE_PERFORMANCE_TEAM_ID }} \
+            job submit-workload \
+            --lading-version ${{ needs.compute-metadata.outputs.lading-version }} \
+            --total-samples ${{ needs.compute-metadata.outputs.total-samples }} \
+            --warmup-seconds ${{ needs.compute-metadata.outputs.warmup-seconds }} \
+            --replicas ${{ needs.compute-metadata.outputs.replicas }} \
+            --target-image timberio/vector:nightly-debian \
+            --target-sha ${{ needs.compute-metadata.outputs.target-sha }} \
+            --target-config-dir ${{ github.workspace }}/workload-checks \
+            --target-name vector \
+            --target-command "/usr/bin/vector" \
+            --target-environment-variables "DD_HOSTNAME=smp-workload-checks,DD_DD_URL=http://127.0.0.1:9092,DD_API_KEY=00000001" \
+            --tags smp_status=nightly,client_team="vector",tag_date="${CURRENT_DATE}" \
+            --submission-metadata ${{ runner.temp }}/submission-metadata
+
+      - uses: actions/upload-artifact@v3
+        with:
+          name: vector-submission-metadata
+          path: ${{ runner.temp }}/submission-metadata
+
+      - name: Await job
+        timeout-minutes: 120
+        env:
+          RUST_LOG: info
+        run: |
+          chmod +x ${{ runner.temp }}/bin/smp
+
+          ${{ runner.temp }}/bin/smp --team-id ${{ secrets.SINGLE_MACHINE_PERFORMANCE_TEAM_ID }} \
+            job status \
+            --wait \
+            --wait-delay-seconds 60 \
+            --wait-timeout-minutes 90 \
+            --submission-metadata ${{ runner.temp }}/submission-metadata
+
+      - name: Handle cancellation if necessary
+        if: ${{ cancelled() }}
+        env:
+          RUST_LOG: info
+        run: |
+          chmod +x ${{ runner.temp }}/bin/smp
+          ${{ runner.temp }}/bin/smp --team-id ${{ secrets.SINGLE_MACHINE_PERFORMANCE_TEAM_ID }} job cancel \
+            --submission-metadata ${{ runner.temp }}/submission-metadata
+
+      - name: Check status, cancelled
+        if: ${{ cancelled() }}
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: |
+          gh api \
+            --method POST \
+            -H "Accept: application/vnd.github+json" \
+            /repos/${{ github.repository }}/statuses/${{ needs.compute-metadata.outputs.target-sha }} \
+            -f state='failure' \
+            -f description='Experiments submitted to the Workload Checks cluster cancelled.' \
+            -f context='Workload Checks Suite / submission' \
+            -f target_url=${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+
+      # No `if:` needed: a step without a status-check function only runs while the job is succeeding.
+      - name: Check status, success
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: |
+          gh api \
+            --method POST \
+            -H "Accept: application/vnd.github+json" \
+            /repos/${{ github.repository }}/statuses/${{ needs.compute-metadata.outputs.target-sha }} \
+            -f state='success' \
+            -f description='Experiments submitted to the Workload Checks cluster successfully.' \
+            -f context='Workload Checks Suite / submission' \
+            -f target_url=${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+
+      # Runs only after an earlier step failed, so the commit status it reports must be 'failure'.
+      - name: Check status, failure
+        if: ${{ failure() }}
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: |
+          gh api \
+            --method POST \
+            -H "Accept: application/vnd.github+json" \
+            /repos/${{ github.repository }}/statuses/${{ needs.compute-metadata.outputs.target-sha }} \
+            -f state='failure' \
+            -f description='Experiments submitted to the Workload Checks Suite failed.' \
+            -f context='Workload Checks Suite / submission' \
+            -f target_url=${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
diff --git a/workload-checks/README.md b/workload-checks/README.md
new file mode 100644
index 0000000000000..70f551da8f29b
--- /dev/null
+++ b/workload-checks/README.md
@@ -0,0 +1,9 @@
+# Workload Checks
+
+The `smp` tool performs a nightly run of 'checks' to determine if Vector is fit for purpose.
+The 'checks' can help us answer questions about CPU usage, memory consumption, throughput etc.
+By consistently running these checks we establish a historical baseline that we can compare against.
+
+## Adding an Experiment
+
+You can read more about the workload requirements [here](https://datadoghq.atlassian.net/wiki/spaces/SMP/pages/3183248544/Workload+Checks+-+Getting+Started).
diff --git a/workload-checks/typical/cases/http_text_to_http_json/README.md b/workload-checks/typical/cases/http_text_to_http_json/README.md
new file mode 100644
index 0000000000000..ec257135c5cf6
--- /dev/null
+++ b/workload-checks/typical/cases/http_text_to_http_json/README.md
@@ -0,0 +1,5 @@
+# HTTP Text To HTTP JSON
+
+## Purpose
+
+Simulates a simple Vector use with one HTTP server source and one HTTP sink. This was added as a proof of concept for the SMP workload checks.
diff --git a/workload-checks/typical/cases/http_text_to_http_json/experiment.yaml b/workload-checks/typical/cases/http_text_to_http_json/experiment.yaml
new file mode 100644
index 0000000000000..aa561ba19f031
--- /dev/null
+++ b/workload-checks/typical/cases/http_text_to_http_json/experiment.yaml
@@ -0,0 +1,22 @@
+description: >
+  Simulates a simple Vector use with one HTTP server source and one HTTP sink.
+  This was added as a proof of concept for the SMP workload checks.
+teams: []
+
+labels: {}
+
+checks:
+  - name: memory_usage
+    description: "Memory usage"
+    bounds:
+      series: rss_bytes
+      # The machine has 12Gb free.
+      upper_bound: 2Gb
+
+  - name: cpu_utilization
+    description: "CPU utilization"
+    bounds:
+      series: cpu_percentage
+      # The machine has 8 cores available.
+      lower_bound: 0
+      upper_bound: 600
diff --git a/workload-checks/typical/cases/http_text_to_http_json/lading/lading.yaml b/workload-checks/typical/cases/http_text_to_http_json/lading/lading.yaml
new file mode 100644
index 0000000000000..d176da51bb336
--- /dev/null
+++ b/workload-checks/typical/cases/http_text_to_http_json/lading/lading.yaml
@@ -0,0 +1,16 @@
+generator:
+  - http:
+      seed: [2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53,
+             59, 61, 67, 71, 73, 79, 83, 89, 97, 101, 103, 107, 109, 113, 127, 131]
+      headers: {}
+      target_uri: "http://localhost:8282/"
+      bytes_per_second: "250 Mb"
+      parallel_connections: 10
+      method:
+        post:
+          maximum_prebuild_cache_size_bytes: "256 Mb"
+          variant: "apache_common"
+
+blackhole:
+  - http:
+      binding_addr: "0.0.0.0:8080"
diff --git a/workload-checks/typical/cases/http_text_to_http_json/vector/vector.toml b/workload-checks/typical/cases/http_text_to_http_json/vector/vector.toml
new file mode 100644
index 0000000000000..0451005596036
--- /dev/null
+++ b/workload-checks/typical/cases/http_text_to_http_json/vector/vector.toml
@@ -0,0 +1,12 @@
+data_dir = "/var/lib/vector"
+
+[sources.logs]
+type = "http_server"
+address = "0.0.0.0:8282"
+decoding.codec = "bytes"
+
+[sinks.http_sink]
+type = "http"
+uri = "http://localhost:8080"
+inputs = ["logs"]
+encoding.codec = "json"
diff --git a/workload-checks/typical/machine.yaml b/workload-checks/typical/machine.yaml
new file mode 100644
index 0000000000000..ad34e1a080142
--- /dev/null
+++ b/workload-checks/typical/machine.yaml
@@ -0,0 +1,7 @@
+description: >
+  An ‘average’ customer server on which vector runs alongside user
+  software. This is equivalent to an AWS c5.2xlarge with 4Gb of system memory
+  held back for system processes.
+name: typical
+cpu: 8
+memory: 12Gb