Skip to content

Commit

Permalink
WIP
Browse files Browse the repository at this point in the history
  • Loading branch information
pront committed Sep 14, 2023
1 parent 9e8407e commit ba9239d
Show file tree
Hide file tree
Showing 7 changed files with 422 additions and 0 deletions.
352 changes: 352 additions & 0 deletions .github/workflows/workload_checks.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,352 @@
# Workload Checks
#
# Runs Vector Workload Checks.
#
# Runs on:
# - scheduled daily UTC midnight
# - manual dispatch (workflow_dispatch)
# - invocation from another workflow (workflow_call)

# This workflow runs the collection of our workload checks, using the repo HEAD SHA,
# which depends on when the workflow is invoked.
#
# The goal is to establish a baseline of check results for a variety of cases
# and visualize trends for important Vector use cases.
#
# The HEAD SHA is also used to tag the Vector Docker image.

# Display name of the workflow.
name: Workload Checks

# Triggers: callable from other workflows, runnable by hand, and on a cron.
on:
  workflow_call:
  workflow_dispatch:
  schedule:
    # Daily at 00:00 (GitHub evaluates cron expressions in UTC).
    - cron: "0 0 * * *"

# Workflow-level environment, visible to every job and step below.
env:
  SINGLE_MACHINE_PERFORMANCE_API: ${{ secrets.SINGLE_MACHINE_PERFORMANCE_API }}

jobs:
# Resolves the commit under test and publishes the experiment and system
# parameters as job outputs for the jobs that list this one in `needs`.
# This job is the root of the graph: it has no `needs` of its own because no
# upstream gate job is defined in this workflow.
compute-metadata:
  name: Compute metadata
  runs-on: ubuntu-22.04
  outputs:
    # Read from the `git-metadata` step. The step id used here has to match
    # the `id:` declared on that step, otherwise the output is empty.
    target-sha: ${{ steps.git-metadata.outputs.TARGET_SHA }}

    # below are used in the experiment/analyze jobs
    cpus: ${{ steps.system.outputs.CPUS }}
    memory: ${{ steps.system.outputs.MEMORY }}
    vector-cpus: ${{ steps.system.outputs.VECTOR_CPUS }}

    replicas: ${{ steps.experimental-meta.outputs.REPLICAS }}
    warmup-seconds: ${{ steps.experimental-meta.outputs.WARMUP_SECONDS }}
    total-samples: ${{ steps.experimental-meta.outputs.TOTAL_SAMPLES }}
    p-value: ${{ steps.experimental-meta.outputs.P_VALUE }}
    smp-version: ${{ steps.experimental-meta.outputs.SMP_CRATE_VERSION }}
    lading-version: ${{ steps.experimental-meta.outputs.LADING_VERSION }}

  steps:
    - uses: actions/checkout@v3
      with:
        # Enough history for `git merge-base` below to find a common ancestor.
        fetch-depth: 1000

    - name: Get git metadata
      id: git-metadata
      env:
        GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
      run: |
        # `|| true` keeps a failing merge-base from aborting the step before
        # the explicit, more readable error below is printed.
        TARGET_SHA="$(git merge-base master HEAD || true)"
        if [ -z "${TARGET_SHA}" ]; then
          echo "TARGET_SHA not found, exiting."
          exit 1
        fi
        echo "target sha is: ${TARGET_SHA}"
        # Only publish the output once it is known to be non-empty.
        echo "TARGET_SHA=${TARGET_SHA}" >> "${GITHUB_OUTPUT}"

    - name: Setup experimental metadata
      id: experimental-meta
      run: |
        export WARMUP_SECONDS="45"
        export REPLICAS="10"
        export TOTAL_SAMPLES="600"
        export P_VALUE="0.1"
        export SMP_CRATE_VERSION="0.10.0"
        export LADING_VERSION="0.18.0"

        echo "warmup seconds: ${WARMUP_SECONDS}"
        echo "replicas: ${REPLICAS}"
        echo "total samples: ${TOTAL_SAMPLES}"
        echo "regression p-value: ${P_VALUE}"
        echo "smp crate version: ${SMP_CRATE_VERSION}"
        echo "lading version: ${LADING_VERSION}"

        echo "WARMUP_SECONDS=${WARMUP_SECONDS}" >> "${GITHUB_OUTPUT}"
        echo "REPLICAS=${REPLICAS}" >> "${GITHUB_OUTPUT}"
        echo "TOTAL_SAMPLES=${TOTAL_SAMPLES}" >> "${GITHUB_OUTPUT}"
        echo "P_VALUE=${P_VALUE}" >> "${GITHUB_OUTPUT}"
        echo "SMP_CRATE_VERSION=${SMP_CRATE_VERSION}" >> "${GITHUB_OUTPUT}"
        echo "LADING_VERSION=${LADING_VERSION}" >> "${GITHUB_OUTPUT}"

    - name: Setup system details
      id: system
      run: |
        export CPUS="7"
        export MEMORY="30g"
        export VECTOR_CPUS="4"

        echo "cpus total: ${CPUS}"
        echo "memory total: ${MEMORY}"
        echo "vector cpus: ${VECTOR_CPUS}"

        echo "CPUS=${CPUS}" >> "${GITHUB_OUTPUT}"
        echo "MEMORY=${MEMORY}" >> "${GITHUB_OUTPUT}"
        echo "VECTOR_CPUS=${VECTOR_CPUS}" >> "${GITHUB_OUTPUT}"
##
## BUILD
##

# Builds the Vector image at the target SHA and passes it to later jobs as a
# tarball artifact named `target-image`.
build-target:
  name: Build target Vector container
  runs-on: [linux, ubuntu-20.04-4core]
  needs:
    - compute-metadata
  steps:
    - uses: colpal/actions-clean@v1

    # The repository is checked out into `target-vector/`, not the workspace
    # root, so every repository path below carries that prefix.
    - uses: actions/checkout@v3
      with:
        ref: ${{ needs.compute-metadata.outputs.target-sha }}
        path: target-vector

    # NOTE(review): the exact pinned versions of the two Docker actions were
    # lost from the scraped source; major-version tags are used here -- restore
    # the project's exact pins before merging.
    - name: Set up Docker Buildx
      id: buildx
      uses: docker/setup-buildx-action@v2

    - name: Build 'vector' target image
      uses: docker/build-push-action@v4
      with:
        context: target-vector/
        cache-from: type=gha
        cache-to: type=gha,mode=max
        # `file` is resolved from the working directory, not from `context`,
        # so it must point inside the `target-vector/` checkout.
        file: target-vector/regression/Dockerfile
        builder: ${{ steps.buildx.outputs.name }}
        # Export to a tarball instead of pushing; a later job loads and pushes it.
        outputs: type=docker,dest=${{ runner.temp }}/target-image.tar
        tags: |
          vector:${{ needs.compute-metadata.outputs.target-sha }}

    - name: Upload image as artifact
      uses: actions/upload-artifact@v3
      with:
        name: target-image
        path: "${{ runner.temp }}/target-image.tar"

# Cheap early check: configures the AWS credentials and downloads the smp
# binary, so bad credentials fail here rather than in the later upload and
# submission jobs that list this job in `needs`.
confirm-valid-credentials:
  name: Confirm AWS credentials are minimally valid
  runs-on: ubuntu-22.04
  needs:
    - compute-metadata
  steps:
    # NOTE(review): the exact pinned version of this action was lost from the
    # scraped source; a major-version tag is used here -- restore the
    # project's exact pin before merging.
    - name: Configure AWS Credentials
      uses: aws-actions/configure-aws-credentials@v3
      with:
        aws-access-key-id: ${{ secrets.SINGLE_MACHINE_PERFORMANCE_BOT_ACCESS_KEY_ID }}
        aws-secret-access-key: ${{ secrets.SINGLE_MACHINE_PERFORMANCE_BOT_SECRET_ACCESS_KEY }}
        aws-region: us-west-2

    - name: Download SMP binary
      run: |
        aws s3 cp s3://smp-cli-releases/v${{ needs.compute-metadata.outputs.smp-version }}/x86_64-unknown-linux-gnu/smp ${{ runner.temp }}/bin/smp
##
## SUBMIT
##

# Loads the image tarball produced by `build-target` and pushes it to ECR
# under the `<team-id>-vector` repository, tagged with the target SHA.
upload-target-image-to-ecr:
  name: Upload target images to ECR
  runs-on: ubuntu-22.04
  needs:
    - compute-metadata
    - confirm-valid-credentials
    - build-target
  steps:
    - name: 'Download target image'
      uses: actions/download-artifact@v3
      with:
        name: target-image

    - name: Load target image
      run: |
        docker load --input target-image.tar

    # NOTE(review): the exact pinned version of this action was lost from the
    # scraped source; a major-version tag is used here -- restore the
    # project's exact pin before merging.
    - name: Configure AWS Credentials
      uses: aws-actions/configure-aws-credentials@v3
      with:
        aws-access-key-id: ${{ secrets.SINGLE_MACHINE_PERFORMANCE_BOT_ACCESS_KEY_ID }}
        aws-secret-access-key: ${{ secrets.SINGLE_MACHINE_PERFORMANCE_BOT_SECRET_ACCESS_KEY }}
        aws-region: us-west-2

    - name: Login to Amazon ECR
      id: login-ecr
      uses: aws-actions/amazon-ecr-login@v1

    - name: Docker Login to ECR
      uses: docker/login-action@v2
      with:
        registry: ${{ steps.login-ecr.outputs.registry }}

    # The local tag `vector:<sha>` comes from the tarball loaded above.
    - name: Tag & push target image
      run: |
        docker tag vector:${{ needs.compute-metadata.outputs.target-sha }} ${{ steps.login-ecr.outputs.registry }}/${{ secrets.SINGLE_MACHINE_PERFORMANCE_TEAM_ID }}-vector:${{ needs.compute-metadata.outputs.target-sha }}
        docker push ${{ steps.login-ecr.outputs.registry }}/${{ secrets.SINGLE_MACHINE_PERFORMANCE_TEAM_ID }}-vector:${{ needs.compute-metadata.outputs.target-sha }}
# Submits the workload-checks experiment to SMP, waits for it to finish, and
# mirrors the outcome onto the target commit as a GitHub commit status.
submit-job:
  name: Submit workload checks job
  runs-on: ubuntu-22.04
  needs:
    - compute-metadata
    - upload-target-image-to-ecr
  steps:
    - name: Check status, in-progress
      env:
        GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
      run: |
        gh api \
          --method POST \
          -H "Accept: application/vnd.github+json" \
          /repos/${{ github.repository }}/statuses/${{ needs.compute-metadata.outputs.target-sha }} \
          -f state='pending' \
          -f description='Experiments submitted to the Regression Detection cluster.' \
          -f context='Regression Detection Suite / submission' \
          -f target_url=${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}

    - uses: actions/checkout@v3

    # NOTE(review): the exact pinned version of this action was lost from the
    # scraped source; a major-version tag is used here -- restore the
    # project's exact pin before merging.
    - name: Configure AWS Credentials
      uses: aws-actions/configure-aws-credentials@v3
      with:
        aws-access-key-id: ${{ secrets.SINGLE_MACHINE_PERFORMANCE_BOT_ACCESS_KEY_ID }}
        aws-secret-access-key: ${{ secrets.SINGLE_MACHINE_PERFORMANCE_BOT_SECRET_ACCESS_KEY }}
        aws-region: us-west-2

    - name: Login to Amazon ECR
      id: login-ecr
      uses: aws-actions/amazon-ecr-login@v1

    - name: Download SMP binary
      run: |
        aws s3 cp s3://smp-cli-releases/v${{ needs.compute-metadata.outputs.smp-version }}/x86_64-unknown-linux-gnu/smp ${{ runner.temp }}/bin/smp

    # NOTE(review): the SSM parameter names, --target-name, --target-command,
    # the DD_* variables and the client_team tag below still carry
    # datadog-agent values, and the team id used here comes from SSM while the
    # await/cancel steps use secrets.SINGLE_MACHINE_PERFORMANCE_TEAM_ID.
    # Confirm the intended Vector values before relying on this job.
    - name: Submit job
      env:
        RUST_LOG: info
        # Experiment parameters come from compute-metadata; the script reads
        # them as ordinary environment variables.
        LADING_VERSION: ${{ needs.compute-metadata.outputs.lading-version }}
        TOTAL_SAMPLES: ${{ needs.compute-metadata.outputs.total-samples }}
        WARMUP_SECONDS: ${{ needs.compute-metadata.outputs.warmup-seconds }}
        REPLICAS: ${{ needs.compute-metadata.outputs.replicas }}
        TARGET_SHA: ${{ needs.compute-metadata.outputs.target-sha }}
        # Same repository and tag that upload-target-image-to-ecr pushed.
        TARGET_IMAGE: ${{ steps.login-ecr.outputs.registry }}/${{ secrets.SINGLE_MACHINE_PERFORMANCE_TEAM_ID }}-vector:${{ needs.compute-metadata.outputs.target-sha }}
      run: |
        git fetch origin

        # Setup AWS credentials for single-machine-performance AWS account
        AWS_NAMED_PROFILE="single-machine-performance"
        SMP_AGENT_TEAM_ID=$(aws ssm get-parameter --region us-east-1 --name ci.datadog-agent.single-machine-performance-agent-team-id --with-decryption --query "Parameter.Value" --out text)
        SMP_API=$(aws ssm get-parameter --region us-east-1 --name ci.datadog-agent.single-machine-performance-api --with-decryption --query "Parameter.Value" --out text)
        aws configure set aws_access_key_id "$(aws ssm get-parameter --region us-east-1 --name ci.datadog-agent.single-machine-performance-bot-access-key-id --with-decryption --query "Parameter.Value" --out text)" --profile "${AWS_NAMED_PROFILE}"
        aws configure set aws_secret_access_key "$(aws ssm get-parameter --region us-east-1 --name ci.datadog-agent.single-machine-performance-bot-access-key --with-decryption --query "Parameter.Value" --out text)" --profile "${AWS_NAMED_PROFILE}"
        aws configure set region us-west-2 --profile "${AWS_NAMED_PROFILE}"

        CURRENT_DATE=$(date --utc '+%Y_%m_%d')
        RUST_LOG_DEBUG="debug,aws_config::profile::credentials=error"

        # The binary was fetched by the "Download SMP binary" step above.
        chmod +x ${{ runner.temp }}/bin/smp

        # Submission metadata is written under runner.temp because the upload,
        # await and cancel steps all read it from there.
        RUST_BACKTRACE=1 RUST_LOG="${RUST_LOG_DEBUG}" ${{ runner.temp }}/bin/smp \
          --team-id "${SMP_AGENT_TEAM_ID}" --api-base "${SMP_API}" --aws-named-profile "${AWS_NAMED_PROFILE}" \
          job submit-workload \
          --lading-version "${LADING_VERSION}" \
          --total-samples "${TOTAL_SAMPLES}" \
          --warmup-seconds "${WARMUP_SECONDS}" \
          --replicas "${REPLICAS}" \
          --target-image "${TARGET_IMAGE}" \
          --target-sha "${TARGET_SHA}" \
          --target-config-dir ${{ github.workspace }}/workload-checks \
          --target-name datadog-agent \
          --target-command "/bin/entrypoint.sh" \
          --target-environment-variables "DD_HOSTNAME=smp-workload-checks,DD_DD_URL=http://127.0.0.1:9092,DD_API_KEY=00000001" \
          --tags smp_status=nightly,client_team="agent",tag_date="${CURRENT_DATE}" \
          --submission-metadata ${{ runner.temp }}/submission-metadata

    - uses: actions/upload-artifact@v3
      with:
        name: vector-submission-metadata
        path: ${{ runner.temp }}/submission-metadata

    - name: Await job
      # Step timeout is longer than smp's own 90-minute wait timeout below.
      timeout-minutes: 120
      env:
        RUST_LOG: info
      run: |
        chmod +x ${{ runner.temp }}/bin/smp
        ${{ runner.temp }}/bin/smp --team-id ${{ secrets.SINGLE_MACHINE_PERFORMANCE_TEAM_ID }} \
          job status \
          --wait \
          --wait-delay-seconds 60 \
          --wait-timeout-minutes 90 \
          --submission-metadata ${{ runner.temp }}/submission-metadata

    - name: Handle cancellation if necessary
      if: ${{ cancelled() }}
      env:
        RUST_LOG: info
      run: |
        chmod +x ${{ runner.temp }}/bin/smp
        ${{ runner.temp }}/bin/smp --team-id ${{ secrets.SINGLE_MACHINE_PERFORMANCE_TEAM_ID }} job cancel \
          --submission-metadata ${{ runner.temp }}/submission-metadata

    - name: Check status, cancelled
      if: ${{ cancelled() }}
      env:
        GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
      run: |
        gh api \
          --method POST \
          -H "Accept: application/vnd.github+json" \
          /repos/${{ github.repository }}/statuses/${{ needs.compute-metadata.outputs.target-sha }} \
          -f state='failure' \
          -f description='Experiments submitted to the Regression Detection cluster cancelled.' \
          -f context='Regression Detection Suite / submission' \
          -f target_url=${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}

    # No `if:` here: a step without one only runs when all earlier steps
    # succeeded.
    - name: Check status, success
      env:
        GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
      run: |
        gh api \
          --method POST \
          -H "Accept: application/vnd.github+json" \
          /repos/${{ github.repository }}/statuses/${{ needs.compute-metadata.outputs.target-sha }} \
          -f state='success' \
          -f description='Experiments submitted to the Regression Detection cluster successfully.' \
          -f context='Regression Detection Suite / submission' \
          -f target_url=${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}

    - name: Check status, failure
      if: ${{ failure() }}
      env:
        GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
      run: |
        gh api \
          --method POST \
          -H "Accept: application/vnd.github+json" \
          /repos/${{ github.repository }}/statuses/${{ needs.compute-metadata.outputs.target-sha }} \
          -f state='failure' \
          -f description='Experiments submitted to the Regression Detection Suite failed.' \
          -f context='Regression Detection Suite / submission' \
          -f target_url=${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
9 changes: 9 additions & 0 deletions workload-checks/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Workload Checks

The `smp` tool performs a nightly run of 'checks' to determine if Vector is fit for purpose.
The 'checks' can help us answer questions about CPU usage, memory consumption, throughput, etc.
By consistently running these checks we establish a historical dataset [here](https://app.datadoghq.com/dashboard/wj9-9ds-q49?refresh_mode=sliding&from_ts=1694089061369&to_ts=1694693861369&live=true).

## Adding an Experiment

You can read more about the workload requirements [here](https://github.com/DataDog/datadog-agent/blob/main/test/workload-checks/README.md).
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# HTTP Text To HTTP JSON

## Purpose

Simulates a simple Vector use case with one HTTP server source and one HTTP sink. This was added as a proof of concept for the SMP workload checks.
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Experiment definition: free-text description, ownership metadata, and the
# list of bounded checks evaluated for this experiment.
description: >
  Simulates a simple Vector use with one HTTP server source and one HTTP sink.
  This was added as a proof of concept for the SMP workload checks.

teams: []

labels: {}

checks:
  - name: memory_usage
    description: 'Memory usage'
    bounds:
      series: rss_bytes
      # The machine has 12Gb free.
      upper_bound: 3.5Gb

  - name: cpu_utilization
    description: 'CPU utilization'
    bounds:
      series: cpu_percentage
      # The machine has 8 cores available.
      upper_bound: 400
Loading

0 comments on commit ba9239d

Please sign in to comment.