# Skip to content
#
# Validation with Model Server #205
#
# Validation with Model Server
#
# Validation with Model Server #205

# Workflow: validation run with the model server.
name: Validation with Model Server

# Triggers: manual dispatch from the Actions UI, plus a scheduled run.
on:
  workflow_dispatch:
  schedule:
    # Daily at 23:00 (GitHub evaluates cron schedules in UTC).
    - cron: '0 23 * * *'

# Scopes granted to GITHUB_TOKEN for the jobs in this workflow.
# contents: write is needed because the Install job pushes a commit.
permissions:
  pull-requests: write
  contents: write
  repository-projects: write
  packages: write
jobs:
  # Calls the reusable create_equinix_runner workflow
  # (presumably provisions the self-hosted runner used by Install - confirm).
  Create-runner:
    name: "Create Runner"
    uses: ./.github/workflows/create_equinix_runner.yml
    secrets: inherit

  # Runs the Ansible playbooks on the self-hosted runner, executes the
  # validation, commits the report under docs/ and uploads it as an artifact.
  Install:
    name: "Install"
    needs: Create-runner
    runs-on: self-hosted
    outputs:
      # Consumed by the Cleanup job as its runner_name input.
      runner-name: ${{ runner.name }}
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
      - name: Run Setup Runner Action
        uses: ./.github/actions/setup-action
      # NOTE(review): second checkout duplicates the first step; kept as-is
      # in case the setup action alters the workspace - confirm and drop if not.
      - name: Checkout code
        uses: actions/checkout@v4
      - name: Run playbook
        # The id is referenced below as steps.run_playbook.outputs.date.
        id: run_playbook
        run: |
          cd ${GITHUB_WORKSPACE}/ansible
          echo "Create VM"
          ansible-playbook -i inventory.yml kvm_playbook.yml
          echo "Install SSH tunnel"
          ansible-playbook ssh_tunnel_playbook.yml
          echo "Install Prometheus"
          ansible-playbook -i inventory.yml metrics_playbook.yml
          echo "Install Node Exporter"
          ansible-playbook -i inventory.yml node_exporter_playbook.yml -vvv
          echo "Verify node-exporter"
          sudo systemctl status node_exporter || true
          sudo ss -tuln | grep 9100 || true
          curl -s localhost:9100/metrics | grep collector || true
          echo "Install Kepler"
          ansible-playbook -i inventory.yml -vvv kepler_playbook.yml -e "target_host=localhost"
          echo "Install Model Server"
          ansible-playbook -i inventory.yml -vvv model_server_playbook.yml
          echo "Create ssh tunnel"
          ansible-playbook -i inventory.yml ssh_tunnel_playbook.yml
          echo "Run validation test"
          ansible-playbook -vvv kepler_validator.yml
          echo "Checkout the report"
          ls /tmp
          cat /tmp/report-*.md || true
          # create a directory to store the artifacts, the directory is the current date
          set -x
          export DATE_STR=$(date +%Y-%m-%d)
          export TIME_STR=$(date +%H-%M-%S)
          cd ${GITHUB_WORKSPACE}
          mkdir -p docs/validation/${DATE_STR}
          export KEPLER_TAG=$(ls -d /tmp/validator-* |tail -1 | sed 's/.*validator-//g')
          # python -m pip install --upgrade pip
          # pip install tabulate
          # python util/generate_daily_validations.py \
          # --report-md-filepath docs/daily-validations/daily-validations-model-server/daily-report.md \
          # --report-json-filepath docs/daily-validations/daily-validations-model-server/daily-report.json \
          # --new-val-filepath /tmp/validator-${KEPLER_TAG}/${KEPLER_TAG}.json
          # copy the report to the directory
          mv /tmp/validator-${KEPLER_TAG}/ docs/validation/${DATE_STR}/validator-${KEPLER_TAG}-${TIME_STR}-model-server
          echo "| " ${DATE_STR} " | " ${KEPLER_TAG}-${TIME_STR}-model-server " | [Report](validation/${DATE_STR}/validator-${KEPLER_TAG}-${TIME_STR}-model-server/report-${KEPLER_TAG}.md) | [Artifacts](validation/${DATE_STR}/validator-${KEPLER_TAG}-${TIME_STR}-model-server/artifacts) |" \
            >> docs/kepler-model-validation.md
          # Capture Prometheus TSDB Snapshot
          mkdir -p docs/validation/${DATE_STR}/validator-${KEPLER_TAG}-${TIME_STR}-model-server/prometheus-snapshot
          snap_name=$(curl -XPOST http://localhost:9090/api/v1/admin/tsdb/snapshot | jq -r '.data.name')
          tar -cvzf docs/validation/${DATE_STR}/validator-${KEPLER_TAG}-${TIME_STR}-model-server/prometheus-snapshot/snapshot.tar.gz -C \
            /var/lib/prometheus/snapshots ${snap_name}
          # Push to the repo
          git config user.email "dependabot[bot]@users.noreply.github.com"
          git config user.name "dependabot[bot]"
          git add docs/*
          git commit -m "Add model server validation report for ${DATE_STR}-${TIME_STR}" -s
          git push
          # Publish the date as a step output; the upload step reads it.
          echo "date=${DATE_STR}" >> "$GITHUB_OUTPUT"
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          TOTAL_RUNTIME_SECONDS: '1200'
          # NOTE(review): "VALIDATOTR" looks like a typo for "VALIDATOR"; left
          # unchanged because the consumer of this variable is not visible
          # here - confirm against the playbooks before renaming.
          VALIDATOTR_CURVE_TYPE: "default"
          MODEL_SERVER_IMAGE: 'quay.io/sustainable_computing_io/kepler_model_server:latest'
      # Diagnostics only: runs when an earlier step failed; most commands are
      # guarded with `|| true` so a missing service does not mask the failure.
      - name: error_handle_for_estimator
        if: ${{ failure() }}
        run: |
          systemctl status container-estimator || true
          systemctl list-units --type=service
          sudo podman ps -a || true
          sudo podman logs estimator || true
      - name: upload to artifacts
        uses: actions/upload-artifact@v4
        with:
          name: equinix_metal_model_server
          # Step outputs are addressed as steps.<id>.outputs.<name>; the
          # previous expression (steps.run_workload.run_playbook.date) named a
          # non-existent step and evaluated to an empty string.
          path: docs/validation/${{ steps.run_playbook.outputs.date }}
          retention-days: 30

  # Calls the reusable clean_equinix_runner workflow with the runner name
  # reported by Install; always() makes it run even when Install fails.
  Cleanup:
    name: "Cleanup"
    needs: [Install]
    if: ${{ always() }}
    uses: ./.github/workflows/clean_equinix_runner.yml
    secrets: inherit
    with:
      runner_name: ${{ needs.Install.outputs.runner-name }}