# Validation with Model Server #205
# This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
name: Validation with Model Server

# Triggers: manual dispatch, plus a cron schedule firing at minute 0 of
# hour 23 every day (GitHub evaluates schedule expressions in UTC).
on:
  workflow_dispatch:
  schedule:
    - cron: '0 23 * * *'

# Token scopes granted to the jobs in this workflow. `contents: write` is what
# allows the `git push` performed by the Install job.
# NOTE(review): the other write scopes may be required by the reusable
# workflows called below — confirm before narrowing.
permissions:
  pull-requests: write
  contents: write
  repository-projects: write
  packages: write
jobs:
  # Calls the reusable create_equinix_runner.yml workflow
  # (presumably provisions the self-hosted runner used by Install — confirm).
  Create-runner:
    name: "Create Runner"
    uses: ./.github/workflows/create_equinix_runner.yml
    secrets: inherit

  # Runs the Ansible playbooks on the self-hosted runner, stores the validation
  # report under docs/validation/<date>/, pushes it to the repository and
  # uploads the same directory as a build artifact.
  Install:
    name: "Install"
    needs: Create-runner
    runs-on: self-hosted
    outputs:
      # Exposed so the Cleanup job can pass the runner's name on.
      runner-name: ${{ runner.name }}
    steps:
      # The repository must be checked out before a local action
      # (./.github/actions/...) can be resolved.
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Run Setup Runner Action
        uses: ./.github/actions/setup-action

      - name: Checkout code
        uses: actions/checkout@v4

      - name: Run playbook
        id: run_playbook
        run: |
          cd ${GITHUB_WORKSPACE}/ansible
          echo "Create VM"
          ansible-playbook -i inventory.yml kvm_playbook.yml
          echo "Install SSH tunnel"
          ansible-playbook ssh_tunnel_playbook.yml
          echo "Install Prometheus"
          ansible-playbook -i inventory.yml metrics_playbook.yml
          echo "Install Node Exporter"
          ansible-playbook -i inventory.yml node_exporter_playbook.yml -vvv
          echo "Verify node-exporter"
          sudo systemctl status node_exporter || true
          sudo ss -tuln | grep 9100 || true
          curl -s localhost:9100/metrics | grep collector || true
          echo "Install Kepler"
          ansible-playbook -i inventory.yml -vvv kepler_playbook.yml -e "target_host=localhost"
          echo "Install Model Server"
          ansible-playbook -i inventory.yml -vvv model_server_playbook.yml
          echo "Create ssh tunnel"
          ansible-playbook -i inventory.yml ssh_tunnel_playbook.yml
          echo "Run validation test"
          ansible-playbook -vvv kepler_validator.yml
          echo "Checkout the report"
          ls /tmp
          cat /tmp/report-*.md || true
          # create a directory to store the artifacts, the directory is the current date
          set -x
          export DATE_STR=$(date +%Y-%m-%d)
          export TIME_STR=$(date +%H-%M-%S)
          cd ${GITHUB_WORKSPACE}
          mkdir -p docs/validation/${DATE_STR}
          export KEPLER_TAG=$(ls -d /tmp/validator-* |tail -1 | sed 's/.*validator-//g')
          # python -m pip install --upgrade pip
          # pip install tabulate
          # python util/generate_daily_validations.py \
          # --report-md-filepath docs/daily-validations/daily-validations-model-server/daily-report.md \
          # --report-json-filepath docs/daily-validations/daily-validations-model-server/daily-report.json \
          # --new-val-filepath /tmp/validator-${KEPLER_TAG}/${KEPLER_TAG}.json
          # copy the report to the directory
          mv /tmp/validator-${KEPLER_TAG}/ docs/validation/${DATE_STR}/validator-${KEPLER_TAG}-${TIME_STR}-model-server
          echo "| " ${DATE_STR} " | " ${KEPLER_TAG}-${TIME_STR}-model-server " | [Report](validation/${DATE_STR}/validator-${KEPLER_TAG}-${TIME_STR}-model-server/report-${KEPLER_TAG}.md) | [Artifacts](validation/${DATE_STR}/validator-${KEPLER_TAG}-${TIME_STR}-model-server/artifacts) |" \
            >> docs/kepler-model-validation.md
          # Capture Prometheus TSDB Snapshot
          mkdir -p docs/validation/${DATE_STR}/validator-${KEPLER_TAG}-${TIME_STR}-model-server/prometheus-snapshot
          snap_name=$(curl -XPOST http://localhost:9090/api/v1/admin/tsdb/snapshot | jq -r '.data.name')
          tar -cvzf docs/validation/${DATE_STR}/validator-${KEPLER_TAG}-${TIME_STR}-model-server/prometheus-snapshot/snapshot.tar.gz -C \
            /var/lib/prometheus/snapshots ${snap_name}
          # Push to the repo
          git config user.email "dependabot[bot]@users.noreply.github.com"
          git config user.name "dependabot[bot]"
          git add docs/*
          git commit -m "Add model server validation report for ${DATE_STR}-${TIME_STR}" -s
          git push
          echo "date=${DATE_STR}" >> $GITHUB_OUTPUT
        env:
          GITHUB_TOKEN: ${{secrets.GITHUB_TOKEN}}
          TOTAL_RUNTIME_SECONDS: 1200
          # NOTE(review): "VALIDATOTR" looks like a misspelling of "VALIDATOR".
          # Left unchanged because the consumer of this variable is not
          # visible in this file — confirm before renaming.
          VALIDATOTR_CURVE_TYPE: "default"
          MODEL_SERVER_IMAGE: 'quay.io/sustainable_computing_io/kepler_model_server:latest'

      # Diagnostics only: runs when an earlier step failed; most commands are
      # suffixed with `|| true` so they do not add failures of their own.
      - name: error_handle_for_estimator
        if: ${{ failure() }}
        run: |
          systemctl status container-estimator || true
          systemctl list-units --type=service
          sudo podman ps -a || true
          sudo podman logs estimator || true

      - name: upload to artifacts
        uses: actions/upload-artifact@v4
        with:
          name: equinix_metal_model_server
          # `date` is the output the run_playbook step writes to $GITHUB_OUTPUT;
          # step outputs are addressed as steps.<step id>.outputs.<name>.
          path: docs/validation/${{ steps.run_playbook.outputs.date }}
          retention-days: 30

  # Always runs after Install (even when it fails) and hands the runner name
  # to the reusable clean_equinix_runner.yml workflow.
  Cleanup:
    name: "Cleanup"
    needs: [Install]
    if: ${{ always() }}
    uses: ./.github/workflows/clean_equinix_runner.yml
    secrets: inherit
    with:
      runner_name: ${{ needs.Install.outputs.runner-name }}