# [GSoC] Add e2e test for tune API with LLM hyperparameter optimization (#416)
# Workflow file for this run.
# This file contains bidirectional Unicode text that may be interpreted or
# compiled differently than what appears below. To review, open the file in an
# editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters.
# End-to-end test workflow for the tune API.
# Runs on every pull request except those that only touch the UI frontend,
# once per Kubernetes version listed in the matrix at the bottom of the job.
name: E2E Test with tune API

on:
  pull_request:
    paths-ignore:
      - "pkg/ui/v1beta1/frontend/**"

# Cancel an in-flight run when a newer commit arrives for the same ref.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
  e2e:
    runs-on: ubuntu-22.04
    timeout-minutes: 120
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Setup Test Env
        uses: ./.github/workflows/template-setup-e2e-test
        with:
          kubernetes-version: ${{ matrix.kubernetes-version }}

      - name: Install Katib SDK with extra requires
        shell: bash
        run: |
          pip install --prefer-binary -e 'sdk/python/v1beta1[huggingface]'

      - name: Check Disk Space Before Test
        run: |
          # --force skips the interactive confirmation prompt; without it the
          # prune commands cannot be confirmed on a non-interactive CI runner
          # and nothing is reclaimed.
          docker system prune --all --force
          docker volume prune --force
          echo "Checking disk space usage before e2e test..."
          df -h # Run 'df' to check free disk space

      - name: Run e2e test with tune API
        # NOTE(review): always() also runs the test when an earlier step
        # (e.g. environment setup) failed - confirm this is intended.
        if: always()
        uses: ./.github/workflows/template-e2e-test
        with:
          tune-api: true
          training-operator: true

      - name: Check Disk Space After Test
        if: always() # Run this step even if previous steps fail
        run: |
          echo "Checking disk space usage after e2e test..."
          df -h # Run 'df' to check free disk space

      - name: Fetch Experiment Pod Logs
        if: always() # Run this step even if previous steps fail
        run: |
          echo "Fetching all the pods in the default namespace..."
          kubectl get pods -n default
          # grep exits 1 when nothing matches, which would abort this purely
          # diagnostic step under the default "bash -e" shell; "|| true" keeps
          # it best-effort.
          POD_NAMES=$(kubectl get pods -n default --no-headers \
            -o custom-columns=":metadata.name" | grep tune-example-2 | grep master || true)
          if [ -z "${POD_NAMES}" ]; then
            echo "No tune-example-2 master pod found in the default namespace."
          fi
          # Iterate because more than one pod may match and "kubectl logs"
          # accepts a single pod name.
          for POD_NAME in ${POD_NAMES}; do
            echo "Fetching pod description for experiment pod..."
            kubectl describe pod "${POD_NAME}" -n default || true
            echo "Fetching logs for experiment pod..."
            kubectl logs "${POD_NAME}" -n default --all-containers || true
          done
          echo "Fetching events for experiment pod..."
          kubectl get events -n default | grep "tune-example-2" || true

      - name: Fetch Kubelet Logs
        if: always() # Run this step even if previous steps fail
        run: |
          echo "Fetching kubelet logs..."
          # NOTE(review): this reads the runner host's journal; if the cluster
          # nodes run as containers the kubelet unit may not exist here - confirm.
          sudo journalctl -u kubelet

      - name: Check container runtime logs
        if: always() # Run this step even if previous steps fail
        run: |
          echo "Checking container runtime logs..."
          sudo journalctl -u docker

    strategy:
      # Let the remaining Kubernetes versions finish when one of them fails.
      fail-fast: false
      matrix:
        # Detail: https://hub.docker.com/r/kindest/node
        kubernetes-version: ["v1.29.2", "v1.30.7", "v1.31.3"]