Skip to content

Merge filestore usage feature into develop #2

Merge filestore usage feature into develop

Merge filestore usage feature into develop #2

# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
name: Storage Tests
# Triggers: every push to main, and pull request activity.
on:
  push:
    branches: ["main"]
  # No `types` filter is given, so the default pull_request activity types
  # apply: opened, synchronize and reopened.
  pull_request:
# Workflow-level environment shared by every job and step below.
env:
  # Names must be unique in parallel running tests, so each resource name is
  # suffixed with the pull request number.
  # NOTE(review): github.event.number is presumably empty for the `push`
  # trigger, which would leave these names ending in a bare "-" — confirm.
  TPU_FILESTORE_CLUSTER_NAME: xpk-fs-attach-${{github.event.number}}
  TPU_FILESTORE_CLUSTER_NAME_CREATE: xpk-fs-create-${{github.event.number}}
  # NOTE(review): STORAGE_NAME is not referenced by any step in the visible
  # part of this workflow — verify whether it is still needed.
  STORAGE_NAME: test-storage-${{github.event.number}}
  FS_STORAGE_NAME: ${{secrets.INSTANCE_NAME}}-test-storage-${{github.event.number}}
  # Workload names used by the write / read / delete test steps.
  FS_DELETE_WORKLOAD: "fs-delete-workload"
  FS_READ_WORKLOAD: "fs-read-workload"
  FS_WRITE_WORKLOAD: "fs-write-workload"
  # Extra arguments forwarded to cluster creation via --custom-cluster-arguments.
  CLUSTER_ARGUMENTS: "--network=${{secrets.NETWORK_NAME}} --subnetwork=${{secrets.SUBNETWORK_NAME}} --maintenance-window=23:50"
jobs:
  # Attach flow: creates a cluster, attaches a Filestore storage described by
  # tests/data/fs-manifest.yaml, runs write / read / delete workloads against
  # the mount point, then deletes the storage, the workloads and the cluster.
  run-filestore-workload:
    # NOTE(review): GitHub has announced retirement of the ubuntu-20.04 hosted
    # runner image — confirm this label is still served and migrate if not.
    runs-on: [ubuntu-20.04]
    # NOTE(review): both jobs in this workflow use the same concurrency group
    # name, so they presumably queue behind each other — confirm this is intended.
    concurrency: # We support one build or nightly test to run at a time currently.
      group: filestore-test-group-${{github.event.number}}
      cancel-in-progress: false
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: '3.10'
      - uses: 'google-github-actions/auth@v2'
        with:
          credentials_json: '${{ secrets.GCP_SA_KEY }}'
      - uses: google-github-actions/setup-gcloud@v2
        with:
          version: '>= 363.0.0'
          install_components: 'beta,gke-gcloud-auth-plugin'
      # RANDOM_SEED is exported to later steps and used as a per-run directory
      # name under the Filestore mount point.
      - name: Generate random seed
        run: |
          RANDOM_SEED=$((RANDOM % 10000)) # Generate a random number between 0 and 9999
          echo "RANDOM_SEED=$RANDOM_SEED" >> "$GITHUB_ENV"
      - name: Install kubectl
        run: gcloud components install kubectl
      - name: Verify gcp setup
        run: gcloud info
      - name: Set Google Cloud CLI properties to an unused zone to verify --zone arg is passed properly in commands.
        run: |
          gcloud config set compute/zone us-east4-a
          gcloud config get compute/zone
      - name: Install xpk and verify it executes correctly
        run: |
          make install
          echo $PWD/bin >> "$GITHUB_PATH"
          xpk --help
      - name: Authenticate Docker
        run: gcloud auth configure-docker --quiet
      - name: Create a XPK Cluster with 2x v4-8 nodepools. Larger num-nodes to avoid master resizing.
        run: |
          python3 xpk.py cluster create --cluster $TPU_FILESTORE_CLUSTER_NAME --tpu-type=v4-8 --num-slices=2 \
            --zone=us-central2-b --default-pool-cpu-machine-type=n1-standard-16 --default-pool-cpu-num-nodes=16 \
            --reservation='${{ secrets.GCP_TPU_V4_RESERVATION }}' --enable-workload-identity --enable-gcpfilestore-csi-driver \
            --custom-cluster-arguments="${CLUSTER_ARGUMENTS}"
      # Substitutes the placeholder tokens in the manifest with values from secrets.
      - name: Fill Filestore manifest file
        run: |
          sed -i 's/PROJECT_NAME/${{secrets.PROJECT_NAME}}/g; s/ZONE/us-central2-b/g; s/INSTANCE_NAME/${{secrets.INSTANCE_NAME}}/g; s/VOL_NAME/${{secrets.VOL_NAME}}/g; s/IP_ADDRESS/${{secrets.IP_ADDRESS}}/g' ./tests/data/fs-manifest.yaml
      - name: Attach auto-mount GCP Filestore Storage instance
        run: |
          python3 xpk.py storage attach $FS_STORAGE_NAME --cluster=$TPU_FILESTORE_CLUSTER_NAME --zone=us-central2-b --type=gcpfilestore \
            --auto-mount=true \
            --mount-point='/fs-test-mount-point' --readonly=false --manifest='./tests/data/fs-manifest.yaml'
      # Fails the step with exit code 143 when the storage name is missing from the listing.
      - name: List and verify existing Storages
        run: |
          python3 xpk.py storage list --cluster $TPU_FILESTORE_CLUSTER_NAME --zone=us-central2-b \
            | tee output.txt | grep $FS_STORAGE_NAME || (echo 'No storage found' && exit 143)
      # The workload steps pass the full zone (us-central2-b), matching every
      # other command in this job.
      - name: Run workload to write file on filestore
        run: |
          python3 xpk.py workload create --workload $FS_WRITE_WORKLOAD \
            --docker-image='marketplace.gcr.io/google/ubuntu2004' \
            --command "mkdir /fs-test-mount-point/$RANDOM_SEED/ && echo 'Test text message' > /fs-test-mount-point/$RANDOM_SEED/test.txt || (echo 'Writing to filestore failed' && exit 143)" \
            --cluster $TPU_FILESTORE_CLUSTER_NAME --tpu-type=v4-8 --zone us-central2-b
      - name: Wait for writer workload completion and confirm it succeeded
        run: python3 xpk.py workload list --cluster $TPU_FILESTORE_CLUSTER_NAME --zone=us-central2-b --wait-for-job-completion $FS_WRITE_WORKLOAD --timeout 300
      - name: Run workload to read file on filestore
        run: |
          python3 xpk.py workload create --workload $FS_READ_WORKLOAD \
            --command "grep 'Test text message' /fs-test-mount-point/$RANDOM_SEED/test.txt || (echo 'Reading from filestore failed' && exit 143)" \
            --cluster $TPU_FILESTORE_CLUSTER_NAME --tpu-type=v4-8 --zone us-central2-b
      - name: Wait for reader workload completion and confirm it succeeded
        run: python3 xpk.py workload list --cluster $TPU_FILESTORE_CLUSTER_NAME --zone=us-central2-b --wait-for-job-completion $FS_READ_WORKLOAD --timeout 300
      - name: Run workload to delete file on filestore
        run: |
          python3 xpk.py workload create --workload $FS_DELETE_WORKLOAD \
            --command "rm -rf /fs-test-mount-point/$RANDOM_SEED/test.txt || exit 143" \
            --cluster $TPU_FILESTORE_CLUSTER_NAME --tpu-type=v4-8 --zone us-central2-b
      - name: Wait for delete workload completion and confirm it succeeded
        run: python3 xpk.py workload list --cluster $TPU_FILESTORE_CLUSTER_NAME --zone=us-central2-b --wait-for-job-completion $FS_DELETE_WORKLOAD --timeout 300
      - name: Delete storage
        run: python3 xpk.py storage delete $FS_STORAGE_NAME --zone=us-central2-b --cluster=$TPU_FILESTORE_CLUSTER_NAME
      # Cleanup steps use `if: always()` so they run even when an earlier step failed.
      - name: Delete the writer workload on the cluster
        if: always()
        run: python3 xpk.py workload delete --workload $FS_WRITE_WORKLOAD --cluster $TPU_FILESTORE_CLUSTER_NAME --zone=us-central2-b
      - name: Delete the reader workload on the cluster
        if: always()
        run: python3 xpk.py workload delete --workload $FS_READ_WORKLOAD --cluster $TPU_FILESTORE_CLUSTER_NAME --zone=us-central2-b
      - name: Delete the delete workload on the cluster
        if: always()
        run: python3 xpk.py workload delete --workload $FS_DELETE_WORKLOAD --cluster $TPU_FILESTORE_CLUSTER_NAME --zone=us-central2-b
      - name: Delete the cluster created
        if: always()
        run: python3 xpk.py cluster delete --cluster $TPU_FILESTORE_CLUSTER_NAME --zone=us-central2-b

  # Create flow: same write / read / delete scenario, but the Filestore storage
  # is provisioned with `xpk storage create` instead of being attached from a
  # manifest, and the Filestore instance is removed with gcloud during cleanup.
  filestore-create:
    # NOTE(review): GitHub has announced retirement of the ubuntu-20.04 hosted
    # runner image — confirm this label is still served and migrate if not.
    runs-on: [ubuntu-20.04]
    concurrency: # We support one build or nightly test to run at a time currently.
      group: filestore-test-group-${{github.event.number}}
      cancel-in-progress: false
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: '3.10'
      - uses: 'google-github-actions/auth@v2'
        with:
          credentials_json: '${{ secrets.GCP_SA_KEY }}'
      - uses: google-github-actions/setup-gcloud@v2
        with:
          version: '>= 363.0.0'
          install_components: 'beta,gke-gcloud-auth-plugin'
      - name: install kubectl
        run: gcloud components install kubectl
      # RANDOM_SEED is exported to later steps and used as a per-run directory
      # name under the Filestore mount point.
      - name: Generate random seed
        run: |
          RANDOM_SEED=$((RANDOM % 10000)) # Generate a random number between 0 and 9999
          echo "RANDOM_SEED=$RANDOM_SEED" >> "$GITHUB_ENV"
      - name: Verify gcp setup
        run: gcloud info
      - name: Set Google Cloud CLI properties to an unused zone to verify --zone arg is passed properly in commands.
        run: |
          gcloud config set compute/zone us-east4-a
          gcloud config get compute/zone
      - name: Install xpk with pip and verify it executes correctly
        run: |
          make install
          echo $PWD/bin >> "$GITHUB_PATH"
          xpk --help
      - name: Authenticate Docker
        run: gcloud auth configure-docker --quiet
      - name: Create a XPK Cluster with 2x v4-8 nodepools. Larger num-nodes to avoid master resizing.
        run: |
          python3 xpk.py cluster create --cluster $TPU_FILESTORE_CLUSTER_NAME_CREATE --tpu-type=v4-8 --num-slices=2 \
            --zone=us-central2-b --default-pool-cpu-machine-type=n1-standard-16 --default-pool-cpu-num-nodes=16 \
            --reservation='${{ secrets.GCP_TPU_V4_RESERVATION }}' --enable-workload-identity --enable-gcpfilestore-csi-driver \
            --custom-cluster-arguments="${CLUSTER_ARGUMENTS}"
      - name: Create auto-mount GCP Filestore Storage instance
        run: |
          python3 xpk.py storage create $FS_STORAGE_NAME --cluster=$TPU_FILESTORE_CLUSTER_NAME_CREATE --zone=us-central2-b --type=gcpfilestore \
            --auto-mount=true --vol=vol1 --size=1024 --tier=BASIC_HDD \
            --mount-point='/fs-test-mount-point' --readonly=false --network=${{secrets.NETWORK_NAME}}
      # Fails the step with exit code 143 when the storage name is missing from the listing.
      - name: List and verify existing Storages
        run: |
          python3 xpk.py storage list --cluster $TPU_FILESTORE_CLUSTER_NAME_CREATE --zone=us-central2-b \
            | tee output.txt | grep $FS_STORAGE_NAME || (echo 'No storage found' && exit 143)
      - name: Run workload to write file on filestore
        run: |
          python3 xpk.py workload create --workload $FS_WRITE_WORKLOAD --num-slices=1 \
            --docker-image='marketplace.gcr.io/google/ubuntu2004' \
            --command "mkdir /fs-test-mount-point/$RANDOM_SEED/ && echo 'Test text message' > /fs-test-mount-point/$RANDOM_SEED/test.txt || (echo 'Writing to filestore failed' && exit 143)" \
            --cluster $TPU_FILESTORE_CLUSTER_NAME_CREATE --tpu-type=v4-8 --zone us-central2-b
      - name: Wait for writer workload completion and confirm it succeeded
        run: python3 xpk.py workload list --cluster $TPU_FILESTORE_CLUSTER_NAME_CREATE --zone=us-central2-b --wait-for-job-completion $FS_WRITE_WORKLOAD --timeout 300
      - name: Run workload to read file on filestore
        run: |
          python3 xpk.py workload create --workload $FS_READ_WORKLOAD --num-slices=1 \
            --command "grep 'Test text message' /fs-test-mount-point/$RANDOM_SEED/test.txt || (echo 'Reading from filestore failed' && exit 143)" \
            --cluster $TPU_FILESTORE_CLUSTER_NAME_CREATE --tpu-type=v4-8 --zone us-central2-b
      - name: Wait for reader workload completion and confirm it succeeded
        run: python3 xpk.py workload list --cluster $TPU_FILESTORE_CLUSTER_NAME_CREATE --zone=us-central2-b --wait-for-job-completion $FS_READ_WORKLOAD --timeout 300
      - name: Run workload to delete file on filestore
        run: |
          python3 xpk.py workload create --workload $FS_DELETE_WORKLOAD \
            --command "rm -rf /fs-test-mount-point/$RANDOM_SEED/test.txt || exit 143" \
            --num-slices=1 --cluster $TPU_FILESTORE_CLUSTER_NAME_CREATE --tpu-type=v4-8 --zone us-central2-b
      - name: Wait for delete workload completion and confirm it succeeded
        run: python3 xpk.py workload list --cluster $TPU_FILESTORE_CLUSTER_NAME_CREATE --zone=us-central2-b --wait-for-job-completion $FS_DELETE_WORKLOAD --timeout 300
      - name: Delete storage
        run: python3 xpk.py storage delete $FS_STORAGE_NAME --zone=us-central2-b --cluster=$TPU_FILESTORE_CLUSTER_NAME_CREATE
      # Cleanup steps use `if: always()` so they run even when an earlier step failed.
      - name: Delete the filestore instance
        if: always()
        run: gcloud filestore instances delete $FS_STORAGE_NAME --zone=us-central2-b --force
      - name: Delete the writer workload on the cluster
        if: always()
        run: python3 xpk.py workload delete --workload $FS_WRITE_WORKLOAD --cluster $TPU_FILESTORE_CLUSTER_NAME_CREATE --zone=us-central2-b
      - name: Delete the reader workload on the cluster
        if: always()
        run: python3 xpk.py workload delete --workload $FS_READ_WORKLOAD --cluster $TPU_FILESTORE_CLUSTER_NAME_CREATE --zone=us-central2-b
      - name: Delete the delete workload on the cluster
        if: always()
        run: python3 xpk.py workload delete --workload $FS_DELETE_WORKLOAD --cluster $TPU_FILESTORE_CLUSTER_NAME_CREATE --zone=us-central2-b
      - name: Delete the cluster created
        if: always()
        run: python3 xpk.py cluster delete --cluster $TPU_FILESTORE_CLUSTER_NAME_CREATE --zone=us-central2-b