Patch summary (free text before the first "diff --git" header):
  * build.sh: replace the crontab entry that ran smart_log_deleter.sh with a
    call to the new setup_log_rotation.sh; create container_artifacts with
    "mkdir -p".
  * continuous.cfg: Kokoro timeout 23040 -> 11520 minutes.
  * setup_container.sh: stop checking out the log_rotation branch, drop the
    --experimental-enable-storage-client-library flag, log to
    run_artifacts/gcsfuse.log, train for 100 epochs instead of 200.
  * setup_log_rotation.sh (new): write a logrotate config for the given log
    file and an /etc/cron.hourly script that runs logrotate against it.

NOTE(review): this patch was re-flowed from a whitespace-collapsed copy.
Indentation and the position of blank context lines were reconstructed from
the hunk line counts - verify against the tree (or apply with
--ignore-whitespace). The blob ids on the "index" lines are those of the
original patch and do not reflect the edits made to the added lines.

diff --git a/perfmetrics/scripts/continuous_test/ml_tests/pytorch/dino/build.sh b/perfmetrics/scripts/continuous_test/ml_tests/pytorch/dino/build.sh
index 7431121d0d..614a02c430 100644
--- a/perfmetrics/scripts/continuous_test/ml_tests/pytorch/dino/build.sh
+++ b/perfmetrics/scripts/continuous_test/ml_tests/pytorch/dino/build.sh
@@ -13,14 +13,15 @@ cd "${KOKORO_ARTIFACTS_DIR}/github/gcsfuse"
 echo "Building docker image containing all pytorch libraries..."
 sudo docker build . -f perfmetrics/scripts/ml_tests/pytorch/dino/Dockerfile --tag pytorch-gcsfuse
 
-mkdir container_artifacts
+mkdir -p container_artifacts
 
 echo "Running the docker image build in the previous step..."
 sudo docker run --runtime=nvidia --name=pytorch_automation_container --privileged -d -v ${KOKORO_ARTIFACTS_DIR}/github/gcsfuse/container_artifacts:/pytorch_dino/run_artifacts:rw,rshared \
   --shm-size=128g pytorch-gcsfuse:latest
 
-echo "Setting up cron job to delete the gcsfuse_logs."
-echo "0 */1 * * * cd ${KOKORO_ARTIFACTS_DIR}/github/gcsfuse && sudo sh ./perfmetrics/scripts/ml_tests/smart_log_deleter.sh container_artifacts/gcsfuse_logs/" | crontab -
+# Setup the log_rotation.
+chmod +x perfmetrics/scripts/ml_tests/setup_log_rotation.sh
+source perfmetrics/scripts/ml_tests/setup_log_rotation.sh "${KOKORO_ARTIFACTS_DIR}/github/gcsfuse/container_artifacts/gcsfuse.log"
 
 # Wait for the script completion as well as logs output.
 sudo docker logs -f pytorch_automation_container
diff --git a/perfmetrics/scripts/continuous_test/ml_tests/pytorch/dino/continuous.cfg b/perfmetrics/scripts/continuous_test/ml_tests/pytorch/dino/continuous.cfg
index 6d73209080..e2e8435028 100644
--- a/perfmetrics/scripts/continuous_test/ml_tests/pytorch/dino/continuous.cfg
+++ b/perfmetrics/scripts/continuous_test/ml_tests/pytorch/dino/continuous.cfg
@@ -1,7 +1,7 @@
 build_file: "gcsfuse/perfmetrics/scripts/continuous_test/ml_tests/pytorch/dino/build.sh"
 
-# Setting the 16 days (23040 mins) timeout for kokoro-jobs.
-timeout_mins: 23040
+# Setting the 8 days (11520 mins) timeout for kokoro-jobs.
+timeout_mins: 11520
 
 
 # Post the gcsfuse logs as an artifact to GCSBucket
diff --git a/perfmetrics/scripts/ml_tests/pytorch/dino/setup_container.sh b/perfmetrics/scripts/ml_tests/pytorch/dino/setup_container.sh
index e4ceb0c105..daf14ad287 100644
--- a/perfmetrics/scripts/ml_tests/pytorch/dino/setup_container.sh
+++ b/perfmetrics/scripts/ml_tests/pytorch/dino/setup_container.sh
@@ -4,12 +4,10 @@ wget -O go_tar.tar.gz https://go.dev/dl/go1.19.5.linux-amd64.tar.gz
 rm -rf /usr/local/go && tar -C /usr/local -xzf go_tar.tar.gz
 export PATH=$PATH:/usr/local/go/bin
 
-# Todo: please update the branch, when log-rotation changes are merged.
 # Log-rotation branch will create the logs.txt file after every 6 hours.
 # Hence, we need to setup the job to delete the logs file if not required.
 git clone https://github.com/GoogleCloudPlatform/gcsfuse.git
 cd gcsfuse
-git checkout log_rotation
 go build .
 cd -
 
@@ -22,11 +20,10 @@ nohup /pytorch_dino/gcsfuse/gcsfuse --foreground --type-cache-ttl=1728000s \
   --stat-cache-capacity=1320000 \
   --stackdriver-export-interval=60s \
   --implicit-dirs \
-  --experimental-enable-storage-client-library \
   --max-conns-per-host=100 \
   --debug_fs \
   --debug_gcs \
-  --log-file run_artifacts/gcsfuse_logs/logs.txt \
+  --log-file run_artifacts/gcsfuse.log \
   --log-format text \
   gcsfuse-ml-data gcsfuse_data > "run_artifacts/gcsfuse.out" 2> "run_artifacts/gcsfuse.err" &
 
@@ -67,7 +64,7 @@ python3 -m torch.distributed.launch \
   --norm_last_layer False \
   --use_fp16 False \
   --clip_grad 0 \
-  --epochs 200 \
+  --epochs 100 \
   --global_crops_scale 0.25 1.0 \
   --local_crops_number 10 \
   --local_crops_scale 0.05 0.25 \
diff --git a/perfmetrics/scripts/ml_tests/setup_log_rotation.sh b/perfmetrics/scripts/ml_tests/setup_log_rotation.sh
new file mode 100644
index 0000000000..af15f886de
--- /dev/null
+++ b/perfmetrics/scripts/ml_tests/setup_log_rotation.sh
@@ -0,0 +1,56 @@
+#!/bin/bash
+
+# Sets up hourly, size-based rotation (via logrotate) of the log file passed as $1.
+# Usage: source setup_log_rotation.sh <absolute path of the log file>
+# KOKORO_ARTIFACTS_DIR must be set: the logrotate config and its state file are
+# kept under ${KOKORO_ARTIFACTS_DIR}/github/gcsfuse.
+
+log_file=${1:?"usage: setup_log_rotation.sh <absolute path of the log file>"}
+echo "Creating logrotate configuration..."
+sudo tee "${KOKORO_ARTIFACTS_DIR}/github/gcsfuse/gcsfuse_logrotate.conf" << EOF
+${log_file} {
+  su root adm
+  rotate 3
+  size 1G
+  missingok
+  notifempty
+  compress
+  dateext
+  dateformat -%Y%m%d-%s
+  copytruncate
+}
+EOF
+
+# Set the correct access permission to the config file.
+sudo chmod 0644 "${KOKORO_ARTIFACTS_DIR}/github/gcsfuse/gcsfuse_logrotate.conf"
+sudo chown root "${KOKORO_ARTIFACTS_DIR}/github/gcsfuse/gcsfuse_logrotate.conf"
+
+# Make sure logrotate is installed on the system.
+if test -x /usr/sbin/logrotate ; then
+  echo "Logrotate already installed on the system."
+else
+  echo "Installing logrotate on the system..."
+  # -y: never wait for an interactive confirmation in the unattended job.
+  sudo apt-get install -y logrotate
+fi
+
+# Add a shell script to /etc/cron.hourly; every hour it runs logrotate against
+# the config created above. Variables are expanded now, while writing the file.
+sudo tee /etc/cron.hourly/gcsfuse_logrotate << EOF
+#!/bin/bash
+test -x /usr/sbin/logrotate || exit 0
+/usr/sbin/logrotate "${KOKORO_ARTIFACTS_DIR}/github/gcsfuse/gcsfuse_logrotate.conf" --state "${KOKORO_ARTIFACTS_DIR}/github/gcsfuse/gcsfuse_logrotate_status"
+EOF
+
+# Make sure, we have hourly logrotate setup in place correctly.
+if [ $? -eq 0 ]; then
+  echo "Hourly cron setup for logrotate completed successfully"
+else
+  echo "Please install linux package - cron"
+  exit 1
+fi
+
+sudo chmod 775 /etc/cron.hourly/gcsfuse_logrotate
+
+# Restart the cron service
+sudo service cron restart