diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
index bfbc6b4fb0..01358dc045 100644
--- a/.buildkite/pipeline.yml
+++ b/.buildkite/pipeline.yml
@@ -1,6 +1,6 @@
 agents:
   queue: new-central
-  slurm_time: 24:00:00
+  slurm_time: 4:00:00
   modules: climacommon/2024_03_18
 
 env:
@@ -344,8 +344,6 @@ steps:
         slurm_ntasks: 2
         slurm_mem: 16GB
 
-    - label: "batch script"
-      command: "sbatch test/mpi_tests/local_checks.sh"
 
     # short high-res performance test
     - label: "Unthreaded AMIP FINE" # also reported by longruns with a flame graph
@@ -431,9 +429,13 @@ steps:
        slurm_mem: 20GB
        slurm_gpus: 1
 
-
-
-  - wait
+  - group: "Bash scripts"
+    steps:
+      - label: "Submit and Monitor sbatch Job on Caltech HPC"
+        # check that (1) the script can be successfully submitted, and (2) it runs successfully
+        command: "test/mpi_tests/test_sbatch_script.sh"
+        agents:
+          slurm_ntasks: 1
 
   - wait
diff --git a/.gitignore b/.gitignore
index 2b10544978..c225ec897f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -50,7 +50,7 @@ experiments/AMIP/output/*
 *.so
 
 # internal tests
-testdel.jl
+slurm-*.out
 
 # ignore vscode artifacts
 *.vscode
diff --git a/config/model_configs/coarse_single_ft64_hourly_checkpoints_restart.yml b/config/model_configs/coarse_single_ft64_hourly_checkpoints_restart.yml
new file mode 100644
index 0000000000..a99b64cea8
--- /dev/null
+++ b/config/model_configs/coarse_single_ft64_hourly_checkpoints_restart.yml
@@ -0,0 +1,19 @@
+anim: false
+apply_limiter: false
+dt: "400secs"
+dt_cpl: 400
+dt_save_restart: "10days"
+dt_save_to_sol: "1days"
+energy_check: false
+h_elem: 6
+hourly_checkpoint: true
+hourly_checkpoint_dt: 1
+job_id: "coarse_single_ft64_hourly_checkpoints_restart"
+mode_name: "amip"
+moist: "equil"
+mono_surface: false
+precip_model: "0M"
+rad: "gray"
+run_name: "coarse_single_ft64_hourly_checkpoints_restart"
+t_end: "800secs"
+vert_diff: "true"
diff --git a/experiments/AMIP/cli_options.jl b/experiments/AMIP/cli_options.jl
index 6e69b5f1ef..9bab51e062 100644
--- a/experiments/AMIP/cli_options.jl
+++ b/experiments/AMIP/cli_options.jl
@@ -38,6 +38,10 @@ function argparse_settings()
         help = "Boolean flag indicating whether to checkpoint at intervals of 1 hour or multiple hours"
         arg_type = Bool
         default = false
+        "--hourly_checkpoint_dt"
+        help = "Time interval (in hours) between checkpoints (default: 480 hours, i.e. 20 days)"
+        arg_type = Int
+        default = 480
         "--coupler_output_dir"
         help = "Directory to save output files. Note that TempestRemap fails if interactive and paths are too long."
         arg_type = String
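Note: the `--hourly_checkpoint_dt` option added above only takes effect when the existing `--hourly_checkpoint` flag is enabled, since it sets the interval of the checkpoint callback (see the coupler_driver.jl change below). A minimal sketch of a command-line invocation; the run name and the 24-hour interval are illustrative, not taken from this PR:

julia --project=experiments/AMIP/ experiments/AMIP/coupler_driver.jl --run_name my_run --hourly_checkpoint true --hourly_checkpoint_dt 24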
diff --git a/experiments/AMIP/coupler_driver.jl b/experiments/AMIP/coupler_driver.jl
index fa47c2636c..fc0b75bf48 100644
--- a/experiments/AMIP/coupler_driver.jl
+++ b/experiments/AMIP/coupler_driver.jl
@@ -131,6 +131,7 @@ saveat = Float64(time_to_seconds(config_dict["dt_save_to_sol"]))
 date0 = date = DateTime(config_dict["start_date"], dateformat"yyyymmdd")
 mono_surface = config_dict["mono_surface"]
 hourly_checkpoint = config_dict["hourly_checkpoint"]
+hourly_checkpoint_dt = config_dict["hourly_checkpoint_dt"]
 restart_dir = config_dict["restart_dir"]
 restart_t = Int(config_dict["restart_t"])
 evolving_ocean = config_dict["evolving_ocean"]
@@ -526,8 +527,13 @@ The currently implemented callbacks are:
   being approximated from wind speed). It is updated at the same frequency as the atmospheric radiation.
   NB: Eventually, we will call all of radiation from the coupler, in addition to the albedo calculation.
 =#
-checkpoint_cb =
-    HourlyCallback(dt = FT(480), func = checkpoint_sims, ref_date = [dates.date[1]], active = hourly_checkpoint) # 20 days
+
+checkpoint_cb = HourlyCallback(
+    dt = hourly_checkpoint_dt,
+    func = checkpoint_sims,
+    ref_date = [dates.date[1]],
+    active = hourly_checkpoint,
+) # 480 hours = 20 days by default
 update_firstdayofmonth!_cb =
     MonthlyCallback(dt = FT(1), func = update_firstdayofmonth!, ref_date = [dates.date1[1]], active = true)
 dt_water_albedo = parse(FT, filter(x -> !occursin(x, "hours"), dt_rad))
diff --git a/test/mpi_tests/local_checks.sh b/test/mpi_tests/local_checks.sh
index 08213273fa..cb4f49c4f7 100644
--- a/test/mpi_tests/local_checks.sh
+++ b/test/mpi_tests/local_checks.sh
@@ -1,29 +1,25 @@
 #!/bin/bash
-#SBATCH --time=24:00:00
-#SBATCH --nodes=1
-#SBATCH --job-name=mpi_restart_test
-#SBATCH --reservation=clima
-#SBATCH --mem=32GB
 #SBATCH --ntasks=2
+#SBATCH --job-name=mpi_amip
+#SBATCH --time=24:00:00
+#SBATCH --mem-per-cpu=16G
+#SBATCH --partition=expansion
 
-# TODO: this needs to be updated (+ implement better tests that are caught on Buildkite) #667
-
+export MODULEPATH="/groups/esm/modules:$MODULEPATH"
 module purge
-module load julia/1.10.1
-export JULIA_MPI_BINARY=system
-export JULIA_NUM_THREADS=${SLURM_CPUS_PER_TASK:=1}
-export CLIMACORE_DISTRIBUTED="MPI"
-export JULIA_HDF5_PATH=""
+module load climacommon/2024_03_18
 
-export RUN_NAME=amip_restart_mpi_test
+export CC_PATH=$(pwd)/ # adjust this to the path of your ClimaCoupler.jl directory
+export RUN_NAME=coarse_single_ft64_hourly_checkpoints_restart
+export CONFIG_FILE=${CC_PATH}config/model_configs/${RUN_NAME}.yml
 export RESTART_DIR=experiments/AMIP/output/amip/${RUN_NAME}_artifacts/
-export RESTART_T=200
 
-julia -e 'using Pkg; Pkg.add("MPIPreferences"); using MPIPreferences; use_system_binary()'
-julia --project -e 'using Pkg; Pkg.instantiate()'
-julia --project -e 'using Pkg; Pkg.build("MPI")'
-julia --project -e 'using Pkg; Pkg.build("HDF5")'
-julia --project -e 'using Pkg; Pkg.API.precompile()'
+export OPENBLAS_NUM_THREADS=1
+export JULIA_NVTX_CALLBACKS=gc
+export OMPI_MCA_opal_warn_on_missing_libcuda=0
+export JULIA_MAX_NUM_PRECOMPILE_FILES=100
+export JULIA_CPU_TARGET='broadwell;skylake;icelake;cascadelake;epyc'
+export SLURM_KILL_BAD_EXIT=1
 
 julia --project=experiments/AMIP/ -e 'using Pkg; Pkg.instantiate(;verbose=true)'
 julia --project=experiments/AMIP/ -e 'using Pkg; Pkg.precompile()'
@@ -34,10 +30,29 @@ julia --project=artifacts -e 'using Pkg; Pkg.precompile()'
 julia --project=artifacts -e 'using Pkg; Pkg.status()'
 julia --project=artifacts artifacts/download_artifacts.jl
 
-# run spin up
-# - specify `--hourly_checkpoint true` to save monthly checkpoints of all model prognostic states
-mpiexec julia --color=yes --project=experiments/AMIP/ experiments/AMIP/coupler_driver.jl --run_name $RUN_NAME --coupled true --start_date 19790101 --hourly_checkpoint true --anim true --surface_setup PrescribedSurface --dt_cpl 200 --energy_check false --mode_name amip --mono_surface false --vert_diff true --moist equil --rad clearsky --precip_model 0M --z_elem 35 --dz_bottom 50 --h_elem 12 --rayleigh_sponge true --alpha_rayleigh_uh 0 --dt 200secs --t_end 0.1days --job_id $RUN_NAME --dt_save_to_sol 1000days --dt_save_state_to_disk 10days --apply_limiter false --FLOAT_TYPE Float64
+srun -K julia --project=experiments/AMIP/ experiments/AMIP/coupler_driver.jl --config_file $CONFIG_FILE
+
+# restart from a simulation time of 400 seconds
+export RESTART_T=400
+
+# set up the new config file with an amended end time
+export RESTART_CONFIG_FILE=${CONFIG_FILE::-4}_tmp.yml
+cp $CONFIG_FILE $RESTART_CONFIG_FILE
+sed -i 's/t_end: \"800secs\"/t_end: \"3600secs\"/g' $RESTART_CONFIG_FILE
+
+# rerun the model
+srun -K julia --project=experiments/AMIP/ experiments/AMIP/coupler_driver.jl --config_file $RESTART_CONFIG_FILE --restart_dir $RESTART_DIR --restart_t $RESTART_T
+
+# throw an error if fewer than 5 restart checkpoint files are found
+if [ $(ls -1 $RESTART_DIR/checkpoint | wc -l) -lt 5 ]; then
+    echo "Error: RESTART_DIR does not contain enough files"
+    exit 1
+else
+    echo "Successful: RESTART_DIR contains $(ls -1 $RESTART_DIR/checkpoint | wc -l) files"
+    exit 0
+fi
 
-# init using a restart
-# - specify the directory of the `checkpoint/` folder (i.e., `--restart_dir`) and time (in secs; `--restart_t`) of the restart file
-mpiexec julia --color=yes --project=experiments/AMIP/ experiments/AMIP/coupler_driver.jl --run_name $RUN_NAME --coupled true --restart_dir $RESTART_DIR --restart_t $RESTART_T --start_date 19790102 --anim true --surface_setup PrescribedSurface --dt_cpl 200 --energy_check false --mode_name amip --mono_surface false --vert_diff true --moist equil --rad clearsky --precip_model 0M --z_elem 35 --dz_bottom 50 --h_elem 12 --rayleigh_sponge true --alpha_rayleigh_uh 0 --dt 200secs --t_end 0.1days --job_id $RUN_NAME --dt_save_to_sol 1000days --dt_save_state_to_disk 10days --apply_limiter false --FLOAT_TYPE Float64
+# Troubleshooting:
+# - ensure you're using the latest module file of climacommon and set MODULEPATH to the correct location
+# - ensure you're using the latest version of ClimaCoupler.jl
+# - did you cd to your version of ClimaCoupler.jl?
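The amended batch script above assumes it is submitted from the root of a ClimaCoupler.jl checkout (see the `CC_PATH` comment). A minimal sketch of a manual submission on the cluster; the checkout path is hypothetical:

cd /path/to/ClimaCoupler.jl        # adjust to your clone
sbatch test/mpi_tests/local_checks.sh
squeue -u $USER                    # the job log is written to slurm-<jobid>.out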
diff --git a/test/mpi_tests/test_sbatch_script.sh b/test/mpi_tests/test_sbatch_script.sh
new file mode 100755
index 0000000000..83f1a9ac78
--- /dev/null
+++ b/test/mpi_tests/test_sbatch_script.sh
@@ -0,0 +1,32 @@
+#!/bin/bash
+
+# This script submits a job to the Slurm scheduler and waits for it to finish. It
+# reports the job status every 30 seconds until the job completes. If the job
+# fails or is terminated, the script prints an error message and exits with a
+# non-zero status code. This is used by Buildkite to determine whether the job
+# truly succeeded or failed.
+
+# Submit the sbatch script and capture its job ID
+JOB_ID=$(sbatch test/mpi_tests/local_checks.sh | awk '{print $4}')
+echo "Submitted job with ID: $JOB_ID, output log: slurm-$JOB_ID.out"
+START_TIME=$(date +%s)
+# Loop until the job finishes
+while true; do
+    # Check the status of the job
+    STATUS=$(scontrol show job $JOB_ID | grep -oP 'JobState=\K\S+')
+    sleep 30
+    ELAPSED_TIME=$(( $(date +%s) - $START_TIME ))
+    # If the job is still pending or running (or its state is not yet reported), wait and keep checking
+    if [ "$STATUS" == "" ] || [ "$STATUS" == "PENDING" ] || [ "$STATUS" == "RUNNING" ]; then
+        echo "Job is still running... Elapsed time: $ELAPSED_TIME seconds."
+    # If the job completed successfully, print a success message and exit
+    elif [ "$STATUS" == "COMPLETED" ]; then
+        echo "Job completed successfully."
+        exit 0
+    # If the job is in any other state, print an error message and exit
+    else
+        echo "Error: Job failed or terminated. See slurm-$JOB_ID.out for more information."
+        cat "slurm-$JOB_ID.out"
+        exit 1
+    fi
+done
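For reference, the monitoring loop above relies on `scontrol show job` reporting the long-form JobState names (PENDING, RUNNING, COMPLETED, FAILED, ...). The same field can be inspected by hand with the command the script uses; the job ID 12345 here is purely illustrative:

scontrol show job 12345 | grep -oP 'JobState=\K\S+'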