Skip to content

Commit

Permalink
Enable MPI+GPU tests for restarts
Browse files Browse the repository at this point in the history
The pure MPI (CPU-only) restart test is still disabled because it sometimes hangs, and the cause is not yet known.
  • Loading branch information
Sbozzolo committed Oct 2, 2024
1 parent 44a2a6b commit 9cd858b
Show file tree
Hide file tree
Showing 2 changed files with 15 additions and 18 deletions.
20 changes: 10 additions & 10 deletions .buildkite/pipeline.yml
Original file line number Diff line number Diff line change
Expand Up @@ -438,16 +438,16 @@ steps:
# slurm_ntasks: 2
# slurm_mem: 16G

# - label: ":computer: test restart GPU MPI"
# command: >
# srun julia --color=yes --project=examples test/restart.jl
# env:
# CLIMACOMMS_CONTEXT: "MPI"
# CLIMACOMMS_DEVICE: "CUDA"
# agents:
# slurm_gpus_per_task: 1
# slurm_ntasks: 2
# slurm_mem: 16G
- label: ":computer: test restart GPU MPI"
command: >
srun julia --color=yes --project=examples test/restart.jl
env:
CLIMACOMMS_CONTEXT: "MPI"
CLIMACOMMS_DEVICE: "CUDA"
agents:
slurm_gpus_per_task: 1
slurm_ntasks: 2
slurm_mem: 16G

- group: "MPI Examples"
steps:
Expand Down
13 changes: 5 additions & 8 deletions test/restart.jl
Original file line number Diff line number Diff line change
Expand Up @@ -197,15 +197,14 @@ function test_restart(test_dict; job_id, comms_ctx, more_ignore = Symbol[])
# Reset random seed for RRTMGP
Random.seed!(1234)

println(" just reading data")
ClimaComms.iamroot(comms_ctx) && println(" just reading data")
config_should_be_same = CA.AtmosConfig(
merge(test_dict, Dict("detect_restart_file" => true));
job_id,
comms_ctx,
)

simulation_restarted =
CA.get_simulation(config_should_be_same)
simulation_restarted = CA.get_simulation(config_should_be_same)

local_success[] &= compare(
simulation.integrator.u,
Expand Down Expand Up @@ -240,12 +239,11 @@ function test_restart(test_dict; job_id, comms_ctx, more_ignore = Symbol[])
)

# Check re-importing from previous state and advancing one step
println(" reading and simulating")
ClimaComms.iamroot(comms_ctx) && println(" reading and simulating")
# Reset random seed for RRTMGP
Random.seed!(1234)

restart_file =
joinpath(simulation.output_dir, "day0.2.hdf5")
restart_file = joinpath(simulation.output_dir, "day0.2.hdf5")
@test isfile(joinpath(restart_dir), "day0.2.hdf5")
# Restart from specific file
config2 = CA.AtmosConfig(
Expand Down Expand Up @@ -357,8 +355,7 @@ for configuration in configurations
if turbconv_mode == "prognostic_edmf"
more_ignore = [:ᶠnh_pressure₃ʲs]
end
push!(TESTING,
(; test_dict, job_id, more_ignore))
push!(TESTING, (; test_dict, job_id, more_ignore))
end
end
end
Expand Down

0 comments on commit 9cd858b

Please sign in to comment.