Skip to content

Commit

Permalink
Merge pull request #3307 from CliMA/gb/automatic_restart
Browse files Browse the repository at this point in the history
Automatically detect restart files and add tests for restarts
  • Loading branch information
Sbozzolo authored Sep 27, 2024
2 parents 4b4bae4 + a441b01 commit e731500
Show file tree
Hide file tree
Showing 9 changed files with 520 additions and 9 deletions.
38 changes: 38 additions & 0 deletions .buildkite/pipeline.yml
Original file line number Diff line number Diff line change
Expand Up @@ -411,6 +411,44 @@ steps:
--job_id sphere_ssp_baroclinic_wave_rhoe_equilmoist_earth
artifact_paths: "sphere_ssp_baroclinic_wave_rhoe_equilmoist_earth/output_active/*"

- group: "Restarting"
steps:

- label: ":computer: test restart"
command: >
julia --color=yes --project=examples test/restart.jl
agents:
slurm_mem: 16GB

- label: ":computer: test restart GPU"
command: >
julia --color=yes --project=examples test/restart.jl
env:
CLIMACOMMS_DEVICE: "CUDA"
agents:
slurm_gpus: 1
slurm_mem: 16G

# - label: ":computer: test restart MPI"
# command: >
# srun julia --color=yes --project=examples test/restart.jl
# env:
# CLIMACOMMS_CONTEXT: "MPI"
# agents:
# slurm_ntasks: 2
# slurm_mem: 16G

# - label: ":computer: test restart GPU MPI"
# command: >
# srun julia --color=yes --project=examples test/restart.jl
# env:
# CLIMACOMMS_CONTEXT: "MPI"
# CLIMACOMMS_DEVICE: "CUDA"
# agents:
# slurm_gpus_per_task: 1
# slurm_ntasks: 2
# slurm_mem: 16G

- group: "MPI Examples"
steps:

Expand Down
3 changes: 3 additions & 0 deletions config/default_configs/default_config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -283,6 +283,9 @@ restart_file:
prescribe_ozone:
help: "Prescribe time and spatially varying ozone from a file [`false` (default), `true`]"
value: false
detect_restart_file:
help: "When true, try finding a restart file and use it to restart the simulation. Only works with ActiveLink."
value: false
prescribed_aerosols:
help: "Which aerosols to add. List of keys from the data file (e.g., CB1, CB2)."
value: []
Expand Down
1 change: 1 addition & 0 deletions docs/make.jl
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ makedocs(;
"Gravity Wave Drag Parameterizations" => "gravity_wave.md",
"Ocean Surface Albedo Parameterization" => "surface_albedo.md",
"Radiative Equilibrium" => "radiative_equilibrium.md",
"Restarts and checkpoints" => "restarts.md",
"REPL scripts" => "repl_scripts.md",
"Configuration" => "config.md",
"Parameters" => "parameters.md",
Expand Down
49 changes: 49 additions & 0 deletions docs/src/restarts.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
## Restarting Simulations in ClimaAtmos

`ClimaAtmos` supports restarting simulations from previously saved checkpoints,
allowing you to split simulations across multiple runs. This feature is
particularly useful for

* **Performing long simulations on clusters:** Most supercomputers do not allow
jobs to run for an unlimited amount of wall-time. So, instead of running a
multi-year simulation in a single run, you can break it down into shorter
segments, restarting from the last saved state.

* **Recovery from interruptions:** If a simulation is unexpectedly interrupted
(e.g., due to a crash), you can resume it from the last saved checkpoint
instead of starting over.

* **Sensitivity experiments:** You can run a simulation to a certain point, then
branch it off into multiple simulations with modified parameters or initial
conditions, restarting from the common checkpoint.

!!! note

In the current version, restarting a simulation will check if the `AtmosModel`
used to produce the restart file is identical to the new one and throw a warning
if that is not the case. When the warning is produced, it is your responsibility
to ensure that what you are doing makes sense.

### How Restarts Work

`ClimaAtmos` periodically saves the simulation state to a file called a "restart
file". This file contains all the necessary information to resume the simulation
from that point, including the values of all prognostic variables. The frequency
of saving restart files can be configured in the simulation settings using the
`dt_save_state_to_disk` option.

Restart files are HDF5 files that contain the state `Y` of the simulation at the
time of the checkpoint. The run is then restarted by preparing a new simulation
as specified by the new configuration, but using the state read from the file.
The values of non-prognostic variables are computed again.

`ClimaAtmos` can automatically detect the latest restart file within a
structured output directory generated using the `ActiveLinkStyle`. When
`ClimaAtmos` is configured to do so (e.g., with the `detect_restart_file` option),
it will scan previous output directories for the most recent file that
matches the expected name for a restart file. If none is found, a new simulation
is started.

It is also possible to manually specify a restart file. In this case, this will
override any file automatically detected.

4 changes: 2 additions & 2 deletions examples/Manifest.toml
Original file line number Diff line number Diff line change
Expand Up @@ -336,9 +336,9 @@ weakdeps = ["CUDA", "MPI"]

[[deps.ClimaCore]]
deps = ["Adapt", "BandedMatrices", "BlockArrays", "ClimaComms", "CubedSphere", "DataStructures", "DocStringExtensions", "ForwardDiff", "GaussQuadrature", "GilbertCurves", "HDF5", "InteractiveUtils", "IntervalSets", "KrylovKit", "LinearAlgebra", "MultiBroadcastFusion", "NVTX", "PkgVersion", "RecursiveArrayTools", "RootSolvers", "SparseArrays", "StaticArrays", "Statistics", "Unrolled"]
git-tree-sha1 = "806e8490ff1aa664ca579544d798f8addfa1b07d"
git-tree-sha1 = "13c2f4e58c78fa54a22705d15e039c99462112ed"
uuid = "d414da3d-4745-48bb-8d80-42e94e092884"
version = "0.14.15"
version = "0.14.17"
weakdeps = ["CUDA", "Krylov"]

[deps.ClimaCore.extensions]
Expand Down
8 changes: 7 additions & 1 deletion src/callbacks/callbacks.jl
Original file line number Diff line number Diff line change
Expand Up @@ -333,7 +333,13 @@ NVTX.@annotate function save_state_to_disk_func(integrator, output_dir)
output_file = joinpath(output_dir, "day$day.$sec.hdf5")
comms_ctx = ClimaComms.context(integrator.u.c)
hdfwriter = InputOutput.HDF5Writer(output_file, comms_ctx)
InputOutput.HDF5.write_attribute(hdfwriter.file, "time", t) # TODO: a better way to write metadata
# TODO: a better way to write metadata
InputOutput.HDF5.write_attribute(hdfwriter.file, "time", t)
InputOutput.HDF5.write_attribute(
hdfwriter.file,
"atmos_model_hash",
hash(p.atmos),
)
InputOutput.write!(hdfwriter, Y, "Y")
Base.close(hdfwriter)
return nothing
Expand Down
2 changes: 1 addition & 1 deletion src/parameterized_tendencies/radiation/RRTMGPInterface.jl
Original file line number Diff line number Diff line change
Expand Up @@ -622,7 +622,6 @@ function RRTMGPModel(
RRTMGP.Fluxes.FluxSW(ncol, nlay, FT, DA)
set_and_save!(flux_sw.flux_up, "face_sw_flux_up", t...)
set_and_save!(flux_sw.flux_dn, "face_sw_flux_dn", t...)
set_and_save!(flux_sw.flux_dn_dir, "face_sw_direct_flux_dn", t...)
set_and_save!(flux_sw.flux_net, "face_sw_flux", t...)
if radiation_mode isa AllSkyRadiationWithClearSkyDiagnostics
flux_sw2 = RRTMGP.Fluxes.FluxSW(ncol, nlay, FT, DA)
Expand All @@ -634,6 +633,7 @@ function RRTMGPModel(
t...,
)
set_and_save!(flux_sw2.flux_net, "face_clear_sw_flux", t...)
set_and_save!(flux_sw.flux_dn_dir, "face_sw_direct_flux_dn", t...)
end

cos_zenith = DA{FT}(undef, ncol)
Expand Down
82 changes: 77 additions & 5 deletions src/solver/type_getters.jl
Original file line number Diff line number Diff line change
Expand Up @@ -291,13 +291,20 @@ function get_spaces_restart(Y)
return (; center_space, face_space)
end

"""
    get_state_restart(config::AtmosConfig, restart_file, atmos_model_hash)

Read the state `Y` and the simulation time from the HDF5 file `restart_file` and
return them as the tuple `(Y, t_start)`.

# Arguments
- `config`: the configuration; only its `comms_ctx` is used, to open the file.
- `restart_file`: path of the restart file. Must not be `nothing`.
- `atmos_model_hash`: hash of the `AtmosModel` of the new simulation. It is compared
  with the `"atmos_model_hash"` attribute stored in the file; when they differ, a
  warning is logged and the restart proceeds anyway.
"""
function get_state_restart(config::AtmosConfig, restart_file, atmos_model_hash)
    (; comms_ctx) = config

    # Internal invariant: callers only get here when a restart file was selected
    @assert !isnothing(restart_file)
    reader = InputOutput.HDF5Reader(restart_file, comms_ctx)
    try
        Y = InputOutput.read_field(reader, "Y")
        t_start = InputOutput.HDF5.read_attribute(reader.file, "time")
        atmos_model_hash_in_restart =
            InputOutput.HDF5.read_attribute(reader.file, "atmos_model_hash")
        # A different model is not necessarily wrong (e.g., sensitivity experiments),
        # so this is reported as a warning rather than aborting the run
        if atmos_model_hash_in_restart != atmos_model_hash
            @warn "Restart file $(restart_file) was constructed with a different AtmosModel"
        end
        return (Y, t_start)
    finally
        # Release the file handle even if reading fails
        Base.close(reader)
    end
end

Expand Down Expand Up @@ -477,6 +484,54 @@ thermo_state_type(::EquilMoistModel, ::Type{FT}) where {FT} = TD.PhaseEquil{FT}
thermo_state_type(::NonEquilMoistModel, ::Type{FT}) where {FT} =
TD.PhaseNonEquil{FT}

# Generic method: styles without a more specific method cannot be scanned for restarts
auto_detect_restart_file(::OutputPathGenerator.OutputPathGeneratorStyle, _) =
    error("auto_detect_restart_file works only with ActiveLink")

"""
    auto_detect_restart_file(::ActiveLinkStyle, base_output_dir)

Return the path of the most recent restart file in the directory structure in
`base_output_dir`, or `nothing` if the simulation cannot be automatically restarted.

`auto_detect_restart_file` scans the content of `base_output_dir` for folders whose
name is exactly `output_XXXX` (four digits) and picks the one that sorts last by name.
Inside that folder, it collects the files matching the expected name for restart files
(`dayDDDD.SSSSS.hdf5`) and returns the latest one, as ordered by
`CA.sort_files_by_time`.

`nothing` is returned when:
- `base_output_dir` is not an existing directory;
- no output folder is found in `base_output_dir`;
- the latest output folder contains no restart file (a warning is logged).
"""
function auto_detect_restart_file(
    output_dir_style::OutputPathGenerator.ActiveLinkStyle,
    base_output_dir,
)
    # if base_output_dir does not exist, we return nothing because there is no restart
    # file to be detected
    isdir(base_output_dir) || return nothing

    # output_dir will be something like ABC/DEF/output_1234. The pattern is anchored so
    # that look-alike entries (e.g., `output_0001.bak`) are not mistaken for output
    # folders
    name_rx = r"^output_\d\d\d\d$"
    restart_file_rx = r"day\d+\.\w+\.hdf5"

    # Keep only directories: a plain file with a matching name cannot be scanned
    existing_outputs = filter(readdir(base_output_dir)) do name
        occursin(name_rx, name) && isdir(joinpath(base_output_dir, name))
    end

    isempty(existing_outputs) && return nothing

    # Names are fixed-width and zero-padded, so the lexicographic maximum is the
    # folder with the highest number
    latest_output = maximum(existing_outputs)
    previous_folder = joinpath(base_output_dir, latest_output)
    possible_restart_files =
        filter(f -> occursin(restart_file_rx, f), readdir(previous_folder))
    if isempty(possible_restart_files)
        @warn "Detected folder $(previous_folder), but no restart file was found"
        return nothing
    end

    restart_file_name = last(CA.sort_files_by_time(possible_restart_files))
    restart_file = joinpath(previous_folder, restart_file_name)
    # Explicit check instead of @assert: this validates filesystem state, not an
    # internal invariant
    isfile(restart_file) || error("Restart file $(restart_file) does not exist")

    return restart_file
end

function get_sim_info(config::AtmosConfig)
(; parsed_args) = config
FT = eltype(config)
Expand All @@ -496,15 +551,28 @@ function get_sim_info(config::AtmosConfig)
haskey(allowed_dir_styles, lowercase(requested_style)) ||
error("output_dir_style $(requested_style) not available")

output_dir_style = allowed_dir_styles[lowercase(requested_style)]

# We look for a restart before creating a new output dir because we want to
# look for previous folders
restart_file =
parsed_args["detect_restart_file"] ?
auto_detect_restart_file(output_dir_style, base_output_dir) :
parsed_args["restart_file"]

output_dir = OutputPathGenerator.generate_output_path(
base_output_dir;
context = config.comms_ctx,
style = allowed_dir_styles[lowercase(requested_style)],
style = output_dir_style,
)

isnothing(restart_file) ||
@info "Restarting simulation from file $restart_file"

sim = (;
output_dir,
restart = !isnothing(parsed_args["restart_file"]),
restart = !isnothing(restart_file),
restart_file,
job_id,
dt = FT(time_to_seconds(parsed_args["dt"])),
start_date = DateTime(parsed_args["start_date"], dateformat"yyyymmdd"),
Expand Down Expand Up @@ -619,7 +687,11 @@ function get_simulation(config::AtmosConfig)

if sim_info.restart
s = @timed_str begin
(Y, t_start) = get_state_restart(config)
(Y, t_start) = get_state_restart(
config,
sim_info.restart_file,
hash(atmos),
)
spaces = get_spaces_restart(Y)
end
@info "Allocating Y: $s"
Expand Down
Loading

0 comments on commit e731500

Please sign in to comment.