Initial support for farming out trials on >1 GPUs per node #15

Merged 2 commits on Oct 23, 2023
echo/examples/keras/hyperparameter.yml (3 changes: 2 additions & 1 deletion)
@@ -3,7 +3,8 @@ save_path: "./data"
 
 pbs:
   jobs: 1
-  trials_per_job: 1
+  tasks_per_worker: 1
+  gpus_per_node: 1
   bash: ["source ~/.bashrc", "conda activate echo"]
   batch:
     l: ["select=1:ncpus=8:ngpus=1:mem=64GB", "walltime=12:00:00"]
echo/examples/torch/hyperparameter.yml (3 changes: 2 additions & 1 deletion)
@@ -3,7 +3,8 @@ save_path: "./data"
 
 pbs:
   jobs: 1
-  trials_per_job: 1
+  tasks_per_worker: 1
+  gpus_per_node: 1
   bash: ["source ~/.bashrc", "conda activate echo"]
   batch:
     l: ["select=1:ncpus=8:ngpus=1:mem=64GB", "walltime=12:00:00"]
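To see how the two new keys interact, here is a minimal, self-contained sketch (not code from this PR; the file names, the echo-run arguments, and the $PBS_JOBID placeholder are assumptions, and the single-trial fallback in the real helper is omitted) that parses a config like the ones above and prints the per-GPU launch commands a job script would contain:

# Illustrative only: shows how gpus_per_node and tasks_per_worker fan out into
# per-GPU launch commands. File names and the job-id variable are placeholders.
import yaml

config_text = """
pbs:
  jobs: 1
  tasks_per_worker: 2
  gpus_per_node: 2
"""

pbs = yaml.safe_load(config_text)["pbs"]
devices = list(range(pbs.get("gpus_per_node", 1)))  # e.g. [0, 1]
copies = pbs.get("tasks_per_worker", 1)             # trials launched per GPU

commands = []
for _ in range(copies):
    for device in devices:
        # each worker sees exactly one GPU and runs in the background
        commands.append(
            f"CUDA_VISIBLE_DEVICES={device} echo-run hyperparameter.yml model.yml -n $PBS_JOBID &"
        )
        commands.append("sleep 0.5")
commands.append("wait")
print("\n".join(commands))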
echo/optimize.py (116 changes: 74 additions & 42 deletions)
@@ -196,6 +196,76 @@ def fix_broken_study(
    return study_fixed, removed



def generate_batch_commands(
    hyper_config, batch_type, aiml_path, jobid, batch_commands=None
) -> List[str]:
    # Avoid a shared mutable default: start a fresh list unless one is passed in
    if batch_commands is None:
        batch_commands = []

    # Check if "gpus_per_node" is specified in hyper_config[batch_type]
    if "gpus_per_node" in hyper_config[batch_type]:
        # Build the list of GPU device indices from the configured count
        gpus_per_node = list(range(hyper_config[batch_type]["gpus_per_node"]))

        # Check if "tasks_per_worker" is specified in hyper_config[batch_type]
        if (
            "tasks_per_worker" in hyper_config[batch_type]
            and hyper_config[batch_type]["tasks_per_worker"] > 1
        ):
            # Warn about the experimental nature of tasks_per_worker
            logging.warning(
                "The tasks_per_worker option is experimental; be advised that some runs may fail."
            )
            logging.warning(
                "Check the log and stdout/err files if simulations are dying to see the errors."
            )

            # Loop over the specified number of trials
            for copy in range(hyper_config[batch_type]["tasks_per_worker"]):
                # Loop over each GPU device
                for device in gpus_per_node:
                    # Append a backgrounded command pinned to this GPU via CUDA_VISIBLE_DEVICES
                    batch_commands.append(
                        f"CUDA_VISIBLE_DEVICES={device} {aiml_path} {sys.argv[1]} {sys.argv[2]} -n {jobid} &"
                    )
                    # Allow some time between calling instances of run
                    batch_commands.append("sleep 0.5")
            # Wait for all background jobs to finish
            batch_commands.append("wait")
        else:
            # Loop over each GPU device without multiple trials
            for device in gpus_per_node:
                # Append the command pinned to this GPU via CUDA_VISIBLE_DEVICES
                batch_commands.append(
                    f"CUDA_VISIBLE_DEVICES={device} {aiml_path} {sys.argv[1]} {sys.argv[2]} -n {jobid}"
                )
    elif (
        "tasks_per_worker" in hyper_config[batch_type]
        and hyper_config[batch_type]["tasks_per_worker"] > 1
    ):
        # Warn about the experimental nature of tasks_per_worker
        logging.warning(
            "The tasks_per_worker option is experimental; be advised that some runs may fail."
        )
        logging.warning(
            "Check the log and stdout/err files if simulations are dying to see the errors."
        )
        # Loop over the specified number of trials
        for copy in range(hyper_config[batch_type]["tasks_per_worker"]):
            # Append the backgrounded command to batch_commands
            batch_commands.append(
                f"{aiml_path} {sys.argv[1]} {sys.argv[2]} -n {jobid} &"
            )
            # Allow some time between calling instances of run
            batch_commands.append("sleep 0.5")
        # Wait for all background jobs to finish
        batch_commands.append("wait")
    else:
        # Append the default single-trial command to batch_commands
        batch_commands.append(f"{aiml_path} {sys.argv[1]} {sys.argv[2]} -n {jobid}")

    return batch_commands
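For reference, a hedged usage sketch of the helper above (not part of the diff): sys.argv is stubbed only so the f-strings inside the function resolve, and the toy config values are arbitrary.

# Example call, assuming generate_batch_commands and its module-level imports
# (sys, logging, List) are available. The argv stub mimics "echo-opt hyper.yml model.yml".
import sys

sys.argv = ["echo-opt", "hyperparameter.yml", "model.yml"]

toy_config = {"pbs": {"jobs": 1, "tasks_per_worker": 2, "gpus_per_node": 2}}
script_lines = generate_batch_commands(toy_config, "pbs", "echo-run", "$PBS_JOBID", batch_commands=[])

# Expected shape: a backgrounded, GPU-pinned echo-run per (task, device) pair,
# each followed by a short sleep, then a final "wait", e.g.
#   CUDA_VISIBLE_DEVICES=0 echo-run hyperparameter.yml model.yml -n $PBS_JOBID &
#   sleep 0.5
#   ...
#   wait
for line in script_lines:
    print(line)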


def prepare_slurm_launch_script(hyper_config: str, model_config: str) -> List[str]:

    slurm_options = ["#!/bin/bash -l"]
@@ -212,30 +282,11 @@ def prepare_slurm_launch_script(hyper_config: str, model_config: str) -> List[str]:
    slurm_options.append(f'{hyper_config["slurm"]["kernel"]}')
    aiml_path = "echo-run"
    slurm_id = "$SLURM_JOB_ID"
    if (
        "trials_per_job" in hyper_config["slurm"]
        and hyper_config["slurm"]["trials_per_job"] > 1
    ):
        logging.warning(
            "The trails_per_job is experimental, be advised that some runs may fail"
        )
        logging.warning(
            "Check the log and stdout/err files if simulations are dying to see the errors"
        )
        for copy in range(hyper_config["slurm"]["trials_per_job"]):
            slurm_options.append(
                f"{aiml_path} {sys.argv[1]} {sys.argv[2]} -n {slurm_id} &"
            )
            # allow some time between calling instances of run
            slurm_options.append("sleep 0.5")
        slurm_options.append("wait")
    else:
        slurm_options.append(f"{aiml_path} {sys.argv[1]} {sys.argv[2]} -n {slurm_id}")
    return slurm_options
    # hyper_config, batch_type, aiml_path, jobid, batch_commands = []
    return generate_batch_commands(hyper_config, "slurm", aiml_path, slurm_id, batch_commands=slurm_options)
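Both prepare_* functions now delegate to this one helper; only the config section, the accumulated header lines, and the scheduler's job-id variable differ. A rough sketch of the shared call shape, with toy stand-ins for the config and headers (none of these values come from the diff, and the helper above is assumed to be importable):

# Sketch of the shared call pattern; the header lists stand in for the
# scheduler directives built earlier in each prepare_* function.
import sys

sys.argv = ["echo-opt", "hyperparameter.yml", "model.yml"]
hyper_config = {"slurm": {"gpus_per_node": 2}, "pbs": {"gpus_per_node": 2}}

slurm_header = ["#!/bin/bash -l", "#SBATCH --time=12:00:00"]
pbs_header = ["#!/bin/bash -l", "#PBS -l walltime=12:00:00"]

slurm_script = generate_batch_commands(hyper_config, "slurm", "echo-run", "$SLURM_JOB_ID", batch_commands=slurm_header)
pbs_script = generate_batch_commands(hyper_config, "pbs", "echo-run", "$PBS_JOBID", batch_commands=pbs_header)

Because the helper appends to the list it receives, the returned script is the same header list with the worker commands added at the end, which is why keeping the parameter's default as a fresh list per call matters.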


def prepare_pbs_launch_script(hyper_config: str, model_config: str) -> List[str]:

    pbs_options = ["#!/bin/bash -l"]
    for arg, val in hyper_config["pbs"]["batch"].items():
        if arg == "l" and type(val) == list:
@@ -267,26 +318,7 @@ def prepare_pbs_launch_script(hyper_config: str, model_config: str) -> List[str]:
    pbs_options.append(f'{hyper_config["pbs"]["kernel"]}')
    aiml_path = "echo-run"
    pbs_jobid = "$PBS_JOBID"
    if (
        "trials_per_job" in hyper_config["pbs"]
        and hyper_config["pbs"]["trials_per_job"] > 1
    ):
        logging.warning(
            "The trails_per_job is experimental, be advised that some runs may fail"
        )
        logging.warning(
            "Check the log and stdout/err files if simulations are dying to see the errors"
        )
        for copy in range(hyper_config["pbs"]["trials_per_job"]):
            pbs_options.append(
                f"{aiml_path} {sys.argv[1]} {sys.argv[2]} -n {pbs_jobid} &"
            )
            # allow some time between calling instances of run
            pbs_options.append("sleep 0.5")
        pbs_options.append("wait")
    else:
        pbs_options.append(f"{aiml_path} {sys.argv[1]} {sys.argv[2]} -n {pbs_jobid}")
    return pbs_options
    return generate_batch_commands(hyper_config, "pbs", aiml_path, pbs_jobid, batch_commands=pbs_options)


def main():
@@ -552,7 +584,7 @@ def main():
    with open(script_location, "w") as fid:
        for line in launch_script:
            fid.write(f"{line}\n")

    """ Launch the slurm jobs """
    job_ids = []
    name_condition = "N" in hyper_config["pbs"]["batch"]