Commit cc4f255

Adding support for CPU-only machines
Biological Compatibility Benchmarks committed Dec 30, 2024
1 parent 8180c50 commit cc4f255
Showing 5 changed files with 34 additions and 21 deletions.
27 changes: 19 additions & 8 deletions aintelope/config/config_utils.py
@@ -155,13 +155,24 @@ def set_memory_limits():
     set_mem_limits(data_size_limit, address_space_size_limit)


-def select_gpu():
-    gridsearch_gpu = os.environ.get("AINTELOPE_GPU")
-    if gridsearch_gpu is not None:
-        gridsearch_gpu = int(gridsearch_gpu)
-        torch.cuda.set_device(gridsearch_gpu)
-        device_name = torch.cuda.get_device_name(gridsearch_gpu)
-        print(f"Using CUDA GPU {gridsearch_gpu} : {device_name}")
+def select_gpu(gpu_index=None):
+    if gpu_index is None:
+        gpu_index = os.environ.get("AINTELOPE_GPU")
+
+    # TODO: run some threads on CPU if the available GPU-s do not support the required amount of threads
+
+    if gpu_index is not None:
+        gpu_count = torch.cuda.device_count()
+        if gpu_count == 0:
+            print(
+                "No CUDA GPU available, ignoring assigned GPU index, will be using CPU as a CUDA device"
+            )
+            return
+
+        gpu_index = int(gpu_index)
+        torch.cuda.set_device(gpu_index)
+        device_name = torch.cuda.get_device_name(gpu_index)
+        print(f"Using CUDA GPU {gpu_index} : {device_name}")
     else:
         # for each next experiment select next available GPU to maximally balance the load considering multiple running processes
         rotate_active_gpu_selection()
@@ -193,7 +204,7 @@ def rotate_active_gpu_selection():

     gpu_count = torch.cuda.device_count()
     if gpu_count == 0:
-        print("No CUDA GPU available")
+        print("No CUDA GPU available, will be using CPU as a CUDA device")
         return

     elif gpu_count == 1:
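For reference, a minimal standalone sketch of the fallback pattern introduced above (not part of the commit; the select_device helper name is made up, while os/torch and the AINTELOPE_GPU environment variable come from the repository): an assigned GPU index is honoured only when CUDA devices actually exist, otherwise a CPU device is returned.

import os

import torch


def select_device(gpu_index=None):
    """Pick a torch.device, degrading to CPU when no CUDA GPU is present."""
    if gpu_index is None:
        gpu_index = os.environ.get("AINTELOPE_GPU")

    if torch.cuda.device_count() == 0:
        # CPU-only machine: ignore any assigned GPU index
        return torch.device("cpu")

    if gpu_index is not None:
        gpu_index = int(gpu_index)
        torch.cuda.set_device(gpu_index)
        print(f"Using CUDA GPU {gpu_index} : {torch.cuda.get_device_name(gpu_index)}")
        return torch.device(f"cuda:{gpu_index}")

    return torch.device("cuda")  # let CUDA keep its current default device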
12 changes: 7 additions & 5 deletions aintelope/gridsearch.py
@@ -121,28 +121,30 @@ async def run_gridsearch_experiments() -> None:

         if use_multiprocessing:
             # for each next experiment select next available GPU to maximally balance the load considering multiple running processes
-            use_gpu = available_gpus.pop(0)
+            use_gpu_index = available_gpus.pop(0) if any(available_gpus) else None

             arguments = {
                 "gridsearch_params": gridsearch_params,
                 "gridsearch_combination_for_print": gridsearch_combination_for_print,
                 "args": sys.argv,
                 "do_not_create_subprocess": False,
                 "environ": dict(os.environ),
-                "use_gpu_index": use_gpu,
+                "use_gpu_index": use_gpu_index,
             }
             coroutine = asyncio.create_task(
                 run_gridsearch_experiment_multiprocess(**arguments)
             ) # NB! do not await here yet, awaiting will be done below by waiting for a group of coroutines at once.
-            coroutine_gpus[coroutine] = use_gpu
+            coroutine_gpus[coroutine] = use_gpu_index

             active_coroutines.add(coroutine)
             if len(active_coroutines) == num_workers:
                 dones, pendings = await asyncio.wait(
                     active_coroutines, return_when=asyncio.FIRST_COMPLETED
                 )
                 for task in dones:
-                    available_gpus.append(coroutine_gpus[task])
+                    gpu_index = coroutine_gpus[task]
+                    if gpu_index is not None:
+                        available_gpus.append(gpu_index)
                     del coroutine_gpus[task]

             ex = (
@@ -329,7 +331,7 @@ def get_run_gridsearch_experiment_cache_helper_cache_key(gridsearch_params):
     )


-if __name__ == "__main__":
+if __name__ == "__main__": # for multiprocessing support
     register_resolvers()

     if (
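The scheduling pattern above can be summarised in a self-contained sketch (illustrative only: run_experiment and the range(10) workload are stand-ins, not code from this repository). On a CPU-only machine available_gpus starts empty, every task gets use_gpu_index=None, and no index is ever returned to the pool; note that the sketch tests the list for emptiness rather than with any(), so that GPU index 0 is also handed out.

import asyncio

import torch


async def run_experiment(params, use_gpu_index):
    # stand-in for run_gridsearch_experiment_multiprocess(**arguments)
    await asyncio.sleep(0.1)
    return params, use_gpu_index


async def main():
    available_gpus = list(range(torch.cuda.device_count()))  # [] on CPU-only machines
    num_workers = max(1, len(available_gpus))
    coroutine_gpus = {}
    active_tasks = set()

    for params in range(10):  # stand-in for the gridsearch combinations
        use_gpu_index = available_gpus.pop(0) if available_gpus else None
        task = asyncio.create_task(run_experiment(params, use_gpu_index))
        coroutine_gpus[task] = use_gpu_index
        active_tasks.add(task)

        if len(active_tasks) == num_workers:
            dones, active_tasks = await asyncio.wait(
                active_tasks, return_when=asyncio.FIRST_COMPLETED
            )
            for done in dones:
                gpu_index = coroutine_gpus.pop(done)
                if gpu_index is not None:
                    available_gpus.append(gpu_index)  # return the GPU to the pool

    await asyncio.gather(*active_tasks)  # wait for the remaining tasks


if __name__ == "__main__":
    asyncio.run(main())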
6 changes: 3 additions & 3 deletions aintelope/gridsearch_pipeline.py
@@ -42,9 +42,9 @@

 logger = logging.getLogger("aintelope.__main__")

-gpu_count = max(1, torch.cuda.device_count())
+gpu_count = torch.cuda.device_count()
 worker_count_multiplier = 1 # when running pipeline search, then having more workers than GPU-s will cause all sorts of Python and CUDA errors under Windows for some reason, even though there is plenty of free RAM and GPU memory. Yet, when the pipeline processes are run manually, there is no concurrency limit except the real hardware capacity limits. # TODO: why?
-num_workers = gpu_count * worker_count_multiplier
+num_workers = max(1, gpu_count) * worker_count_multiplier

 # needs to be initialised here in order to avoid circular imports in gridsearch
 cache_folder = "gridsearch_cache"
@@ -116,7 +116,7 @@ def run_pipeline(cfg: DictConfig) -> None:
         max_count=num_workers,
         disable=(gridsearch_params_in is not None)
         or (
-            os.name != "nt"
+            os.name != "nt" or gpu_count == 0
         ), # Linux does not unlock semaphore after a process gets killed, therefore disabling Semaphore under Linux until this gets resolved.
     ) as semaphore:
         if gridsearch_params_in is None:
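In isolation, the CPU-only guard added to both pipeline modules amounts to the following sketch (the variable names mirror the diff; the semaphore wrapper itself is outside this diff and is therefore not shown):

import os

import torch

gpu_count = torch.cuda.device_count()  # 0 on CPU-only machines
worker_count_multiplier = 1
num_workers = max(1, gpu_count) * worker_count_multiplier  # never drops to zero workers

# The cross-process semaphore only makes sense on Windows with at least one GPU;
# on Linux a killed process would leave it locked, and on CPU-only machines there
# is no GPU contention to limit, so it is disabled in both cases.
disable_semaphore = os.name != "nt" or gpu_count == 0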
2 changes: 1 addition & 1 deletion aintelope/nonpipeline.py
@@ -90,6 +90,6 @@ def analytics(cfg, score_dimensions, title, experiment_name, do_not_show_plot=Fa
     )


-if __name__ == "__main__":
+if __name__ == "__main__": # for multiprocessing support
     register_resolvers()
     aintelope_main()
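The "# for multiprocessing support" comment refers to the standard entry-point guard; a minimal sketch of why it matters (the worker function below is illustrative, not from the repository): with the spawn start method used on Windows, every child process re-imports the entry module, so unguarded top-level code would run again in each worker.

import multiprocessing


def worker(x):
    return x * x  # placeholder for the real experiment work


if __name__ == "__main__":  # for multiprocessing support: children re-import this module
    with multiprocessing.Pool(2) as pool:
        print(pool.map(worker, range(4)))  # prints [0, 1, 4, 9]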
8 changes: 4 additions & 4 deletions aintelope/pipeline.py
@@ -45,9 +45,9 @@

 logger = logging.getLogger("aintelope.__main__")

-gpu_count = max(1, torch.cuda.device_count())
+gpu_count = torch.cuda.device_count()
 worker_count_multiplier = 1 # when running pipeline search, then having more workers than GPU-s will cause all sorts of Python and CUDA errors under Windows for some reason, even though there is plenty of free RAM and GPU memory. Yet, when the pipeline processes are run manually, there is no concurrency limit except the real hardware capacity limits. # TODO: why?
-num_workers = gpu_count * worker_count_multiplier
+num_workers = max(1, gpu_count) * worker_count_multiplier


 def aintelope_main() -> None:
@@ -90,7 +90,7 @@ def run_pipeline(cfg: DictConfig) -> None:
         semaphore_name,
         max_count=num_workers,
         disable=(
-            os.name != "nt"
+            os.name != "nt" or gpu_count == 0
         ), # Linux does not unlock semaphore after a process gets killed, therefore disabling Semaphore under Linux until this gets resolved.
     ) as semaphore:
         print("Semaphore acquired...")
@@ -302,7 +302,7 @@ def aintelope_main() -> None:
     run_pipeline()


-if __name__ == "__main__":
+if __name__ == "__main__": # for multiprocessing support
     register_resolvers()

     if (
