From cc4f255357108270dab3a0b433a5a3e7d2cbcabd Mon Sep 17 00:00:00 2001
From: Biological Compatibility Benchmarks
Date: Mon, 30 Dec 2024 19:24:52 +0200
Subject: [PATCH] Adding support for CPU-only machines

---
 aintelope/config/config_utils.py | 27 +++++++++++++++++++--------
 aintelope/gridsearch.py          | 12 +++++++-----
 aintelope/gridsearch_pipeline.py |  6 +++---
 aintelope/nonpipeline.py         |  2 +-
 aintelope/pipeline.py            |  8 ++++----
 5 files changed, 34 insertions(+), 21 deletions(-)

diff --git a/aintelope/config/config_utils.py b/aintelope/config/config_utils.py
index 9c1228f..265a840 100644
--- a/aintelope/config/config_utils.py
+++ b/aintelope/config/config_utils.py
@@ -155,13 +155,24 @@ def set_memory_limits():
     set_mem_limits(data_size_limit, address_space_size_limit)
 
 
-def select_gpu():
-    gridsearch_gpu = os.environ.get("AINTELOPE_GPU")
-    if gridsearch_gpu is not None:
-        gridsearch_gpu = int(gridsearch_gpu)
-        torch.cuda.set_device(gridsearch_gpu)
-        device_name = torch.cuda.get_device_name(gridsearch_gpu)
-        print(f"Using CUDA GPU {gridsearch_gpu} : {device_name}")
+def select_gpu(gpu_index=None):
+    if gpu_index is None:
+        gpu_index = os.environ.get("AINTELOPE_GPU")
+
+    # TODO: run some threads on CPU if the available GPU-s do not support the required amount of threads
+
+    if gpu_index is not None:
+        gpu_count = torch.cuda.device_count()
+        if gpu_count == 0:
+            print(
+                "No CUDA GPU available, ignoring assigned GPU index, will be using CPU as a CUDA device"
+            )
+            return
+
+        gpu_index = int(gpu_index)
+        torch.cuda.set_device(gpu_index)
+        device_name = torch.cuda.get_device_name(gpu_index)
+        print(f"Using CUDA GPU {gpu_index} : {device_name}")
     else:
         # for each next experiment select next available GPU to maximally balance the load considering multiple running processes
         rotate_active_gpu_selection()
@@ -193,7 +204,7 @@ def rotate_active_gpu_selection():
 
     gpu_count = torch.cuda.device_count()
     if gpu_count == 0:
-        print("No CUDA GPU available")
+        print("No CUDA GPU available, will be using CPU as a CUDA device")
         return
 
     elif gpu_count == 1:
diff --git a/aintelope/gridsearch.py b/aintelope/gridsearch.py
index 98eb808..7991fac 100644
--- a/aintelope/gridsearch.py
+++ b/aintelope/gridsearch.py
@@ -121,7 +121,7 @@ async def run_gridsearch_experiments() -> None:
 
         if use_multiprocessing:
             # for each next experiment select next available GPU to maximally balance the load considering multiple running processes
-            use_gpu = available_gpus.pop(0)
+            use_gpu_index = available_gpus.pop(0) if any(available_gpus) else None
 
             arguments = {
                 "gridsearch_params": gridsearch_params,
@@ -129,12 +129,12 @@ async def run_gridsearch_experiments() -> None:
                 "args": sys.argv,
                 "do_not_create_subprocess": False,
                 "environ": dict(os.environ),
-                "use_gpu_index": use_gpu,
+                "use_gpu_index": use_gpu_index,
             }
             coroutine = asyncio.create_task(
                 run_gridsearch_experiment_multiprocess(**arguments)
             )  # NB! do not await here yet, awaiting will be done below by waiting for a group of coroutines at once.
-            coroutine_gpus[coroutine] = use_gpu
+            coroutine_gpus[coroutine] = use_gpu_index
 
             active_coroutines.add(coroutine)
             if len(active_coroutines) == num_workers:
@@ -142,7 +142,9 @@ async def run_gridsearch_experiments() -> None:
                     active_coroutines, return_when=asyncio.FIRST_COMPLETED
                 )
                 for task in dones:
-                    available_gpus.append(coroutine_gpus[task])
+                    gpu_index = coroutine_gpus[task]
+                    if gpu_index is not None:
+                        available_gpus.append(gpu_index)
                     del coroutine_gpus[task]
 
                     ex = (
@@ -329,7 +331,7 @@ def get_run_gridsearch_experiment_cache_helper_cache_key(gridsearch_params):
     )
 
 
-if __name__ == "__main__":
+if __name__ == "__main__":  # for multiprocessing support
     register_resolvers()
 
     if (
diff --git a/aintelope/gridsearch_pipeline.py b/aintelope/gridsearch_pipeline.py
index d65cbd9..9a40eaf 100644
--- a/aintelope/gridsearch_pipeline.py
+++ b/aintelope/gridsearch_pipeline.py
@@ -42,9 +42,9 @@
 logger = logging.getLogger("aintelope.__main__")
 
 
-gpu_count = max(1, torch.cuda.device_count())
+gpu_count = torch.cuda.device_count()
 worker_count_multiplier = 1  # when running pipeline search, then having more workers than GPU-s will cause all sorts of Python and CUDA errors under Windows for some reason, even though there is plenty of free RAM and GPU memory. Yet, when the pipeline processes are run manually, there is no concurrency limit except the real hardware capacity limits.  # TODO: why?
-num_workers = gpu_count * worker_count_multiplier
+num_workers = max(1, gpu_count) * worker_count_multiplier
 
 # needs to be initialised here in order to avoid circular imports in gridsearch
 cache_folder = "gridsearch_cache"
@@ -116,7 +116,7 @@ def run_pipeline(cfg: DictConfig) -> None:
         max_count=num_workers,
         disable=(gridsearch_params_in is not None)
         or (
-            os.name != "nt"
+            os.name != "nt" or gpu_count == 0
         ),  # Linux does not unlock semaphore after a process gets killed, therefore disabling Semaphore under Linux until this gets resolved.
     ) as semaphore:
         if gridsearch_params_in is None:
diff --git a/aintelope/nonpipeline.py b/aintelope/nonpipeline.py
index d76d99c..9cb554d 100644
--- a/aintelope/nonpipeline.py
+++ b/aintelope/nonpipeline.py
@@ -90,6 +90,6 @@ def analytics(cfg, score_dimensions, title, experiment_name, do_not_show_plot=Fa
     )
 
 
-if __name__ == "__main__":
+if __name__ == "__main__":  # for multiprocessing support
     register_resolvers()
     aintelope_main()
diff --git a/aintelope/pipeline.py b/aintelope/pipeline.py
index b4a7f45..b970992 100644
--- a/aintelope/pipeline.py
+++ b/aintelope/pipeline.py
@@ -45,9 +45,9 @@
 logger = logging.getLogger("aintelope.__main__")
 
 
-gpu_count = max(1, torch.cuda.device_count())
+gpu_count = torch.cuda.device_count()
 worker_count_multiplier = 1  # when running pipeline search, then having more workers than GPU-s will cause all sorts of Python and CUDA errors under Windows for some reason, even though there is plenty of free RAM and GPU memory. Yet, when the pipeline processes are run manually, there is no concurrency limit except the real hardware capacity limits.  # TODO: why?
-num_workers = gpu_count * worker_count_multiplier
+num_workers = max(1, gpu_count) * worker_count_multiplier
 
 
 def aintelope_main() -> None:
@@ -90,7 +90,7 @@ def run_pipeline(cfg: DictConfig) -> None:
         semaphore_name,
         max_count=num_workers,
         disable=(
-            os.name != "nt"
+            os.name != "nt" or gpu_count == 0
         ),  # Linux does not unlock semaphore after a process gets killed, therefore disabling Semaphore under Linux until this gets resolved.
     ) as semaphore:
         print("Semaphore acquired...")
@@ -302,7 +302,7 @@ def aintelope_main() -> None:
     run_pipeline()
 
 
-if __name__ == "__main__":
+if __name__ == "__main__":  # for multiprocessing support
     register_resolvers()
 
     if (