Commit cc4f255

Adding support for CPU-only machines
Biological Compatibility Benchmarks committed Dec 30, 2024
1 parent 8180c50 commit cc4f255
Showing 5 changed files with 34 additions and 21 deletions.
27 changes: 19 additions & 8 deletions aintelope/config/config_utils.py
@@ -155,13 +155,24 @@ def set_memory_limits():
     set_mem_limits(data_size_limit, address_space_size_limit)


-def select_gpu():
-    gridsearch_gpu = os.environ.get("AINTELOPE_GPU")
-    if gridsearch_gpu is not None:
-        gridsearch_gpu = int(gridsearch_gpu)
-        torch.cuda.set_device(gridsearch_gpu)
-        device_name = torch.cuda.get_device_name(gridsearch_gpu)
-        print(f"Using CUDA GPU {gridsearch_gpu} : {device_name}")
+def select_gpu(gpu_index=None):
+    if gpu_index is None:
+        gpu_index = os.environ.get("AINTELOPE_GPU")
+
+    # TODO: run some threads on CPU if the available GPU-s do not support the required amount of threads
+
+    if gpu_index is not None:
+        gpu_count = torch.cuda.device_count()
+        if gpu_count == 0:
+            print(
+                "No CUDA GPU available, ignoring assigned GPU index, will be using CPU as a CUDA device"
+            )
+            return
+
+        gpu_index = int(gpu_index)
+        torch.cuda.set_device(gpu_index)
+        device_name = torch.cuda.get_device_name(gpu_index)
+        print(f"Using CUDA GPU {gpu_index} : {device_name}")
     else:
         # for each next experiment select next available GPU to maximally balance the load considering multiple running processes
         rotate_active_gpu_selection()
@@ -193,7 +204,7 @@ def rotate_active_gpu_selection():

     gpu_count = torch.cuda.device_count()
     if gpu_count == 0:
-        print("No CUDA GPU available")
+        print("No CUDA GPU available, will be using CPU as a CUDA device")
         return

     elif gpu_count == 1:
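For reference, a minimal standalone sketch of the fallback pattern introduced above (not part of the commit; the select_device helper name is made up, while os/torch and the AINTELOPE_GPU environment variable come from the repository): an assigned GPU index is honoured only when CUDA devices actually exist, otherwise a CPU device is returned.

import os

import torch


def select_device(gpu_index=None):
    """Pick a torch.device, degrading to CPU when no CUDA GPU is present."""
    if gpu_index is None:
        gpu_index = os.environ.get("AINTELOPE_GPU")

    if torch.cuda.device_count() == 0:
        # CPU-only machine: ignore any assigned GPU index
        return torch.device("cpu")

    if gpu_index is not None:
        gpu_index = int(gpu_index)
        torch.cuda.set_device(gpu_index)
        print(f"Using CUDA GPU {gpu_index} : {torch.cuda.get_device_name(gpu_index)}")
        return torch.device(f"cuda:{gpu_index}")

    return torch.device("cuda")  # let CUDA keep its current default device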
12 changes: 7 additions & 5 deletions aintelope/gridsearch.py
@@ -121,28 +121,30 @@ async def run_gridsearch_experiments() -> None:

         if use_multiprocessing:
             # for each next experiment select next available GPU to maximally balance the load considering multiple running processes
-            use_gpu = available_gpus.pop(0)
+            use_gpu_index = available_gpus.pop(0) if any(available_gpus) else None

             arguments = {
                 "gridsearch_params": gridsearch_params,
                 "gridsearch_combination_for_print": gridsearch_combination_for_print,
                 "args": sys.argv,
                 "do_not_create_subprocess": False,
                 "environ": dict(os.environ),
-                "use_gpu_index": use_gpu,
+                "use_gpu_index": use_gpu_index,
             }
             coroutine = asyncio.create_task(
                 run_gridsearch_experiment_multiprocess(**arguments)
             ) # NB! do not await here yet, awaiting will be done below by waiting for a group of coroutines at once.
-            coroutine_gpus[coroutine] = use_gpu
+            coroutine_gpus[coroutine] = use_gpu_index

             active_coroutines.add(coroutine)
             if len(active_coroutines) == num_workers:
                 dones, pendings = await asyncio.wait(
                     active_coroutines, return_when=asyncio.FIRST_COMPLETED
                 )
                 for task in dones:
-                    available_gpus.append(coroutine_gpus[task])
+                    gpu_index = coroutine_gpus[task]
+                    if gpu_index is not None:
+                        available_gpus.append(gpu_index)
                     del coroutine_gpus[task]

             ex = (
@@ -329,7 +331,7 @@ def get_run_gridsearch_experiment_cache_helper_cache_key(gridsearch_params):
     )


-if __name__ == "__main__":
+if __name__ == "__main__": # for multiprocessing support
     register_resolvers()

     if (
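The scheduling pattern above can be summarised in a self-contained sketch (illustrative only: run_experiment and the range(10) workload are stand-ins, not code from this repository). On a CPU-only machine available_gpus starts empty, every task gets use_gpu_index=None, and no index is ever returned to the pool; note that the sketch tests the list for emptiness rather than with any(), so that GPU index 0 is also handed out.

import asyncio

import torch


async def run_experiment(params, use_gpu_index):
    # stand-in for run_gridsearch_experiment_multiprocess(**arguments)
    await asyncio.sleep(0.1)
    return params, use_gpu_index


async def main():
    available_gpus = list(range(torch.cuda.device_count()))  # [] on CPU-only machines
    num_workers = max(1, len(available_gpus))
    coroutine_gpus = {}
    active_tasks = set()

    for params in range(10):  # stand-in for the gridsearch combinations
        use_gpu_index = available_gpus.pop(0) if available_gpus else None
        task = asyncio.create_task(run_experiment(params, use_gpu_index))
        coroutine_gpus[task] = use_gpu_index
        active_tasks.add(task)

        if len(active_tasks) == num_workers:
            dones, active_tasks = await asyncio.wait(
                active_tasks, return_when=asyncio.FIRST_COMPLETED
            )
            for done in dones:
                gpu_index = coroutine_gpus.pop(done)
                if gpu_index is not None:
                    available_gpus.append(gpu_index)  # return the GPU to the pool

    await asyncio.gather(*active_tasks)  # wait for the remaining tasks


if __name__ == "__main__":
    asyncio.run(main())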
6 changes: 3 additions & 3 deletions aintelope/gridsearch_pipeline.py
@@ -42,9 +42,9 @@

 logger = logging.getLogger("aintelope.__main__")

-gpu_count = max(1, torch.cuda.device_count())
+gpu_count = torch.cuda.device_count()
 worker_count_multiplier = 1 # when running pipeline search, then having more workers than GPU-s will cause all sorts of Python and CUDA errors under Windows for some reason, even though there is plenty of free RAM and GPU memory. Yet, when the pipeline processes are run manually, there is no concurrency limit except the real hardware capacity limits. # TODO: why?
-num_workers = gpu_count * worker_count_multiplier
+num_workers = max(1, gpu_count) * worker_count_multiplier

 # needs to be initialised here in order to avoid circular imports in gridsearch
 cache_folder = "gridsearch_cache"
@@ -116,7 +116,7 @@ def run_pipeline(cfg: DictConfig) -> None:
         max_count=num_workers,
         disable=(gridsearch_params_in is not None)
         or (
-            os.name != "nt"
+            os.name != "nt" or gpu_count == 0
         ), # Linux does not unlock semaphore after a process gets killed, therefore disabling Semaphore under Linux until this gets resolved.
     ) as semaphore:
         if gridsearch_params_in is None:
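In isolation, the CPU-only guard added to both pipeline modules amounts to the following sketch (the variable names mirror the diff; the semaphore wrapper itself is outside this diff and is therefore not shown):

import os

import torch

gpu_count = torch.cuda.device_count()  # 0 on CPU-only machines
worker_count_multiplier = 1
num_workers = max(1, gpu_count) * worker_count_multiplier  # never drops to zero workers

# The cross-process semaphore only makes sense on Windows with at least one GPU;
# on Linux a killed process would leave it locked, and on CPU-only machines there
# is no GPU contention to limit, so it is disabled in both cases.
disable_semaphore = os.name != "nt" or gpu_count == 0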
2 changes: 1 addition & 1 deletion aintelope/nonpipeline.py
@@ -90,6 +90,6 @@ def analytics(cfg, score_dimensions, title, experiment_name, do_not_show_plot=Fa
     )


-if __name__ == "__main__":
+if __name__ == "__main__": # for multiprocessing support
     register_resolvers()
     aintelope_main()
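The "# for multiprocessing support" comment refers to the standard entry-point guard; a minimal sketch of why it matters (the worker function below is illustrative, not from the repository): with the spawn start method used on Windows, every child process re-imports the entry module, so unguarded top-level code would run again in each worker.

import multiprocessing


def worker(x):
    return x * x  # placeholder for the real experiment work


if __name__ == "__main__":  # for multiprocessing support: children re-import this module
    with multiprocessing.Pool(2) as pool:
        print(pool.map(worker, range(4)))  # prints [0, 1, 4, 9]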
8 changes: 4 additions & 4 deletions aintelope/pipeline.py
@@ -45,9 +45,9 @@

 logger = logging.getLogger("aintelope.__main__")

-gpu_count = max(1, torch.cuda.device_count())
+gpu_count = torch.cuda.device_count()
 worker_count_multiplier = 1 # when running pipeline search, then having more workers than GPU-s will cause all sorts of Python and CUDA errors under Windows for some reason, even though there is plenty of free RAM and GPU memory. Yet, when the pipeline processes are run manually, there is no concurrency limit except the real hardware capacity limits. # TODO: why?
-num_workers = gpu_count * worker_count_multiplier
+num_workers = max(1, gpu_count) * worker_count_multiplier


 def aintelope_main() -> None:
@@ -90,7 +90,7 @@ def run_pipeline(cfg: DictConfig) -> None:
         semaphore_name,
         max_count=num_workers,
         disable=(
-            os.name != "nt"
+            os.name != "nt" or gpu_count == 0
         ), # Linux does not unlock semaphore after a process gets killed, therefore disabling Semaphore under Linux until this gets resolved.
     ) as semaphore:
         print("Semaphore acquired...")
@@ -302,7 +302,7 @@ def aintelope_main() -> None:
     run_pipeline()


-if __name__ == "__main__":
+if __name__ == "__main__": # for multiprocessing support
     register_resolvers()

     if (
