From 084781f3982c8aae98fe8d3564badc90f7824451 Mon Sep 17 00:00:00 2001
From: alaydshah
Date: Tue, 2 Jul 2024 19:19:51 +0000
Subject: [PATCH] Add logs in occupy_gpu_ids, and funcs in hardware_utils for
 debugging

---
 .../scheduler/comm_utils/hardware_utils.py | 25 +++++++++++++++++++
 .../scheduler/comm_utils/job_utils.py      |  7 ++++++
 2 files changed, 32 insertions(+)

diff --git a/python/fedml/computing/scheduler/comm_utils/hardware_utils.py b/python/fedml/computing/scheduler/comm_utils/hardware_utils.py
index c876948145..56a75fe3e1 100644
--- a/python/fedml/computing/scheduler/comm_utils/hardware_utils.py
+++ b/python/fedml/computing/scheduler/comm_utils/hardware_utils.py
@@ -11,6 +11,29 @@
 GPU_CARD_UTILS = [NvidiaGPUtil, QualcommNPUtil]
 
 
+# This function is just for debugging, can be removed at later point
+def get_gpu_list_and_realtime_gpu_available_ids() -> (List[dict], List[int]):
+    gpu_list = HardwareUtil.get_gpus()
+    gpu_count = len(gpu_list)
+    realtime_available_gpu_ids = HardwareUtil.get_available_gpu_ids(order='memory', limit=gpu_count, max_load=0.01,
+                                                                    max_memory=0.01)
+    return gpu_list, realtime_available_gpu_ids
+
+# This function is just for debugging, can be removed at later point
+def trim_unavailable_gpu_ids(gpu_ids) -> List[int]:
+    # Trim the gpu ids based on the realtime available gpu id list.
+    available_gpu_ids = [int(gpu_id) for gpu_id in gpu_ids]
+    gpu_list, realtime_available_gpu_ids = get_gpu_list_and_realtime_gpu_available_ids()
+    unavailable_gpu_ids = list()
+
+    for gpu_id in available_gpu_ids:
+        if gpu_id not in realtime_available_gpu_ids:
+            unavailable_gpu_ids.append(gpu_id)
+
+    trimmed_gpu_ids = list(set(available_gpu_ids) - set(unavailable_gpu_ids))
+    return trimmed_gpu_ids.copy()
+
+
 class HardwareUtil(metaclass=Singleton):
 
     __gpu_util: Optional[GPUCardUtil] = None
@@ -60,6 +83,8 @@ def get_docker_gpu_ids_by_container_name(container_name: str, docker_client: Doc
 if __name__ == "__main__":
     gpus = HardwareUtil.get_gpus()
     get_available_gpu_cards = HardwareUtil.get_available_gpu_ids(limit=len(gpus))
+    trimmed_gpu_ids = trim_unavailable_gpu_ids(get_available_gpu_cards)
+    print(trimmed_gpu_ids)
     device_mapping = HardwareUtil.get_docker_gpu_device_mapping(get_available_gpu_cards, len(get_available_gpu_cards))
     print(gpus)
     print(get_available_gpu_cards)
diff --git a/python/fedml/computing/scheduler/comm_utils/job_utils.py b/python/fedml/computing/scheduler/comm_utils/job_utils.py
index 5b9a2c812a..8a917e539d 100644
--- a/python/fedml/computing/scheduler/comm_utils/job_utils.py
+++ b/python/fedml/computing/scheduler/comm_utils/job_utils.py
@@ -86,6 +86,8 @@ def occupy_gpu_ids(self, run_id, request_gpu_num, device_id, inner_id=None,
                 # Get the available GPU list, FEDML_GLOBAL_DEVICE_AVAILABLE_GPU_IDS_TAG-${device_id}
                 available_gpu_ids = ComputeCacheManager.get_instance().get_gpu_cache().get_device_available_gpu_ids(
                     device_id)
+                logging.info(
+                    f"Available GPU Ids fetched from cache: {available_gpu_ids}")
 
                 logging.info(f"Check worker({device_id})'s realtime gpu availability in DB"
                              f" for run {run_id}: {available_gpu_ids}")
@@ -94,8 +96,11 @@
                 if available_gpu_ids is None:
                     # Get realtime GPU availability list from the system
                     available_gpu_ids = JobRunnerUtils.get_realtime_gpu_available_ids().copy()
+                    logging.info(f"Cache not set yet, fetching realtime available GPU Ids: {available_gpu_ids}")
                 else:
                     available_gpu_ids = JobRunnerUtils.trim_unavailable_gpu_ids(available_gpu_ids)
+                    logging.info(
+                        f"Trimmed available GPU Ids: {available_gpu_ids}")
 
                 # Get the matched gpu ids string by the request gpu num
                 cuda_visible_gpu_ids_str, matched_gpu_num = JobRunnerUtils.request_gpu_ids(request_gpu_num,
@@ -119,6 +124,8 @@
                 ComputeCacheManager.get_instance().get_gpu_cache().set_device_available_gpu_ids(
                     device_id, available_gpu_ids)
 
+
+                logging.info(f"Updated cache with following available gpu ids: {available_gpu_ids}")
                 # For a single run, could be scale up. So if existed such a key, should extend, not replace
                 existed_gpu_nums = ComputeCacheManager.get_instance().get_gpu_cache().get_device_run_num_gpus(