Add logs in occupy_gpu_ids, and funcs in hardware_utils for debugging #2207

Merged · 1 commit · Jul 2, 2024

25 changes: 25 additions & 0 deletions python/fedml/computing/scheduler/comm_utils/hardware_utils.py
@@ -11,6 +11,29 @@
GPU_CARD_UTILS = [NvidiaGPUtil, QualcommNPUtil]


# This function is just for debugging, can be removed at later point
def get_gpu_list_and_realtime_gpu_available_ids() -> (List[dict], List[int]):
    gpu_list = HardwareUtil.get_gpus()
    gpu_count = len(gpu_list)
    realtime_available_gpu_ids = HardwareUtil.get_available_gpu_ids(order='memory', limit=gpu_count, max_load=0.01,
                                                                    max_memory=0.01)
    return gpu_list, realtime_available_gpu_ids

# This function is just for debugging, can be removed at later point
def trim_unavailable_gpu_ids(gpu_ids) -> List[int]:
    # Trim the gpu ids based on the realtime available gpu id list.
    available_gpu_ids = [int(gpu_id) for gpu_id in gpu_ids]
    gpu_list, realtime_available_gpu_ids = get_gpu_list_and_realtime_gpu_available_ids()
    unavailable_gpu_ids = list()

    for gpu_id in available_gpu_ids:
        if gpu_id not in realtime_available_gpu_ids:
            unavailable_gpu_ids.append(gpu_id)

    trimmed_gpu_ids = list(set(available_gpu_ids) - set(unavailable_gpu_ids))
    return trimmed_gpu_ids.copy()


class HardwareUtil(metaclass=Singleton):
    __gpu_util: Optional[GPUCardUtil] = None

@@ -60,6 +83,8 @@ def get_docker_gpu_ids_by_container_name(container_name: str, docker_client: Doc
if __name__ == "__main__":
    gpus = HardwareUtil.get_gpus()
    get_available_gpu_cards = HardwareUtil.get_available_gpu_ids(limit=len(gpus))
    trimmed_gpu_ids = trim_unavailable_gpu_ids(get_available_gpu_cards)
    print(trimmed_gpu_ids)
    device_mapping = HardwareUtil.get_docker_gpu_device_mapping(get_available_gpu_cards, len(get_available_gpu_cards))
    print(gpus)
    print(get_available_gpu_cards)
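For reference, here is a standalone sketch of the same trimming behaviour the new helper implements, with the realtime list passed in directly and mocked id values (the demo function name and the ids are hypothetical), so it runs without the fedml scheduler installed:

from typing import List


def trim_unavailable_gpu_ids_demo(gpu_ids: List[int], realtime_available_gpu_ids: List[int]) -> List[int]:
    # Same set-difference logic as trim_unavailable_gpu_ids above, but the realtime
    # list is passed in instead of queried via HardwareUtil.
    requested_gpu_ids = [int(gpu_id) for gpu_id in gpu_ids]
    unavailable_gpu_ids = [gpu_id for gpu_id in requested_gpu_ids
                           if gpu_id not in realtime_available_gpu_ids]
    return list(set(requested_gpu_ids) - set(unavailable_gpu_ids))


if __name__ == "__main__":
    cached_ids = [0, 1, 2, 3]   # ids previously seen as available (hypothetical)
    realtime_ids = [0, 2]       # ids the system reports as free right now (hypothetical)
    print(trim_unavailable_gpu_ids_demo(cached_ids, realtime_ids))  # e.g. [0, 2]

Because the result goes through a set difference, duplicates in the requested list are dropped and the output order is not guaranteed.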
7 changes: 7 additions & 0 deletions python/fedml/computing/scheduler/comm_utils/job_utils.py
@@ -86,6 +86,8 @@ def occupy_gpu_ids(self, run_id, request_gpu_num, device_id, inner_id=None,
        # Get the available GPU list, FEDML_GLOBAL_DEVICE_AVAILABLE_GPU_IDS_TAG-${device_id}
        available_gpu_ids = ComputeCacheManager.get_instance().get_gpu_cache().get_device_available_gpu_ids(
            device_id)
        logging.info(
            f"Available GPU Ids fetched from cache: {available_gpu_ids}")

        logging.info(f"Check worker({device_id})'s realtime gpu availability in DB"
                     f" for run {run_id}: {available_gpu_ids}")
@@ -94,8 +96,11 @@
        if available_gpu_ids is None:
            # Get realtime GPU availability list from the system
            available_gpu_ids = JobRunnerUtils.get_realtime_gpu_available_ids().copy()
            logging.info(f"Cache not set yet, fetching realtime available GPU Ids: {available_gpu_ids}")
        else:
            available_gpu_ids = JobRunnerUtils.trim_unavailable_gpu_ids(available_gpu_ids)
            logging.info(
                f"Trimmed available GPU Ids: {available_gpu_ids}")

        # Get the matched gpu ids string by the request gpu num
        cuda_visible_gpu_ids_str, matched_gpu_num = JobRunnerUtils.request_gpu_ids(request_gpu_num,
@@ -119,6 +124,8 @@

        ComputeCacheManager.get_instance().get_gpu_cache().set_device_available_gpu_ids(
            device_id, available_gpu_ids)

        logging.info(f"Updated cache with following available gpu ids: {available_gpu_ids}")

        # For a single run, could be scale up. So if existed such a key, should extend, not replace
        existed_gpu_nums = ComputeCacheManager.get_instance().get_gpu_cache().get_device_run_num_gpus(
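For context, the log statements added to occupy_gpu_ids trace a cache-then-trim fallback: use the cached per-device list when it exists, otherwise query realtime availability. A minimal sketch of that flow, with the cache and realtime lookups stubbed out (the stub names and id values are hypothetical), looks like this:

import logging
from typing import List, Optional

logging.basicConfig(level=logging.INFO)


def get_cached_available_gpu_ids(device_id: int) -> Optional[List[int]]:
    # Stand-in for the ComputeCacheManager per-device lookup; returns None when the cache is unset.
    return None


def get_realtime_gpu_available_ids() -> List[int]:
    # Stand-in for the realtime system query; the ids are hypothetical.
    return [0, 1, 2, 3]


def resolve_available_gpu_ids(device_id: int) -> List[int]:
    available_gpu_ids = get_cached_available_gpu_ids(device_id)
    logging.info(f"Available GPU Ids fetched from cache: {available_gpu_ids}")

    if available_gpu_ids is None:
        # Cache miss: fall back to the realtime availability list.
        available_gpu_ids = get_realtime_gpu_available_ids().copy()
        logging.info(f"Cache not set yet, fetching realtime available GPU Ids: {available_gpu_ids}")
    else:
        # Cache hit: drop any cached ids that are no longer reported as free.
        realtime_ids = get_realtime_gpu_available_ids()
        available_gpu_ids = [gpu_id for gpu_id in available_gpu_ids if gpu_id in realtime_ids]
        logging.info(f"Trimmed available GPU Ids: {available_gpu_ids}")

    return available_gpu_ids


if __name__ == "__main__":
    print(resolve_available_gpu_ids(device_id=0))

Running it with the cache stub returning None exercises the "Cache not set yet" branch; returning a list instead exercises the trimming branch.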