From ae06bf312bf7feb1467a0d481557f2bfa95bc1a4 Mon Sep 17 00:00:00 2001
From: Alay Shah
Date: Tue, 21 May 2024 01:52:39 -0700
Subject: [PATCH 1/6] Add 10 minutes TTL Cache for config fetch

---
 .../computing/scheduler/model_scheduler/modelops_configs.py | 2 ++
 python/fedml/core/mlops/mlops_configs.py                    | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/python/fedml/computing/scheduler/model_scheduler/modelops_configs.py b/python/fedml/computing/scheduler/model_scheduler/modelops_configs.py
index e988c29a8a..719f3825c4 100644
--- a/python/fedml/computing/scheduler/model_scheduler/modelops_configs.py
+++ b/python/fedml/computing/scheduler/model_scheduler/modelops_configs.py
@@ -4,6 +4,7 @@
 import certifi
 import requests
+import cachetools.func

 import fedml
 from fedml.core.mlops.mlops_utils import MLOpsUtils
@@ -32,6 +33,7 @@ def get_instance(args):
         return ModelOpsConfigs._config_instance

     @staticmethod
+    @cachetools.func.ttl_cache(ttl=600)
     def get_request_params():
         url = fedml._get_backend_service()
         url = "{}/fedmlOpsServer/configs/fetch".format(url)
diff --git a/python/fedml/core/mlops/mlops_configs.py b/python/fedml/core/mlops/mlops_configs.py
index c8c6422d6c..6c25c38128 100644
--- a/python/fedml/core/mlops/mlops_configs.py
+++ b/python/fedml/core/mlops/mlops_configs.py
@@ -4,6 +4,7 @@
 import certifi
 import requests
+import cachetools.func

 import fedml
 from fedml.core.mlops.mlops_utils import MLOpsUtils
@@ -41,6 +42,7 @@ def __init__(self):
         pass

     @staticmethod
+    @cachetools.func.ttl_cache(ttl=600)
     def get_request_params():
         url = fedml._get_backend_service()
         url = f"{url}/fedmlOpsServer/configs/fetch"
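
A quick sketch of what the TTL cache in PATCH 1/6 does: cachetools.func.ttl_cache memoizes the decorated function's return value and only re-executes it once the cached entry is older than ttl seconds, so get_request_params() recomputes its result at most about once every 10 minutes per process. Exceptions are not cached, so a failed fetch is retried on the next call. The snippet below is illustrative only; fetch_configs and its counter are stand-ins, not FedML code.

    import cachetools.func

    calls = {"n": 0}

    @cachetools.func.ttl_cache(ttl=600)
    def fetch_configs():
        # Stand-in for MLOpsConfigs.get_request_params(); counts real invocations.
        calls["n"] += 1
        return {"url": "https://example.invalid/fedmlOpsServer/configs/fetch"}

    fetch_configs()
    fetch_configs()    # served from the cache, no second invocation
    print(calls["n"])  # -> 1; the function body runs again only after 600 s elapse
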
From f7ab709a39af6981a3ae4d8985a060f92b937fdb Mon Sep 17 00:00:00 2001
From: Alay Dilipbhai Shah
Date: Tue, 21 May 2024 16:29:09 -0700
Subject: [PATCH 2/6] Update setup.py

---
 python/setup.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/python/setup.py b/python/setup.py
index fa425c98f7..0e314de29c 100644
--- a/python/setup.py
+++ b/python/setup.py
@@ -64,6 +64,8 @@ def finalize_options(self):
     'uvicorn',
     'wandb==0.13.2',
     'wget',
+    # Need to pin this version due to breaking change released in python docker sdk
+    'requests<2.32',
 ]

 requirements_extra_mpi = [

From 649e42fa3f259cc892ca257b822aa844627da39d Mon Sep 17 00:00:00 2001
From: Alay Shah
Date: Tue, 21 May 2024 22:25:18 -0700
Subject: [PATCH 3/6] Remove Docker Client Timeout

---
 .../fedml/computing/scheduler/comm_utils/container_utils.py | 4 ++--
 python/fedml/computing/scheduler/comm_utils/job_utils.py    | 2 +-
 .../scheduler/model_scheduler/device_model_deployment.py    | 4 ++--
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/python/fedml/computing/scheduler/comm_utils/container_utils.py b/python/fedml/computing/scheduler/comm_utils/container_utils.py
index f86e9fe1a2..2f5fa31fb5 100644
--- a/python/fedml/computing/scheduler/comm_utils/container_utils.py
+++ b/python/fedml/computing/scheduler/comm_utils/container_utils.py
@@ -26,7 +26,7 @@ def get_instance():

     def get_docker_client(self):
         try:
-            client = docker.from_env(timeout=5, version="auto")
+            client = docker.from_env()
         except Exception:
             logging.error("Failed to connect to the docker daemon, please ensure that you have "
                           "installed Docker Desktop or Docker Engine, and the docker is running")
@@ -180,7 +180,7 @@ def get_container_rank_same_model(prefix: str):
         running_model_name = hash("model_endpoint_id_{}_name_{}_model_id_{}_name_{}_ver_{}")
         """
         try:
-            client = docker.from_env(timeout=5, version="auto")
+            client = docker.from_env()
         except Exception:
             logging.error("Failed to connect to the docker daemon, please ensure that you have "
                           "installed Docker Desktop or Docker Engine, and the docker is running")
diff --git a/python/fedml/computing/scheduler/comm_utils/job_utils.py b/python/fedml/computing/scheduler/comm_utils/job_utils.py
index 08ce44d1dd..5b9a2c812a 100644
--- a/python/fedml/computing/scheduler/comm_utils/job_utils.py
+++ b/python/fedml/computing/scheduler/comm_utils/job_utils.py
@@ -570,7 +570,7 @@ def get_run_container_name(run_id: int) -> str:
     @staticmethod
     def get_docker_client(docker_args: DockerArgs) -> DockerClient:
         try:
-            client = docker.from_env(timeout=5, version="auto")
+            client = docker.from_env()
             if docker_args.username != "" and docker_args.registry != "":
                 client.login(username=docker_args.username, password=docker_args.password, registry=docker_args.registry)
         except Exception as e:
diff --git a/python/fedml/computing/scheduler/model_scheduler/device_model_deployment.py b/python/fedml/computing/scheduler/model_scheduler/device_model_deployment.py
index f54965b599..1876373d25 100755
--- a/python/fedml/computing/scheduler/model_scheduler/device_model_deployment.py
+++ b/python/fedml/computing/scheduler/model_scheduler/device_model_deployment.py
@@ -210,7 +210,7 @@ def start_deployment(end_point_id, end_point_name, model_id, model_version,
         infer_host = "127.0.0.1"

     try:
-        client = docker.from_env(timeout=5, version="auto")
+        client = docker.from_env()
         if enable_custom_image and docker_registry_user_name != "" and docker_registry_user_password != "" \
                 and docker_registry != "":
             client.login(username=docker_registry_user_name, password=docker_registry_user_password,
                          registry=docker_registry)
@@ -467,7 +467,7 @@ def log_deployment_result(end_point_id, model_id, cmd_container_name, cmd_type,
         logging.info(f"Attempt: {deploy_attempt} / {deploy_attempt_threshold} ...")

         try:
-            client = docker.from_env(timeout=5, version="auto")
+            client = docker.from_env()
         except Exception:
             logging.error("Failed to connect to the docker daemon, please ensure that you have "
                           "installed Docker Desktop or Docker Engine, and the docker is running")
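
On PATCH 3/6: dropping the explicit timeout=5 / version="auto" arguments means the client is built with the docker SDK's defaults; notably, docker-py's default API timeout is 60 seconds in current releases, so slower daemon calls made through this client are less likely to abort with a read timeout. A minimal sketch, assuming a reachable Docker daemon:

    import docker

    # Same call the patch uses: rely on the SDK defaults instead of a hard 5 s cap.
    client = docker.from_env()

    # An explicit ceiling can still be passed if one is ever needed again:
    # client = docker.from_env(timeout=120)

    print(client.ping())  # True when the daemon is reachable
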
From 92b7e162b66fa09b22a6af2dcfc22acf46ddf5cb Mon Sep 17 00:00:00 2001
From: Raphael Jin
Date: Tue, 28 May 2024 17:46:52 +0000
Subject: [PATCH 4/6] [Deploy] Try to convert the gpu_topology value type to int.

---
 .../model_scheduler/device_replica_controller.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/python/fedml/computing/scheduler/model_scheduler/device_replica_controller.py b/python/fedml/computing/scheduler/model_scheduler/device_replica_controller.py
index 667d57c4f4..ea19efb8b6 100644
--- a/python/fedml/computing/scheduler/model_scheduler/device_replica_controller.py
+++ b/python/fedml/computing/scheduler/model_scheduler/device_replica_controller.py
@@ -67,7 +67,9 @@ def __init__(self, master_id, request_json: dict):
     def calc_total_gpu_num(self):
         total_gpu_num = 0
         for device_id, gpu_num in self.devices_avail_gpus.items():
-            total_gpu_num += gpu_num
+            if type(gpu_num) is not int:
+                logging.warning(f"The value in gpu_topology should be int, but got {type(gpu_num)}. Try to convert it.")
+            total_gpu_num += int(gpu_num)
         return total_gpu_num

     def init_id_replica_num(self):
@@ -77,6 +79,11 @@ def init_id_replica_num(self):
         """
         id_replica_num = {}
         for id, avail_num in self.devices_avail_gpus.items():
+            if type(avail_num) is not int:
+                logging.warning(f"The value in gpu_topology should be int, "
+                                f"but got {type(avail_num)}. Try to convert it.")
+                avail_num = int(avail_num)
+
             if avail_num % self.gpu_per_replica != 0:
                 raise ValueError("The number of gpus for each device should be divisible by gpu_per_replica")
             id_replica_num[str(id)] = avail_num // self.gpu_per_replica

From 0a6eba9f6dfda93ed0df500e63b8b64bad5df44d Mon Sep 17 00:00:00 2001
From: Raphael Jin
Date: Tue, 28 May 2024 22:48:38 +0000
Subject: [PATCH 5/6] [Deploy] Fix version diff function.

---
 .../scheduler/model_scheduler/master_job_runner_manager.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/fedml/computing/scheduler/model_scheduler/master_job_runner_manager.py b/python/fedml/computing/scheduler/model_scheduler/master_job_runner_manager.py
index 0bfc205b34..c761cd6d8f 100755
--- a/python/fedml/computing/scheduler/model_scheduler/master_job_runner_manager.py
+++ b/python/fedml/computing/scheduler/model_scheduler/master_job_runner_manager.py
@@ -64,4 +64,4 @@ def generate_request_json_with_replica_num_diff(run_id, edge_id, request_json):

     @staticmethod
     def generate_request_json_with_replica_version_diff(run_id, edge_id, request_json):
-        return FedMLDeployMasterJobRunner.generate_request_json_with_replica_num_diff(run_id, edge_id, request_json)
+        return FedMLDeployMasterJobRunner.generate_request_json_with_replica_version_diff(run_id, edge_id, request_json)

From e8844d389b1aa7809df3ea3d8255c6ce83501e7b Mon Sep 17 00:00:00 2001
From: Raphael Jin
Date: Tue, 28 May 2024 22:50:46 -0400
Subject: [PATCH 6/6] [Deploy] Fix timezone issue using pandas

---
 .../scheduler/model_scheduler/autoscaler/autoscaler.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/python/fedml/computing/scheduler/model_scheduler/autoscaler/autoscaler.py b/python/fedml/computing/scheduler/model_scheduler/autoscaler/autoscaler.py
index bb2b59e7d9..eb9f08b0eb 100644
--- a/python/fedml/computing/scheduler/model_scheduler/autoscaler/autoscaler.py
+++ b/python/fedml/computing/scheduler/model_scheduler/autoscaler/autoscaler.py
@@ -50,11 +50,11 @@ def filter_by_timestamp(cls,
         filtered = metrics
         if before_now_minutes:
             less_than_ts = \
-                str(pd.Timestamp.now() - pd.Timedelta(minutes=before_now_minutes))
+                str(pd.Timestamp.utcnow().replace(tzinfo=None) - pd.Timedelta(minutes=before_now_minutes))
             filtered = metrics.query("'{}' <= {}".format(less_than_ts, "timestamp"))
         if before_now_seconds:
             less_than_ts = \
-                str(pd.Timestamp.now() - pd.Timedelta(seconds=before_now_seconds))
+                str(pd.Timestamp.utcnow().replace(tzinfo=None) - pd.Timedelta(seconds=before_now_seconds))
             filtered = metrics.query("'{}' <= {}".format(less_than_ts, "timestamp"))
         return filtered
@@ -151,6 +151,7 @@ def scale_operation_query_concurrency(cls,
         # Otherwise, we proceed as normal.
         queries_num = period_data.shape[0]
+        logging.info(f"Detect {queries_num} of requests in {concurrent_query_policy.window_size_secs} seconds")

         try:
             # QSR: Queries per Second per Replica: (Number of Queries / Number of Current Replicas) / Window Size
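
On PATCH 6/6: pd.Timestamp.now() returns local wall-clock time, while pd.Timestamp.utcnow().replace(tzinfo=None) yields a timezone-naive timestamp in UTC. If the metric timestamps in the dataframe are stored as naive UTC values (which is what this fix implies), a local-time cutoff shifts the filtering window by the host's UTC offset, either widening it or emptying it. A small self-contained sketch; the sample frame and column names below are made up for illustration:

    import pandas as pd

    # Metric rows with timezone-naive UTC timestamps (illustrative data).
    metrics = pd.DataFrame({
        "timestamp": pd.to_datetime(["2024-05-28 22:30:00", "2024-05-28 22:49:30"]),
        "latency": [0.12, 0.34],
    })

    # Old cutoff: local wall clock; off by the host's UTC offset.
    old_cutoff = str(pd.Timestamp.now() - pd.Timedelta(minutes=15))
    # New cutoff, as in the patch: timezone-naive UTC, comparable with the data.
    new_cutoff = str(pd.Timestamp.utcnow().replace(tzinfo=None) - pd.Timedelta(minutes=15))

    print(old_cutoff, "vs", new_cutoff)
    print(metrics.query("'{}' <= {}".format(new_cutoff, "timestamp")))
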