Skip to content

Commit

Permalink
Merge pull request #1889 from FedML-AI/alexleung/dev_branch
Browse files Browse the repository at this point in the history
[CoreEngine] upload the inferences logs.
  • Loading branch information
fedml-alex authored Feb 3, 2024
2 parents c962519 + 493d830 commit d4afb3b
Show file tree
Hide file tree
Showing 10 changed files with 102 additions and 44 deletions.
2 changes: 1 addition & 1 deletion python/fedml/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
_global_training_type = None
_global_comm_backend = None

__version__ = "0.8.18.dev12"
__version__ = "0.8.18.dev13"


# This is the deployment environment used for different roles (RD/PM/BD/Public Developers). Potential VALUE: local, dev, test, release
Expand Down
93 changes: 60 additions & 33 deletions python/fedml/computing/scheduler/comm_utils/job_monitor.py

Large diffs are not rendered by default.

14 changes: 14 additions & 0 deletions python/fedml/computing/scheduler/comm_utils/run_process_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,3 +186,17 @@ def get_pid_from_cmd_line(cmd_line, break_on_first=True):
pass

return ret_pids

@staticmethod
def is_process_running(process_id):
    """Best-effort check whether *process_id* refers to a live process.

    Args:
        process_id: OS pid to inspect.

    Returns:
        bool: True when the process exists and its status is running,
        sleeping, or idle; False for any other status or when the process
        cannot be inspected (e.g. it has exited or access is denied).
    """
    try:
        # Snapshot the status exactly once: the original queried
        # process.status() up to three times, which is redundant and can
        # race with the process changing state between calls.
        status = psutil.Process(process_id).status()
        return status in (
            psutil.STATUS_RUNNING,
            psutil.STATUS_SLEEPING,
            psutil.STATUS_IDLE,
        )
    except Exception:
        # Deliberate best-effort: NoSuchProcess, AccessDenied, zombie
        # lookups, etc. all mean "not verifiably running".
        return False
5 changes: 3 additions & 2 deletions python/fedml/computing/scheduler/comm_utils/sys_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
"143": "Command terminated with signal 15 (SIGTERM) (kill command)."}

enable_simulation_gpu = False
simulation_gpu_count = 1


def get_sys_runner_info():
Expand Down Expand Up @@ -112,7 +113,7 @@ def get_sys_runner_info():
pass

if enable_simulation_gpu:
gpu_count = 8
gpu_count = simulation_gpu_count
gpu_total_mem = "80G"
gpu_available_mem = "80G"
gpu_vendor = "NVIDIA"
Expand Down Expand Up @@ -156,7 +157,7 @@ def get_gpu_list():
'memoryUsed': 7.0, 'memoryFree': 81042.0, 'driver': '535.54.03', 'gpu_name': 'NVIDIA A100-SXM4-80GB',
'serial': '1320723000504', 'display_mode': 'Enabled', 'display_active': 'Disabled', 'temperature': 33.0}]

return ret_gpu_list
return ret_gpu_list[0:simulation_gpu_count]

gpu_list = GPUtil.getGPUs()
ret_gpu_list = list()
Expand Down
2 changes: 2 additions & 0 deletions python/fedml/computing/scheduler/slave/client_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -1648,6 +1648,8 @@ def setup_agent_mqtt_connection(self, service_config):

JobCleanup.get_instance().sync_data_on_startup(self.edge_id)

os.environ["FEDML_DEPLOY_MASTER_ID"] = str(self.model_device_server.get_edge_id())
os.environ["FEDML_DEPLOY_WORKER_IDS"] = str([client.get_edge_id() for client in self.model_device_client_list])
self.mlops_metrics.stop_device_realtime_perf()
self.mlops_metrics.report_device_realtime_perf(self.args, service_config["mqtt_config"])

Expand Down
2 changes: 2 additions & 0 deletions python/fedml/core/mlops/mlops_device_perfs.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,8 @@ def report_gpu_device_info(edge_id, mqtt_mgr=None):
topic_name = "ml_client/mlops/gpu_device_info"
device_info_json = {
"edgeId": edge_id,
"deploy_master_id": os.environ.get("FEDML_DEPLOY_MASTER_ID", ""),
"deploy_worker_ids": os.environ.get("FEDML_DEPLOY_WORKER_IDS", "[]"),
"memoryTotal": round(total_mem * MLOpsUtils.BYTES_TO_GB, 2),
"memoryAvailable": round(free_mem * MLOpsUtils.BYTES_TO_GB, 2),
"diskSpaceTotal": round(total_disk_size * MLOpsUtils.BYTES_TO_GB, 2),
Expand Down
4 changes: 3 additions & 1 deletion python/fedml/core/mlops/mlops_runtime_log_daemon.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import requests
import yaml

from fedml.computing.scheduler.comm_utils.run_process_utils import RunProcessUtils
from ...core.mlops.mlops_configs import MLOpsConfigs
import fedml

Expand Down Expand Up @@ -478,7 +479,8 @@ def stop_all_log_processor(self):

def is_log_processor_running(self, in_run_id, in_device_id):
    """Return True if a live log-processor child exists for this (run, device) pair.

    An entry only counts when its recorded run id and device id both match
    and its child process object is present and still running on the OS.
    """
    target_run = str(in_run_id)
    target_device = str(in_device_id)
    for child_proc, rec_run_id, rec_device_id in self.log_child_process_list:
        if str(rec_run_id) != target_run or str(rec_device_id) != target_device:
            continue
        if child_proc is None:
            continue
        if RunProcessUtils.is_process_running(child_proc.pid):
            return True

    return False
Expand Down
4 changes: 2 additions & 2 deletions python/fedml/workflow/driver_example/hello_world_job.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,6 @@ bootstrap: |
echo "Bootstrap finished."
computing:
resource_type: H100-80GB-HBM3 # e.g., A100-80G, please check the resource type list by "fedml show-resource-type" or visiting URL: https://open.fedml.ai/accelerator_resource_type
resource_type: A100-80GB-SXM # e.g., A100-80G, please check the resource type list by "fedml show-resource-type" or visiting URL: https://open.fedml.ai/accelerator_resource_type
minimum_num_gpus: 1 # minimum # of GPUs to provision
maximum_cost_per_hour: $0.5 # max cost per hour of all machines for your job
maximum_cost_per_hour: $10 # max cost per hour of all machines for your job
18 changes: 14 additions & 4 deletions python/fedml/workflow/driver_example/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,24 +5,34 @@
from fedml.workflow.jobs import Job, JobStatus
from fedml.workflow.workflow import Workflow

# CURRENT_CONFIG_VERSION = "release"
# MY_API_KEY = "10e87dd6d6574311a80200455e4d9b30"
CURRENT_CONFIG_VERSION = "local"
CURRENT_ON_PREM_LOCAL_HOST = "localhost"
CURRENT_ON_PREM_LOCAL_PORT = 18080
MY_API_KEY = "1316b93c82da40ce90113a2ed12f0b14"


class HelloWorldJob(Job):
def __init__(self, name):
super().__init__(name)
self.run_id = None

def run(self):
fedml.set_env_version("test")
fedml.set_env_version(CURRENT_CONFIG_VERSION)
fedml.set_local_on_premise_platform_host(CURRENT_ON_PREM_LOCAL_HOST)
fedml.set_local_on_premise_platform_port(CURRENT_ON_PREM_LOCAL_PORT)

working_directory = os.path.dirname(os.path.abspath(__file__))
absolute_path = os.path.join(working_directory, "hello_world_job.yaml")
result = fedml.api.launch_job(yaml_file=absolute_path, api_key="30d1bbcae9ec48ffa314caa8e944d187")
result = fedml.api.launch_job(yaml_file=absolute_path, api_key=MY_API_KEY)
if result.run_id and int(result.run_id) > 0:
self.run_id = result.run_id

def status(self):
if self.run_id:
try:
_, run_status = fedml.api.run_status(run_id=self.run_id, api_key="30d1bbcae9ec48ffa314caa8e944d187")
_, run_status = fedml.api.run_status(run_id=self.run_id, api_key=MY_API_KEY)
return JobStatus.get_job_status_from_run_status(run_status)
except Exception as e:
logging.error(f"Error while getting status of run {self.run_id}: {e}")
Expand All @@ -31,7 +41,7 @@ def status(self):
def kill(self):
if self.run_id:
try:
return fedml.api.run_stop(run_id=self.run_id, api_key="30d1bbcae9ec48ffa314caa8e944d187")
return fedml.api.run_stop(run_id=self.run_id, api_key=MY_API_KEY)
except Exception as e:
logging.error(f"Error while stopping run {self.run_id}: {e}")

Expand Down
2 changes: 1 addition & 1 deletion python/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ def finalize_options(self):

setup(
name="fedml",
version="0.8.18.dev12",
version="0.8.18.dev13",
author="FedML Team",
author_email="[email protected]",
description="A research and production integrated edge-cloud library for "
Expand Down

0 comments on commit d4afb3b

Please sign in to comment.