Skip to content

Commit

Permalink
Merge pull request #1889 from FedML-AI/alexleung/dev_branch
Browse files Browse the repository at this point in the history
[CoreEngine] upload the inferences logs.
  • Loading branch information
fedml-alex authored Feb 3, 2024
2 parents c962519 + 493d830 commit d4afb3b
Show file tree
Hide file tree
Showing 10 changed files with 102 additions and 44 deletions.
2 changes: 1 addition & 1 deletion python/fedml/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
_global_training_type = None
_global_comm_backend = None

__version__ = "0.8.18.dev12"
__version__ = "0.8.18.dev13"


# This is the deployment environment used for different roles (RD/PM/BD/Public Developers). Potential VALUE: local, dev, test, release
Expand Down
93 changes: 60 additions & 33 deletions python/fedml/computing/scheduler/comm_utils/job_monitor.py

Large diffs are not rendered by default.

14 changes: 14 additions & 0 deletions python/fedml/computing/scheduler/comm_utils/run_process_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,3 +186,17 @@ def get_pid_from_cmd_line(cmd_line, break_on_first=True):
pass

return ret_pids

@staticmethod
def is_process_running(process_id):
    """Best-effort check whether *process_id* refers to a live process.

    Args:
        process_id: OS pid to inspect.

    Returns:
        bool: True when the process exists and its status is running,
        sleeping, or idle; False for any other status or when the process
        cannot be inspected (e.g. it has exited or access is denied).
    """
    try:
        # Snapshot the status exactly once: the original queried
        # process.status() up to three times, which is redundant and can
        # race with the process changing state between calls.
        status = psutil.Process(process_id).status()
        return status in (
            psutil.STATUS_RUNNING,
            psutil.STATUS_SLEEPING,
            psutil.STATUS_IDLE,
        )
    except Exception:
        # Deliberate best-effort: NoSuchProcess, AccessDenied, zombie
        # lookups, etc. all mean "not verifiably running".
        return False
5 changes: 3 additions & 2 deletions python/fedml/computing/scheduler/comm_utils/sys_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
"143": "Command terminated with signal 15 (SIGTERM) (kill command)."}

enable_simulation_gpu = False
simulation_gpu_count = 1


def get_sys_runner_info():
Expand Down Expand Up @@ -112,7 +113,7 @@ def get_sys_runner_info():
pass

if enable_simulation_gpu:
gpu_count = 8
gpu_count = simulation_gpu_count
gpu_total_mem = "80G"
gpu_available_mem = "80G"
gpu_vendor = "NVIDIA"
Expand Down Expand Up @@ -156,7 +157,7 @@ def get_gpu_list():
'memoryUsed': 7.0, 'memoryFree': 81042.0, 'driver': '535.54.03', 'gpu_name': 'NVIDIA A100-SXM4-80GB',
'serial': '1320723000504', 'display_mode': 'Enabled', 'display_active': 'Disabled', 'temperature': 33.0}]

return ret_gpu_list
return ret_gpu_list[0:simulation_gpu_count]

gpu_list = GPUtil.getGPUs()
ret_gpu_list = list()
Expand Down
2 changes: 2 additions & 0 deletions python/fedml/computing/scheduler/slave/client_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -1648,6 +1648,8 @@ def setup_agent_mqtt_connection(self, service_config):

JobCleanup.get_instance().sync_data_on_startup(self.edge_id)

os.environ["FEDML_DEPLOY_MASTER_ID"] = str(self.model_device_server.get_edge_id())
os.environ["FEDML_DEPLOY_WORKER_IDS"] = str([client.get_edge_id() for client in self.model_device_client_list])
self.mlops_metrics.stop_device_realtime_perf()
self.mlops_metrics.report_device_realtime_perf(self.args, service_config["mqtt_config"])

Expand Down
2 changes: 2 additions & 0 deletions python/fedml/core/mlops/mlops_device_perfs.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,8 @@ def report_gpu_device_info(edge_id, mqtt_mgr=None):
topic_name = "ml_client/mlops/gpu_device_info"
device_info_json = {
"edgeId": edge_id,
"deploy_master_id": os.environ.get("FEDML_DEPLOY_MASTER_ID", ""),
"deploy_worker_ids": os.environ.get("FEDML_DEPLOY_WORKER_IDS", "[]"),
"memoryTotal": round(total_mem * MLOpsUtils.BYTES_TO_GB, 2),
"memoryAvailable": round(free_mem * MLOpsUtils.BYTES_TO_GB, 2),
"diskSpaceTotal": round(total_disk_size * MLOpsUtils.BYTES_TO_GB, 2),
Expand Down
4 changes: 3 additions & 1 deletion python/fedml/core/mlops/mlops_runtime_log_daemon.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import requests
import yaml

from fedml.computing.scheduler.comm_utils.run_process_utils import RunProcessUtils
from ...core.mlops.mlops_configs import MLOpsConfigs
import fedml

Expand Down Expand Up @@ -478,7 +479,8 @@ def stop_all_log_processor(self):

def is_log_processor_running(self, in_run_id, in_device_id):
    """Return True if a live log-processor child exists for this (run, device) pair.

    An entry only counts when its recorded run id and device id both match
    and its child process object is present and still running on the OS.
    """
    target_run = str(in_run_id)
    target_device = str(in_device_id)
    for child_proc, rec_run_id, rec_device_id in self.log_child_process_list:
        if str(rec_run_id) != target_run or str(rec_device_id) != target_device:
            continue
        if child_proc is None:
            continue
        if RunProcessUtils.is_process_running(child_proc.pid):
            return True

    return False
Expand Down
4 changes: 2 additions & 2 deletions python/fedml/workflow/driver_example/hello_world_job.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,6 @@ bootstrap: |
echo "Bootstrap finished."
computing:
resource_type: H100-80GB-HBM3 # e.g., A100-80G, please check the resource type list by "fedml show-resource-type" or visiting URL: https://open.fedml.ai/accelerator_resource_type
resource_type: A100-80GB-SXM # e.g., A100-80G, please check the resource type list by "fedml show-resource-type" or visiting URL: https://open.fedml.ai/accelerator_resource_type
minimum_num_gpus: 1 # minimum # of GPUs to provision
maximum_cost_per_hour: $0.5 # max cost per hour of all machines for your job
maximum_cost_per_hour: $10 # max cost per hour of all machines for your job
18 changes: 14 additions & 4 deletions python/fedml/workflow/driver_example/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,24 +5,34 @@
from fedml.workflow.jobs import Job, JobStatus
from fedml.workflow.workflow import Workflow

# CURRENT_CONFIG_VERSION = "release"
# MY_API_KEY = "10e87dd6d6574311a80200455e4d9b30"
CURRENT_CONFIG_VERSION = "local"
CURRENT_ON_PREM_LOCAL_HOST = "localhost"
CURRENT_ON_PREM_LOCAL_PORT = 18080
MY_API_KEY = "1316b93c82da40ce90113a2ed12f0b14"


class HelloWorldJob(Job):
def __init__(self, name):
super().__init__(name)
self.run_id = None

def run(self):
fedml.set_env_version("test")
fedml.set_env_version(CURRENT_CONFIG_VERSION)
fedml.set_local_on_premise_platform_host(CURRENT_ON_PREM_LOCAL_HOST)
fedml.set_local_on_premise_platform_port(CURRENT_ON_PREM_LOCAL_PORT)

working_directory = os.path.dirname(os.path.abspath(__file__))
absolute_path = os.path.join(working_directory, "hello_world_job.yaml")
result = fedml.api.launch_job(yaml_file=absolute_path, api_key="30d1bbcae9ec48ffa314caa8e944d187")
result = fedml.api.launch_job(yaml_file=absolute_path, api_key=MY_API_KEY)
if result.run_id and int(result.run_id) > 0:
self.run_id = result.run_id

def status(self):
if self.run_id:
try:
_, run_status = fedml.api.run_status(run_id=self.run_id, api_key="30d1bbcae9ec48ffa314caa8e944d187")
_, run_status = fedml.api.run_status(run_id=self.run_id, api_key=MY_API_KEY)
return JobStatus.get_job_status_from_run_status(run_status)
except Exception as e:
logging.error(f"Error while getting status of run {self.run_id}: {e}")
Expand All @@ -31,7 +41,7 @@ def status(self):
def kill(self):
if self.run_id:
try:
return fedml.api.run_stop(run_id=self.run_id, api_key="30d1bbcae9ec48ffa314caa8e944d187")
return fedml.api.run_stop(run_id=self.run_id, api_key=MY_API_KEY)
except Exception as e:
logging.error(f"Error while stopping run {self.run_id}: {e}")

Expand Down
2 changes: 1 addition & 1 deletion python/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ def finalize_options(self):

setup(
name="fedml",
version="0.8.18.dev12",
version="0.8.18.dev13",
author="FedML Team",
author_email="[email protected]",
description="A research and production integrated edge-cloud library for "
Expand Down

0 comments on commit d4afb3b

Please sign in to comment.