Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Dev/v0.7.0 #1890

Merged
merged 6 commits into from
Feb 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions python/fedml/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
_global_training_type = None
_global_comm_backend = None

__version__ = "0.8.18a11"
__version__ = "0.8.18a13"


# This is the deployment environment used for different roles (RD/PM/BD/Public Developers). Potential VALUE: local, dev, test, release
Expand Down Expand Up @@ -537,7 +537,7 @@ def _get_local_s3_like_service_url():
from fedml import api

from fedml import mlops
from fedml.mlops import log, Artifact, log_artifact, log_model, log_metric, log_llm_record
from fedml.mlops import log, Artifact, log_artifact, log_model, log_metric, log_llm_record, log_endpoint

__all__ = [
"FedMLRunner",
Expand Down
93 changes: 60 additions & 33 deletions python/fedml/computing/scheduler/comm_utils/job_monitor.py

Large diffs are not rendered by default.

14 changes: 14 additions & 0 deletions python/fedml/computing/scheduler/comm_utils/run_process_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,3 +186,17 @@ def get_pid_from_cmd_line(cmd_line, break_on_first=True):
pass

return ret_pids

@staticmethod
def is_process_running(process_id):
    """Return True if *process_id* refers to a live (running/sleeping/idle) process.

    Args:
        process_id: OS process id (int) to look up via psutil.

    Returns:
        bool: True when the process exists and its status is one of
        RUNNING, SLEEPING, or IDLE; False otherwise (including when the
        process does not exist or cannot be inspected).
    """
    is_running = False
    try:
        # Query the status once: repeated .status() calls could observe
        # different states between calls (the original code queried up to
        # three times).
        status = psutil.Process(process_id).status()
        is_running = status in (
            psutil.STATUS_RUNNING,
            psutil.STATUS_SLEEPING,
            psutil.STATUS_IDLE,
        )
    except Exception:
        # Best-effort probe: NoSuchProcess / AccessDenied / invalid pid all
        # mean "not running" for our purposes, so deliberately swallow.
        pass

    return is_running
5 changes: 3 additions & 2 deletions python/fedml/computing/scheduler/comm_utils/sys_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
"143": "Command terminated with signal 15 (SIGTERM) (kill command)."}

enable_simulation_gpu = False
simulation_gpu_count = 1


def get_sys_runner_info():
Expand Down Expand Up @@ -112,7 +113,7 @@ def get_sys_runner_info():
pass

if enable_simulation_gpu:
gpu_count = 8
gpu_count = simulation_gpu_count
gpu_total_mem = "80G"
gpu_available_mem = "80G"
gpu_vendor = "NVIDIA"
Expand Down Expand Up @@ -156,7 +157,7 @@ def get_gpu_list():
'memoryUsed': 7.0, 'memoryFree': 81042.0, 'driver': '535.54.03', 'gpu_name': 'NVIDIA A100-SXM4-80GB',
'serial': '1320723000504', 'display_mode': 'Enabled', 'display_active': 'Disabled', 'temperature': 33.0}]

return ret_gpu_list
return ret_gpu_list[0:simulation_gpu_count]

gpu_list = GPUtil.getGPUs()
ret_gpu_list = list()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -279,12 +279,12 @@ def run(self, process_event, completed_event):
run_id, self.edge_id, ClientConstants.MSG_MLOPS_CLIENT_STATUS_FAILED)
logging.info(f"[endpoint/device][{run_id}/{self.edge_id}] Release gpu resource when the worker deployment occurred exceptions.")
self.release_gpu_ids(run_id)
MLOpsRuntimeLogDaemon.get_instance(self.args).stop_log_processor(self.run_id, self.edge_id)
MLOpsRuntimeLogDaemon.get_instance(self.args).stop_log_processor(run_id, self.edge_id)
time.sleep(2)
sys.exit(1)
finally:
logging.info("Release resources.")
MLOpsRuntimeLogDaemon.get_instance(self.args).stop_log_processor(self.run_id, self.edge_id)
MLOpsRuntimeLogDaemon.get_instance(self.args).stop_log_processor(run_id, self.edge_id)
if self.mlops_metrics is not None:
self.mlops_metrics.stop_sys_perf()
time.sleep(3)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -210,6 +210,8 @@ def run(self, process_event, completed_event):

self.run_process_event = process_event
self.run_process_completed_event = completed_event
run_id = self.request_json.get("end_point_id")

try:
MLOpsUtils.set_ntp_offset(self.ntp_offset)

Expand All @@ -230,14 +232,14 @@ def run(self, process_event, completed_event):
self.mlops_metrics.report_server_training_status(
self.run_id, ServerConstants.MSG_MLOPS_SERVER_STATUS_FAILED,
is_from_model=True, edge_id=self.edge_id)
MLOpsRuntimeLogDaemon.get_instance(self.args).stop_log_processor(self.run_id, self.edge_id)
MLOpsRuntimeLogDaemon.get_instance(self.args).stop_log_processor(run_id, self.edge_id)
if self.mlops_metrics is not None:
self.mlops_metrics.stop_sys_perf()
time.sleep(3)
sys.exit(1)
finally:
logging.info("Release resources.")
MLOpsRuntimeLogDaemon.get_instance(self.args).stop_log_processor(self.run_id, self.edge_id)
MLOpsRuntimeLogDaemon.get_instance(self.args).stop_log_processor(run_id, self.edge_id)
if self.mlops_metrics is not None:
self.mlops_metrics.stop_sys_perf()
time.sleep(3)
Expand Down
2 changes: 2 additions & 0 deletions python/fedml/computing/scheduler/slave/client_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -1648,6 +1648,8 @@ def setup_agent_mqtt_connection(self, service_config):

JobCleanup.get_instance().sync_data_on_startup(self.edge_id)

os.environ["FEDML_DEPLOY_MASTER_ID"] = str(self.model_device_server.get_edge_id())
os.environ["FEDML_DEPLOY_WORKER_IDS"] = str([client.get_edge_id() for client in self.model_device_client_list])
self.mlops_metrics.stop_device_realtime_perf()
self.mlops_metrics.report_device_realtime_perf(self.args, service_config["mqtt_config"])

Expand Down
2 changes: 2 additions & 0 deletions python/fedml/core/mlops/mlops_device_perfs.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,8 @@ def report_gpu_device_info(edge_id, mqtt_mgr=None):
topic_name = "ml_client/mlops/gpu_device_info"
device_info_json = {
"edgeId": edge_id,
"deploy_master_id": os.environ.get("FEDML_DEPLOY_MASTER_ID", ""),
"deploy_worker_ids": os.environ.get("FEDML_DEPLOY_WORKER_IDS", "[]"),
"memoryTotal": round(total_mem * MLOpsUtils.BYTES_TO_GB, 2),
"memoryAvailable": round(free_mem * MLOpsUtils.BYTES_TO_GB, 2),
"diskSpaceTotal": round(total_disk_size * MLOpsUtils.BYTES_TO_GB, 2),
Expand Down
4 changes: 3 additions & 1 deletion python/fedml/core/mlops/mlops_runtime_log_daemon.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import requests
import yaml

from fedml.computing.scheduler.comm_utils.run_process_utils import RunProcessUtils
from ...core.mlops.mlops_configs import MLOpsConfigs
import fedml

Expand Down Expand Up @@ -478,7 +479,8 @@ def stop_all_log_processor(self):

def is_log_processor_running(self, in_run_id, in_device_id):
    """Return True when a live log-processor child exists for (run, device).

    An entry in ``self.log_child_process_list`` matches when its run id and
    device id equal the requested pair (string comparison); the entry only
    counts if its child process object is present and the OS process is
    still alive per ``RunProcessUtils.is_process_running``.
    """
    wanted = (str(in_run_id), str(in_device_id))
    for child_proc, run_id, device_id in self.log_child_process_list:
        if (str(run_id), str(device_id)) != wanted:
            continue
        if child_proc is not None and RunProcessUtils.is_process_running(child_proc.pid):
            return True

    return False
Expand Down
4 changes: 2 additions & 2 deletions python/fedml/workflow/driver_example/hello_world_job.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,6 @@ bootstrap: |
echo "Bootstrap finished."

computing:
resource_type: H100-80GB-HBM3 # e.g., A100-80G, please check the resource type list by "fedml show-resource-type" or visiting URL: https://open.fedml.ai/accelerator_resource_type
resource_type: A100-80GB-SXM # e.g., A100-80G, please check the resource type list by "fedml show-resource-type" or visiting URL: https://open.fedml.ai/accelerator_resource_type
minimum_num_gpus: 1 # minimum # of GPUs to provision
maximum_cost_per_hour: $0.5 # max cost per hour of all machines for your job
maximum_cost_per_hour: $10 # max cost per hour of all machines for your job
18 changes: 14 additions & 4 deletions python/fedml/workflow/driver_example/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,24 +5,34 @@
from fedml.workflow.jobs import Job, JobStatus
from fedml.workflow.workflow import Workflow

# CURRENT_CONFIG_VERSION = "release"
# MY_API_KEY = "10e87dd6d6574311a80200455e4d9b30"
CURRENT_CONFIG_VERSION = "local"
CURRENT_ON_PREM_LOCAL_HOST = "localhost"
CURRENT_ON_PREM_LOCAL_PORT = 18080
MY_API_KEY = "1316b93c82da40ce90113a2ed12f0b14"


class HelloWorldJob(Job):
def __init__(self, name):
super().__init__(name)
self.run_id = None

def run(self):
fedml.set_env_version("test")
fedml.set_env_version(CURRENT_CONFIG_VERSION)
fedml.set_local_on_premise_platform_host(CURRENT_ON_PREM_LOCAL_HOST)
fedml.set_local_on_premise_platform_port(CURRENT_ON_PREM_LOCAL_PORT)

working_directory = os.path.dirname(os.path.abspath(__file__))
absolute_path = os.path.join(working_directory, "hello_world_job.yaml")
result = fedml.api.launch_job(yaml_file=absolute_path, api_key="30d1bbcae9ec48ffa314caa8e944d187")
result = fedml.api.launch_job(yaml_file=absolute_path, api_key=MY_API_KEY)
if result.run_id and int(result.run_id) > 0:
self.run_id = result.run_id

def status(self):
if self.run_id:
try:
_, run_status = fedml.api.run_status(run_id=self.run_id, api_key="30d1bbcae9ec48ffa314caa8e944d187")
_, run_status = fedml.api.run_status(run_id=self.run_id, api_key=MY_API_KEY)
return JobStatus.get_job_status_from_run_status(run_status)
except Exception as e:
logging.error(f"Error while getting status of run {self.run_id}: {e}")
Expand All @@ -31,7 +41,7 @@ def status(self):
def kill(self):
if self.run_id:
try:
return fedml.api.run_stop(run_id=self.run_id, api_key="30d1bbcae9ec48ffa314caa8e944d187")
return fedml.api.run_stop(run_id=self.run_id, api_key=MY_API_KEY)
except Exception as e:
logging.error(f"Error while stopping run {self.run_id}: {e}")

Expand Down
2 changes: 1 addition & 1 deletion python/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ def finalize_options(self):

setup(
name="fedml",
version="0.8.18a11",
version="0.8.18a13",
author="FedML Team",
author_email="[email protected]",
description="A research and production integrated edge-cloud library for "
Expand Down
Loading