diff --git a/python/fedml/__init__.py b/python/fedml/__init__.py
index 2e95fedeeb..81e864b2ea 100644
--- a/python/fedml/__init__.py
+++ b/python/fedml/__init__.py
@@ -34,7 +34,7 @@
 _global_training_type = None
 _global_comm_backend = None
 
-__version__ = "0.8.18a11"
+__version__ = "0.8.18a13"
 
 
 # This is the deployment environment used for different roles (RD/PM/BD/Public Developers). Potential VALUE: local, dev, test, release
@@ -537,7 +537,7 @@ def _get_local_s3_like_service_url():
 
 from fedml import api
 from fedml import mlops
-from fedml.mlops import log, Artifact, log_artifact, log_model, log_metric, log_llm_record
+from fedml.mlops import log, Artifact, log_artifact, log_model, log_metric, log_llm_record, log_endpoint
 
 __all__ = [
     "FedMLRunner",
diff --git a/python/fedml/computing/scheduler/comm_utils/job_monitor.py b/python/fedml/computing/scheduler/comm_utils/job_monitor.py
index 178cd42828..2077b2509f 100644
--- a/python/fedml/computing/scheduler/comm_utils/job_monitor.py
+++ b/python/fedml/computing/scheduler/comm_utils/job_monitor.py
@@ -84,7 +84,8 @@ def monitor_slave_run_process_status(self):
                     if not self.released_runs.get(str(job.job_id), False):
                         self.released_runs[str(job.job_id)] = True
                         # Release the gpu ids
-                        print(f"[run/device][{job.job_id}/{job.edge_id}] Release gpu resource when run processes has exited on monioring slave runs periodically.")
+                        print(
+                            f"[run/device][{job.job_id}/{job.edge_id}] Release gpu resource when run processes has exited on monioring slave runs periodically.")
                         JobRunnerUtils.get_instance().release_gpu_ids(job.job_id, job.edge_id)
 
                 # Get the timeout threshold
@@ -137,7 +138,8 @@ def monitor_slave_run_process_status(self):
                             self.released_endpoints[str(job.job_id)] = True
 
                             # Release the gpu ids
-                            print(f"[endpoint/device][{job.job_id}/{job.edge_id}] Release gpu resource when monioring worker endpoint periodically.")
+                            print(
+                                f"[endpoint/device][{job.job_id}/{job.edge_id}] Release gpu resource when monioring worker endpoint periodically.")
                             JobRunnerUtils.get_instance().release_gpu_ids(job.job_id, job.edge_id)
 
         except Exception as e:
@@ -235,7 +237,8 @@ def monitor_slave_endpoint_status(self):
                 device_client_data_interface.FedMLClientDataInterface.get_instance().create_job_table()
             except Exception as e:
                 pass
-            FedMLModelDatabase.get_instance().set_database_base_dir(device_client_constants.ClientConstants.get_database_dir())
+            FedMLModelDatabase.get_instance().set_database_base_dir(
+                device_client_constants.ClientConstants.get_database_dir())
             job_list = device_client_data_interface.FedMLClientDataInterface.get_instance().get_jobs_from_db()
             agent_config = dict()
             agent_config["mqtt_config"] = self.mqtt_config
@@ -252,7 +255,8 @@ def monitor_slave_endpoint_status(self):
                         self.released_endpoints[str(job.job_id)] = True
 
                         # Release the gpu ids
-                        print(f"[endpoint/device][{job.job_id}/{job.edge_id}] Release gpu resource when woker endpoint failed on monitoring periodically.")
+                        print(
+                            f"[endpoint/device][{job.job_id}/{job.edge_id}] Release gpu resource when woker endpoint failed on monitoring periodically.")
                         JobRunnerUtils.get_instance().release_gpu_ids(job.job_id, job.edge_id)
 
                 elif job.status == device_client_constants.ClientConstants.MSG_MLOPS_CLIENT_STATUS_FINISHED:
@@ -295,7 +299,7 @@ def monitor_slave_endpoint_status(self):
                                 device_ids[0], job.edge_id, job.job_id, endpoint_name, model_name,
                                 model_id, model_version, inference_port=None, disable=True)
-                            
+
                             # [Critical]
                             # 1. After restart,
                             # the "running" status of container does NOT mean the endpoint is ready.
@@ -304,18 +308,22 @@ def monitor_slave_endpoint_status(self):
                             status = result_json.get("model_status", None)
 
                             if status != device_server_constants.ServerConstants.MSG_MODELOPS_DEPLOYMENT_STATUS_UPDATING:
-                                started, inference_port = ContainerUtils.get_instance().restart_container(endpoint_container_name)
-                                deployment_result["model_status"] = device_server_constants.ServerConstants.MSG_MODELOPS_DEPLOYMENT_STATUS_UPDATING
-                                
+                                started, inference_port = ContainerUtils.get_instance().restart_container(
+                                    endpoint_container_name)
+                                deployment_result[
+                                    "model_status"] = device_server_constants.ServerConstants.MSG_MODELOPS_DEPLOYMENT_STATUS_UPDATING
+
                                 # Change the local port for next ready check
                                 endpoint_sync_protocol.set_local_deployment_status_result(
                                     job.job_id, endpoint_name, model_name, model_version, job.edge_id,
                                     inference_port, status_result, deployment_result)
                             else:
-                                started, inference_port = ContainerUtils.get_instance().start_container(endpoint_container_name)
-                                
+                                started, inference_port = ContainerUtils.get_instance().start_container(
+                                    endpoint_container_name)
+
                                 if is_endpoint_ready:
-                                    deployment_result["model_status"] = device_server_constants.ServerConstants.MSG_MODELOPS_DEPLOYMENT_STATUS_DEPLOYED
+                                    deployment_result[
+                                        "model_status"] = device_server_constants.ServerConstants.MSG_MODELOPS_DEPLOYMENT_STATUS_DEPLOYED
 
                                     # Send the inference info to the master agent
                                     # TODO: Consistency control
@@ -327,7 +335,7 @@ def monitor_slave_endpoint_status(self):
                                     endpoint_sync_protocol.set_local_deployment_status_result(
                                         job.job_id, endpoint_name, model_name, model_version, job.edge_id,
                                         inference_port, status_result, deployment_result)
-                    
+
                 elif job.status == device_client_constants.ClientConstants.MSG_MLOPS_CLIENT_STATUS_OFFLINE:
                     endpoint_json = json.loads(job.running_json)
                     model_config = endpoint_json.get("model_config", {})
@@ -357,7 +365,8 @@ def monitor_slave_endpoint_status(self):
 
                     for i in range(num_containers):
                         endpoint_container_name = endpoint_container_name_prefix + f"__{i}"
-                        started, inference_port = ContainerUtils.get_instance().start_container(endpoint_container_name)
+                        started, inference_port = ContainerUtils.get_instance().start_container(
+                            endpoint_container_name)
                         if started and inference_port != 0:
                             endpoint_sync_protocol.send_sync_inference_info(
                                 device_ids[0], job.edge_id, job.job_id, endpoint_name, model_name,
@@ -383,7 +392,8 @@ def monitor_slave_endpoint_status(self):
                               f"{device_client_constants.ClientConstants.MSG_MLOPS_CLIENT_STATUS_FAILED}.")
 
                         mlops.log_training_status(
-                            device_client_constants.ClientConstants.MSG_MLOPS_CLIENT_STATUS_FAILED, run_id=job.job_id,
+                            device_client_constants.ClientConstants.MSG_MLOPS_CLIENT_STATUS_FAILED,
+                            run_id=job.job_id,
                             edge_id=job.edge_id, is_from_model=True, enable_broadcast=True
                         )
 
@@ -391,7 +401,8 @@ def monitor_slave_endpoint_status(self):
                             self.released_endpoints[str(job.job_id)] = True
 
                             # Release the gpu ids
-                            print(f"[endpoint/device][{job.job_id}/{job.edge_id}] Release gpu resource when the worker endpoint runs timeout on monioring periodically.")
+                            print(
+                                f"[endpoint/device][{job.job_id}/{job.edge_id}] Release gpu resource when the worker endpoint runs timeout on monioring periodically.")
                             JobRunnerUtils.get_instance().release_gpu_ids(job.job_id, job.edge_id)
 
                         # Get endpoint container name prefix
@@ -408,7 +419,8 @@ def monitor_slave_endpoint_status(self):
 
                             print(f"[Worker][{job.job_id}:{job.edge_id}] Release gpu ids.")
                 except Exception as e:
-                    print(f"[Worker][{job.job_id}:{job.edge_id}] Exception when syncing endpoint process on the slave agent. {traceback.format_exc()}")
+                    print(
+                        f"[Worker][{job.job_id}:{job.edge_id}] Exception when syncing endpoint process on the slave agent. {traceback.format_exc()}")
         except Exception as e:
             print(f"[Worker] Exception when syncing endpoint process on the slave agent. {traceback.format_exc()}")
             pass
@@ -458,7 +470,8 @@ def _check_and_reset_endpoint_status(
                     self.released_endpoints[str(endpoint_id)] = True
 
                     # Release the gpu ids
-                    print(f"[endpoint/device][{endpoint_id}/{device_id}] Release gpu resource when the worker endpoint is not ready on monitoring periodically.")
+                    print(
+                        f"[endpoint/device][{endpoint_id}/{device_id}] Release gpu resource when the worker endpoint is not ready on monitoring periodically.")
                    JobRunnerUtils.get_instance().release_gpu_ids(endpoint_id, device_id)
 
                    return False
@@ -471,11 +484,11 @@ def is_inference_ready(self, inference_url, timeout=None, device_id=None, endpoi
         if response_ok:
             print("Use http health check.")
             return response_ok
-        
+
         if response_ok is None:
             # Internal server can response, but reply is not ready
             return False
-        
+
         # Cannot reach the server, will try other protocols
         print("Use http health check failed.")
 
@@ -549,7 +562,7 @@ def inference(
 
         return False, None
 
-    def _check_all_slave_endpoint_status(self, endpoint_id, endpoint_name, model_name, 
+    def _check_all_slave_endpoint_status(self, endpoint_id, endpoint_name, model_name,
                                          server_internal_port=ServerConstants.MODEL_INFERENCE_DEFAULT_PORT):
         # Get model deployment result
         is_endpoint_offline = True
@@ -610,7 +623,8 @@ def monitor_master_endpoint_status(self):
 
                    if endpoint_status == device_server_constants.ServerConstants.MSG_MODELOPS_DEPLOYMENT_STATUS_FAILED:
                        # Release the gpu ids
-                        print(f"[endpoint/device][{job.job_id}/{job.edge_id}] Release gpu resource when the master endpoint failed on monitoring periodically.")
+                        print(
+                            f"[endpoint/device][{job.job_id}/{job.edge_id}] Release gpu resource when the master endpoint failed on monitoring periodically.")
                        JobRunnerUtils.get_instance().release_gpu_ids(job.job_id, job.edge_id)
                    elif endpoint_status == device_server_constants.ServerConstants.MSG_MODELOPS_DEPLOYMENT_STATUS_DEPLOYED:
                        if model_name is None:
@@ -618,8 +632,8 @@ def monitor_master_endpoint_status(self):
                         try:
                             # If the endpoint is offline, then report offline status to the MLOps.
                             model_config_parameters = model_config.get("parameters", {})
-                            server_internal_port = model_config_parameters.get("server_internal_port", 
-                                                                               ServerConstants.MODEL_INFERENCE_DEFAULT_PORT)
+                            server_internal_port = model_config_parameters.get("server_internal_port",
+                                                                               ServerConstants.MODEL_INFERENCE_DEFAULT_PORT)
                             is_endpoint_online = self._check_all_slave_endpoint_status(job.job_id, endpoint_name, model_name,
                                                                                        server_internal_port)
                             if not is_endpoint_online:
@@ -664,8 +678,8 @@ def monitor_master_endpoint_status(self):
                     elif endpoint_status == device_server_constants.ServerConstants.MSG_MLOPS_SERVER_STATUS_OFFLINE:
                         # If the endpoint is offline, then report offline status to the MLOps.
                         model_config_parameters = model_config.get("parameters", {})
-                        server_internal_port = model_config_parameters.get("server_internal_port", 
-                                                                           ServerConstants.MODEL_INFERENCE_DEFAULT_PORT)
+                        server_internal_port = model_config_parameters.get("server_internal_port",
+                                                                           ServerConstants.MODEL_INFERENCE_DEFAULT_PORT)
                         is_endpoint_online = self._check_all_slave_endpoint_status(
                             job.job_id, endpoint_name, model_name, server_internal_port)
                         if is_endpoint_online:
@@ -691,14 +705,23 @@ def monitor_endpoint_logs(self):
                 device_client_data_interface.FedMLClientDataInterface.get_instance().create_job_table()
             except Exception as e:
                 pass
+            number_of_finished_jobs = 0
             job_list = device_client_data_interface.FedMLClientDataInterface.get_instance().get_jobs_from_db()
 
             for job in job_list.job_list:
                 count += 1
                 if count >= 1000:
                     break
+                if job.status == device_client_constants.ClientConstants.MSG_MLOPS_CLIENT_STATUS_FAILED or \
+                        job.status == device_client_constants.ClientConstants.MSG_MLOPS_CLIENT_STATUS_KILLED:
+                    MLOpsRuntimeLogDaemon.get_instance(fedml_args).stop_log_processor(job.job_id, job.edge_id)
+                    continue
+
                 if job.status != device_client_constants.ClientConstants.MSG_MLOPS_CLIENT_STATUS_FINISHED:
                     continue
+                number_of_finished_jobs += 1
+                if number_of_finished_jobs >= 100:
+                    break
 
                 endpoint_json = json.loads(job.running_json) if job.running_json is not None else {}
                 model_config = endpoint_json.get("model_config", {})
@@ -711,14 +734,6 @@ def monitor_endpoint_logs(self):
                     job.job_id, job.edge_id, device_server_constants.ServerConstants.get_log_file_dir(),
                     is_server=True, log_file_prefix=JobMonitor.ENDPOINT_CONTAINER_LOG_PREFIX,
                 )
-                if not MLOpsRuntimeLogDaemon.get_instance(fedml_args).is_log_processor_running(job.job_id, job.edge_id):
-                    setattr(fedml_args, "log_file_dir", os.path.dirname(log_file_path))
-                    MLOpsRuntimeLogDaemon.get_instance(fedml_args).log_file_dir = os.path.dirname(log_file_path)
-                    MLOpsRuntimeLogDaemon.get_instance(fedml_args).start_log_processor(
-                        job.job_id, job.edge_id,
-                        log_source=device_client_constants.ClientConstants.FEDML_LOG_SOURCE_TYPE_MODEL_END_POINT,
-                        log_file_prefix=JobMonitor.ENDPOINT_CONTAINER_LOG_PREFIX
-                    )
 
                 # Get endpoint container name
                 endpoint_container_name_prefix = device_client_constants.ClientConstants.get_endpoint_container_name(
@@ -728,12 +743,14 @@ def monitor_endpoint_logs(self):
                 num_containers = ContainerUtils.get_instance().get_container_rank_same_model(
                     endpoint_container_name_prefix)
 
+                is_job_container_running = False
                 for i in range(num_containers):
                     endpoint_container_name = endpoint_container_name_prefix + f"__{i}"
 
                     endpoint_logs = ContainerUtils.get_instance().get_container_logs(endpoint_container_name)
                     if endpoint_logs is None:
                         continue
+                    is_job_container_running = True
 
                     # Write container logs to the log file
                     if i == 0:
@@ -743,5 +760,15 @@ def monitor_endpoint_logs(self):
                         with open(log_file_path, "a") as f:
                             f.write(endpoint_logs)
 
+                if is_job_container_running and not MLOpsRuntimeLogDaemon.get_instance(fedml_args). \
+                        is_log_processor_running(job.job_id, job.edge_id):
+                    setattr(fedml_args, "log_file_dir", os.path.dirname(log_file_path))
+                    MLOpsRuntimeLogDaemon.get_instance(fedml_args).log_file_dir = os.path.dirname(log_file_path)
+                    MLOpsRuntimeLogDaemon.get_instance(fedml_args).start_log_processor(
+                        job.job_id, job.edge_id,
+                        log_source=device_client_constants.ClientConstants.FEDML_LOG_SOURCE_TYPE_MODEL_END_POINT,
+                        log_file_prefix=JobMonitor.ENDPOINT_CONTAINER_LOG_PREFIX
+                    )
+
         except Exception as e:
             print(f"Exception when syncing endpoint log to MLOps {traceback.format_exc()}.")
diff --git a/python/fedml/computing/scheduler/comm_utils/run_process_utils.py b/python/fedml/computing/scheduler/comm_utils/run_process_utils.py
index 33d5202589..7358c5bfb8 100644
--- a/python/fedml/computing/scheduler/comm_utils/run_process_utils.py
+++ b/python/fedml/computing/scheduler/comm_utils/run_process_utils.py
@@ -186,3 +186,17 @@ def get_pid_from_cmd_line(cmd_line, break_on_first=True):
             pass
 
         return ret_pids
+
+    @staticmethod
+    def is_process_running(process_id):
+        is_running = False
+        try:
+            process = psutil.Process(process_id)
+            if process.status() == psutil.STATUS_RUNNING or \
+                    process.status() == psutil.STATUS_SLEEPING or \
+                    process.status() == psutil.STATUS_IDLE:
+                is_running = True
+        except Exception as e:
+            pass
+
+        return is_running
diff --git a/python/fedml/computing/scheduler/comm_utils/sys_utils.py b/python/fedml/computing/scheduler/comm_utils/sys_utils.py
index f238956ba2..8bdc6f8141 100644
--- a/python/fedml/computing/scheduler/comm_utils/sys_utils.py
+++ b/python/fedml/computing/scheduler/comm_utils/sys_utils.py
@@ -35,6 +35,7 @@
 "143": "Command terminated with signal 15 (SIGTERM) (kill command)."}
 
 enable_simulation_gpu = False
+simulation_gpu_count = 1
 
 
 def get_sys_runner_info():
@@ -112,7 +113,7 @@ def get_sys_runner_info():
         pass
 
     if enable_simulation_gpu:
-        gpu_count = 8
+        gpu_count = simulation_gpu_count
         gpu_total_mem = "80G"
         gpu_available_mem = "80G"
         gpu_vendor = "NVIDIA"
@@ -156,7 +157,7 @@ def get_gpu_list():
                         'memoryUsed': 7.0, 'memoryFree': 81042.0, 'driver': '535.54.03',
                         'gpu_name': 'NVIDIA A100-SXM4-80GB', 'serial': '1320723000504',
                         'display_mode': 'Enabled', 'display_active': 'Disabled', 'temperature': 33.0}]
-        return ret_gpu_list
+        return ret_gpu_list[0:simulation_gpu_count]
 
     gpu_list = GPUtil.getGPUs()
     ret_gpu_list = list()
diff --git a/python/fedml/computing/scheduler/model_scheduler/device_client_runner.py b/python/fedml/computing/scheduler/model_scheduler/device_client_runner.py
index 90e8c79be0..e64222313f 100755
--- a/python/fedml/computing/scheduler/model_scheduler/device_client_runner.py
+++ b/python/fedml/computing/scheduler/model_scheduler/device_client_runner.py
@@ -279,12 +279,12 @@ def run(self, process_event, completed_event):
                 run_id, self.edge_id, ClientConstants.MSG_MLOPS_CLIENT_STATUS_FAILED)
             logging.info(f"[endpoint/device][{run_id}/{self.edge_id}] Release gpu resource when the worker deployment occurred exceptions.")
             self.release_gpu_ids(run_id)
-            MLOpsRuntimeLogDaemon.get_instance(self.args).stop_log_processor(self.run_id, self.edge_id)
+            MLOpsRuntimeLogDaemon.get_instance(self.args).stop_log_processor(run_id, self.edge_id)
             time.sleep(2)
             sys.exit(1)
         finally:
             logging.info("Release resources.")
-            MLOpsRuntimeLogDaemon.get_instance(self.args).stop_log_processor(self.run_id, self.edge_id)
+            MLOpsRuntimeLogDaemon.get_instance(self.args).stop_log_processor(run_id, self.edge_id)
             if self.mlops_metrics is not None:
                 self.mlops_metrics.stop_sys_perf()
             time.sleep(3)
diff --git a/python/fedml/computing/scheduler/model_scheduler/device_server_runner.py b/python/fedml/computing/scheduler/model_scheduler/device_server_runner.py
index 783ffab0c6..ff1ce82b4a 100755
--- a/python/fedml/computing/scheduler/model_scheduler/device_server_runner.py
+++ b/python/fedml/computing/scheduler/model_scheduler/device_server_runner.py
@@ -210,6 +210,8 @@ def run(self, process_event, completed_event):
         self.run_process_event = process_event
         self.run_process_completed_event = completed_event
 
+        run_id = self.request_json.get("end_point_id")
+
         try:
             MLOpsUtils.set_ntp_offset(self.ntp_offset)
 
@@ -230,14 +232,14 @@ def run(self, process_event, completed_event):
             self.mlops_metrics.report_server_training_status(
                 self.run_id, ServerConstants.MSG_MLOPS_SERVER_STATUS_FAILED,
                 is_from_model=True, edge_id=self.edge_id)
-            MLOpsRuntimeLogDaemon.get_instance(self.args).stop_log_processor(self.run_id, self.edge_id)
+            MLOpsRuntimeLogDaemon.get_instance(self.args).stop_log_processor(run_id, self.edge_id)
             if self.mlops_metrics is not None:
                 self.mlops_metrics.stop_sys_perf()
             time.sleep(3)
             sys.exit(1)
         finally:
             logging.info("Release resources.")
-            MLOpsRuntimeLogDaemon.get_instance(self.args).stop_log_processor(self.run_id, self.edge_id)
+            MLOpsRuntimeLogDaemon.get_instance(self.args).stop_log_processor(run_id, self.edge_id)
             if self.mlops_metrics is not None:
                 self.mlops_metrics.stop_sys_perf()
             time.sleep(3)
diff --git a/python/fedml/computing/scheduler/slave/client_runner.py b/python/fedml/computing/scheduler/slave/client_runner.py
index 3821c6874a..80d0f86949 100755
--- a/python/fedml/computing/scheduler/slave/client_runner.py
+++ b/python/fedml/computing/scheduler/slave/client_runner.py
@@ -1648,6 +1648,8 @@ def setup_agent_mqtt_connection(self, service_config):
 
         JobCleanup.get_instance().sync_data_on_startup(self.edge_id)
 
+        os.environ["FEDML_DEPLOY_MASTER_ID"] = str(self.model_device_server.get_edge_id())
+        os.environ["FEDML_DEPLOY_WORKER_IDS"] = str([client.get_edge_id() for client in self.model_device_client_list])
         self.mlops_metrics.stop_device_realtime_perf()
         self.mlops_metrics.report_device_realtime_perf(self.args, service_config["mqtt_config"])
 
diff --git a/python/fedml/core/mlops/mlops_device_perfs.py b/python/fedml/core/mlops/mlops_device_perfs.py
index 7d4f58bf87..e02203ff6f 100644
--- a/python/fedml/core/mlops/mlops_device_perfs.py
+++ b/python/fedml/core/mlops/mlops_device_perfs.py
@@ -168,6 +168,8 @@ def report_gpu_device_info(edge_id, mqtt_mgr=None):
     topic_name = "ml_client/mlops/gpu_device_info"
     device_info_json = {
         "edgeId": edge_id,
+        "deploy_master_id": os.environ.get("FEDML_DEPLOY_MASTER_ID", ""),
+        "deploy_worker_ids": os.environ.get("FEDML_DEPLOY_WORKER_IDS", "[]"),
         "memoryTotal": round(total_mem * MLOpsUtils.BYTES_TO_GB, 2),
         "memoryAvailable": round(free_mem * MLOpsUtils.BYTES_TO_GB, 2),
         "diskSpaceTotal": round(total_disk_size * MLOpsUtils.BYTES_TO_GB, 2),
diff --git a/python/fedml/core/mlops/mlops_runtime_log_daemon.py b/python/fedml/core/mlops/mlops_runtime_log_daemon.py
index b131ecef16..3fecdb834b 100644
--- a/python/fedml/core/mlops/mlops_runtime_log_daemon.py
+++ b/python/fedml/core/mlops/mlops_runtime_log_daemon.py
@@ -10,6 +10,7 @@
 import requests
 import yaml
 
+from fedml.computing.scheduler.comm_utils.run_process_utils import RunProcessUtils
 from ...core.mlops.mlops_configs import MLOpsConfigs
 import fedml
 
@@ -478,7 +479,8 @@ def stop_all_log_processor(self):
 
     def is_log_processor_running(self, in_run_id, in_device_id):
         for (log_child_process, log_run_id, log_device_id) in self.log_child_process_list:
-            if str(in_run_id) == str(log_run_id) and str(in_device_id) == str(log_device_id):
+            if str(in_run_id) == str(log_run_id) and str(in_device_id) == str(log_device_id) and \
+                    log_child_process is not None and RunProcessUtils.is_process_running(log_child_process.pid):
                 return True
         return False
 
diff --git a/python/fedml/workflow/driver_example/hello_world_job.yaml b/python/fedml/workflow/driver_example/hello_world_job.yaml
index ba9a2d0384..046fb2b0e2 100755
--- a/python/fedml/workflow/driver_example/hello_world_job.yaml
+++ b/python/fedml/workflow/driver_example/hello_world_job.yaml
@@ -24,6 +24,6 @@ bootstrap: |
   echo "Bootstrap finished."
 
 computing:
-  resource_type: H100-80GB-HBM3 # e.g., A100-80G, please check the resource type list by "fedml show-resource-type" or visiting URL: https://open.fedml.ai/accelerator_resource_type
+  resource_type: A100-80GB-SXM # e.g., A100-80G, please check the resource type list by "fedml show-resource-type" or visiting URL: https://open.fedml.ai/accelerator_resource_type
   minimum_num_gpus: 1 # minimum # of GPUs to provision
-  maximum_cost_per_hour: $0.5 # max cost per hour of all machines for your job
+  maximum_cost_per_hour: $10 # max cost per hour of all machines for your job
diff --git a/python/fedml/workflow/driver_example/main.py b/python/fedml/workflow/driver_example/main.py
index d432345558..eafcb61117 100644
--- a/python/fedml/workflow/driver_example/main.py
+++ b/python/fedml/workflow/driver_example/main.py
@@ -5,6 +5,13 @@
 from fedml.workflow.jobs import Job, JobStatus
 from fedml.workflow.workflow import Workflow
 
+# CURRENT_CONFIG_VERSION = "release"
+# MY_API_KEY = "10e87dd6d6574311a80200455e4d9b30"
+CURRENT_CONFIG_VERSION = "local"
+CURRENT_ON_PREM_LOCAL_HOST = "localhost"
+CURRENT_ON_PREM_LOCAL_PORT = 18080
+MY_API_KEY = "1316b93c82da40ce90113a2ed12f0b14"
+
 
 class HelloWorldJob(Job):
     def __init__(self, name):
@@ -12,17 +19,20 @@ def __init__(self, name):
         self.run_id = None
 
     def run(self):
-        fedml.set_env_version("test")
+        fedml.set_env_version(CURRENT_CONFIG_VERSION)
+        fedml.set_local_on_premise_platform_host(CURRENT_ON_PREM_LOCAL_HOST)
+        fedml.set_local_on_premise_platform_port(CURRENT_ON_PREM_LOCAL_PORT)
+
         working_directory = os.path.dirname(os.path.abspath(__file__))
         absolute_path = os.path.join(working_directory, "hello_world_job.yaml")
-        result = fedml.api.launch_job(yaml_file=absolute_path, api_key="30d1bbcae9ec48ffa314caa8e944d187")
+        result = fedml.api.launch_job(yaml_file=absolute_path, api_key=MY_API_KEY)
         if result.run_id and int(result.run_id) > 0:
             self.run_id = result.run_id
 
     def status(self):
         if self.run_id:
             try:
-                _, run_status = fedml.api.run_status(run_id=self.run_id, api_key="30d1bbcae9ec48ffa314caa8e944d187")
+                _, run_status = fedml.api.run_status(run_id=self.run_id, api_key=MY_API_KEY)
                 return JobStatus.get_job_status_from_run_status(run_status)
             except Exception as e:
                 logging.error(f"Error while getting status of run {self.run_id}: {e}")
@@ -31,7 +41,7 @@ def status(self):
     def kill(self):
         if self.run_id:
             try:
-                return fedml.api.run_stop(run_id=self.run_id, api_key="30d1bbcae9ec48ffa314caa8e944d187")
+                return fedml.api.run_stop(run_id=self.run_id, api_key=MY_API_KEY)
             except Exception as e:
                 logging.error(f"Error while stopping run {self.run_id}: {e}")
 
diff --git a/python/setup.py b/python/setup.py
index 578c6e6067..05a0ba4c50 100644
--- a/python/setup.py
+++ b/python/setup.py
@@ -114,7 +114,7 @@ def finalize_options(self):
 
 setup(
     name="fedml",
-    version="0.8.18a11",
+    version="0.8.18a13",
     author="FedML Team",
     author_email="ch@fedml.ai",
     description="A research and production integrated edge-cloud library for "