Skip to content

Commit

Permalink
Merge pull request #1966 from FedML-AI/alaydshah/fix/package_build
Browse files Browse the repository at this point in the history
Enhance Build Packaging
  • Loading branch information
Raphael-Jin authored Mar 12, 2024
2 parents 61312d5 + ec64ed3 commit 5e639c0
Show file tree
Hide file tree
Showing 8 changed files with 67 additions and 22 deletions.
10 changes: 10 additions & 0 deletions python/examples/launch/train_build_package/src/bootstrap.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Bootstrap script for the train_build_package example: installs the Python
# dependencies and then reports completion.
### don't modify this part ###
# Trace mode: the shell prints each command before executing it.
set -x
##############################

# Install the packages listed in requirements.txt. The path is relative, so it
# is resolved against the directory this script is run from.
pip install -r requirements.txt
echo "Bootstrap finished."

### don't modify this part ###
# NOTE(review): the script always exits 0 and never enables `set -e`, so a
# failed `pip install` above is not reflected in the exit status.
exit 0
##############################
9 changes: 3 additions & 6 deletions python/examples/launch/train_build_package/train_job.yaml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# Local directory where your source code resides.
# It should be the relative path to this job yaml file or the absolute path.
# If your job doesn't contain any source code, it can be empty.
workspace: .
workspace: "./src"

# Running entry commands which will be executed as the job entry point.
# If an error occurs, you should exit with a non-zero code, e.g. exit 1.
Expand All @@ -14,14 +14,11 @@ job_type: train # options: train, deploy, federate

# Bootstrap shell commands which will be executed before running entry commands.
# Support multiple lines, which can be empty.
bootstrap: |
echo "Bootstrap finished."
bootstrap: bash bootstrap.sh

computing:
minimum_num_gpus: 1 # minimum # of GPUs to provision
maximum_cost_per_hour: $3000 # max cost per hour for your job per gpu card
#allow_cross_cloud_resources: true # true, false
#device_type: CPU # options: GPU, CPU, hybrid
resource_type: A100-80G # e.g., A100-80G, please check the resource type list by "fedml show-resource-type" or visiting URL: https://open.fedml.ai/accelerator_resource_type

data_args:
Expand All @@ -36,4 +33,4 @@ model_args:
output_dim: '10'

training_params:
learning_rate: 0.004
learning_rate: 0.004
5 changes: 3 additions & 2 deletions python/fedml/api/modules/federate.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from fedml.computing.scheduler.comm_utils.sys_utils import generate_yaml_doc
from fedml.computing.scheduler.comm_utils.yaml_utils import load_yaml_config
from fedml.computing.scheduler.comm_utils.constants import SchedulerConstants
from fedml.computing.scheduler.scheduler_entry.constants import Constants as SchedulerEntryConstants
import fedml.api.modules.build
from fedml.computing.scheduler.scheduler_entry.launch_manager import FedMLLaunchManager

Expand Down Expand Up @@ -66,8 +67,8 @@ def build_with_job_yaml(job_yaml_file, dest_folder=None):
shutil.copyfile(server_package, dest_package)
print(f"Your server package file is located at: {dest_package}")

bootstrap_bat_file = os.path.join(job_dir_path, "bootstrap.bat")
bootstrap_sh_file = os.path.join(job_dir_path, "bootstrap.sh")
bootstrap_sh_file = os.path.join(job_dir_path, SchedulerEntryConstants.BOOTSTRAP_FILE_NAME)
bootstrap_bat_file = bootstrap_sh_file.rstrip(".sh") + ".bat"
job_entry_bat_file = os.path.join(
job_dir_path, SchedulerConstants.LAUNCH_JOB_DEFAULT_ENTRY_NAME.rstrip('.sh') + '.bat')
job_entry_sh_file = os.path.join(job_dir_path, SchedulerConstants.LAUNCH_JOB_DEFAULT_ENTRY_NAME)
Expand Down
13 changes: 9 additions & 4 deletions python/fedml/api/modules/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,17 @@
from fedml.computing.scheduler.comm_utils.sys_utils import generate_yaml_doc
from fedml.computing.scheduler.comm_utils.yaml_utils import load_yaml_config
from fedml.computing.scheduler.comm_utils.constants import SchedulerConstants
from fedml.computing.scheduler.scheduler_entry.constants import Constants as SchedulerEntryConstants
from fedml.computing.scheduler.scheduler_entry.launch_manager import FedMLLaunchManager
import fedml.api.modules.build


def build(source_folder, entry_point, entry_args, config_folder, dest_folder, ignore,
model_name, model_cache_path, input_dim, output_dim, dataset_name, dataset_type, dataset_path):

# Check the config file
config_file_path = os.path.join(config_folder, ModuleConstants.FEDML_CONFIG_YAML_FILE)

if not os.path.exists(config_file_path):
print(f"Please make sure the following config file exists. \n{config_file_path}")
return
Expand All @@ -37,8 +40,10 @@ def build(source_folder, entry_point, entry_args, config_folder, dest_folder, ig


def build_with_job_yaml(job_yaml_file, dest_folder=None):

job_config, app_config, client_package, server_package = FedMLLaunchManager.get_instance().prepare_launch(
job_yaml_file)

if client_package is None or os.path.exists(client_package) is False:
print("Build failed, please check your job yaml file.")
return
Expand All @@ -52,11 +57,11 @@ def build_with_job_yaml(job_yaml_file, dest_folder=None):
os.path.join(dest_folder, os.path.basename(client_package)))
shutil.copyfile(client_package, dest_package)

bootstrap_bat_file = os.path.join(job_dir_path, "bootstrap.bat")
bootstrap_sh_file = os.path.join(job_dir_path, "bootstrap.sh")
job_entry_bat_file = os.path.join(
job_dir_path, SchedulerConstants.LAUNCH_JOB_DEFAULT_ENTRY_NAME.rstrip('.sh') + '.bat')
bootstrap_sh_file = os.path.join(job_dir_path, SchedulerEntryConstants.BOOTSTRAP_FILE_NAME)
bootstrap_bat_file = bootstrap_sh_file.rstrip(".sh") + ".bat"
job_entry_sh_file = os.path.join(job_dir_path, SchedulerConstants.LAUNCH_JOB_DEFAULT_ENTRY_NAME)
job_entry_bat_file = job_entry_sh_file.rstrip(".sh") + '.bat'

if os.path.exists(bootstrap_bat_file):
os.remove(bootstrap_bat_file)
if os.path.exists(bootstrap_sh_file):
Expand Down
32 changes: 32 additions & 0 deletions python/fedml/computing/scheduler/comm_utils/sys_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -597,6 +597,38 @@ def remove_simulator_process(data_dir, runner_info_dir, process_id):
pass


def remove_files(file_paths: List[str]) -> None:
    """
    Remove every file in ``file_paths``, silently skipping missing ones.

    Removal is attempted directly instead of checking for existence first, so
    a file that disappears between the check and the delete (for example when
    another process cleans up concurrently) no longer raises. As a side
    effect, dangling symlinks are removed as well.

    Args:
        file_paths: List of file paths to delete.

    Raises:
        ValueError: If ``file_paths`` is not a list.
        OSError: If a path exists but cannot be removed (for example it is a
            directory, or permissions are insufficient).

    Usage:
        file_list = ["file4.txt", "file5.txt", "file6.txt"]
        remove_files(file_list)
    """
    if not isinstance(file_paths, list):
        raise ValueError("file_paths must be a list of file paths.")

    for path in file_paths:
        try:
            os.remove(path)
        except FileNotFoundError:
            # Already gone: nothing to clean up for this entry.
            continue


def convert_and_remove_bat_files(shell_script_paths: List[str]):
    """
    Remove the ``.bat`` counterpart of each shell script path, if it exists.

    Only entries that end in ``.sh`` are considered: ``foo.sh`` maps to
    ``foo.bat``. Every other entry is ignored, so a path such as ``train.py``
    can no longer be rewritten to ``train.bat`` and deleted by accident. The
    ``.sh`` files themselves are not touched.

    Args:
        shell_script_paths: List of file paths; non-``.sh`` entries are skipped.

    Raises:
        ValueError: If ``shell_script_paths`` is not a list.
    """
    if not isinstance(shell_script_paths, list):
        raise ValueError("shell_script_paths must be a list of file paths.")

    sh_suffix = ".sh"

    # Swap the exact ".sh" suffix for ".bat"; slicing by the suffix length
    # avoids touching any other part of the path.
    bat_file_paths = [
        path[:-len(sh_suffix)] + ".bat"
        for path in shell_script_paths
        if path.endswith(sh_suffix)
    ]

    remove_files(bat_file_paths)


def simulator_process_is_running(process_id):
for process in psutil.process_iter():
if str(process.pid) == str(process_id):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ class Constants(Singleton):

FEDML_LAUNCH_JOB_TEMP_DIR = "tmp"

BOOTSTRAP_FILE_NAME = "bootstrap.sh"
BOOTSTRAP_FILE_NAME = "fedml_bootstrap_generated.sh"
STD_CONFIG_ENV_SECTION = "environment_args"
STD_CONFIG_ENV_SECTION_BOOTSTRAP_KEY = "bootstrap"

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -533,28 +533,28 @@ def cleanup_temp_files(self):
Constants.LAUNCH_JOB_LAUNCH_CONF_FOLDER_NAME)
shutil.rmtree(conf_folder, ignore_errors=True)

files_to_remove = []
source_full_path = os.path.join(self.executable_file_folder, self.executable_file)
if os.path.exists(source_full_path):
os.remove(source_full_path)
boostrap_path = os.path.join(self.executable_file_folder, Constants.BOOTSTRAP_FILE_NAME)
if os.path.exists(boostrap_path):
os.remove(boostrap_path)
files_to_remove.extend([source_full_path, boostrap_path])

server_source_full_path = os.path.join(self.executable_file_folder, self.server_executable_file)
if os.path.exists(server_source_full_path):
os.remove(server_source_full_path)
files_to_remove.append(server_source_full_path)

source_full_path_to_base = os.path.join(self.base_dir, self.executable_file_folder, self.executable_file)
if os.path.exists(source_full_path_to_base):
os.remove(source_full_path_to_base)
boostrap_path = os.path.join(self.base_dir, self.executable_file_folder, Constants.BOOTSTRAP_FILE_NAME)
if os.path.exists(boostrap_path):
os.remove(boostrap_path)
files_to_remove.extend([source_full_path_to_base, boostrap_path])

server_source_full_path_to_base = os.path.join(self.base_dir, self.executable_file_folder,
self.server_executable_file)
if os.path.exists(server_source_full_path_to_base):
os.remove(server_source_full_path_to_base)
files_to_remove.append(server_source_full_path_to_base)

sys_utils.remove_files(files_to_remove)
sys_utils.convert_and_remove_bat_files(files_to_remove)


def read_gitignore_file(self):
try:
Expand Down

0 comments on commit 5e639c0

Please sign in to comment.