From 405eb94c60ca5b461a35584dfb57cc2b8176f1ca Mon Sep 17 00:00:00 2001 From: shh2000 <13820618441@163.com> Date: Wed, 21 Aug 2024 15:02:28 +0800 Subject: [PATCH 1/6] init scale --- .../llava1.5_7b/flagscale/run_pretraining.py | 171 ++++++++++++++++++ .../docker_image/flagscale_2409/Dockerfile | 4 + .../flagscale_2409/flagscale_2409_install.sh | 1 + .../config/config_H100x1x8.py | 15 ++ .../config/requirements.txt | 1 + .../run_benchmarks/config/cluster_conf.py | 4 +- training/run_benchmarks/config/test_conf.py | 6 +- .../flagscale/start_flagscale_task.py | 7 + 8 files changed, 204 insertions(+), 5 deletions(-) create mode 100644 training/benchmarks/llava1.5_7b/flagscale/run_pretraining.py create mode 100644 training/nvidia/docker_image/flagscale_2409/Dockerfile create mode 100644 training/nvidia/docker_image/flagscale_2409/flagscale_2409_install.sh create mode 100644 training/nvidia/llava1.5_7b-flagscale/config/config_H100x1x8.py create mode 100644 training/nvidia/llava1.5_7b-flagscale/config/requirements.txt diff --git a/training/benchmarks/llava1.5_7b/flagscale/run_pretraining.py b/training/benchmarks/llava1.5_7b/flagscale/run_pretraining.py new file mode 100644 index 000000000..56203bb6d --- /dev/null +++ b/training/benchmarks/llava1.5_7b/flagscale/run_pretraining.py @@ -0,0 +1,171 @@ +import subprocess +from argparse import ArgumentParser +import os +import sys +from importlib import import_module +import yaml + + +def parse_args(): + '''we parse ddp related args, check system config args, and running env + args such as --data_dir_xxx. Then pass all useful args to the real + training script. + ''' + parser = ArgumentParser(description="flagscale main python") + parser.add_argument("--world_size", type=int, required=True) + parser.add_argument("--vendor", type=str, required=True) + parser.add_argument("--data_dir", type=str, required=True) + parser.add_argument("--hosts", type=str, required=True) + parser.add_argument("--host_addr", type=str, required=True) + parser.add_argument("--log_dir", type=str, required=True) + parser.add_argument("--flagperf_config_file", type=str, required=True) + args, unknown_args = parser.parse_known_args() + args.unknown_args = unknown_args + return args + + +def install_scale(scale_download_cmd, log_dir, scale_install_cmd, energon_locate_cmd, debug_mode=False): + if not debug_mode: + exec_cmd = scale_download_cmd.replace('', log_dir) + print(exec_cmd) + + install_logdir = os.path.join(log_dir, "install_logs") + os.makedirs(install_logdir) + + logfile = os.path.join(install_logdir, "scale_download.log.txt") + with open(logfile, 'w') as f: + p = subprocess.Popen(exec_cmd, shell=True, stdout=f, stderr=subprocess.STDOUT) + p.wait() + f.close() + + exec_cmd = f"cd {log_dir}; {scale_install_cmd}" + logfile = os.path.join(install_logdir, "scale_install.log.txt") + with open(logfile, 'w') as f: + p = subprocess.Popen(exec_cmd, shell=True, stdout=f, stderr=subprocess.STDOUT) + p.wait() + f.close() + + exec_cmd = energon_locate_cmd + logfile = os.path.join(install_logdir, "energon_locate.log.txt") + with open(logfile, 'w') as f: + p = subprocess.Popen(exec_cmd, shell=True, stdout=f, stderr=subprocess.STDOUT) + p.wait() + f.close() + + with open(logfile, 'r') as f: + energon_locate = f.readline().replace('\n', '') + print(energon_locate) + + src_dir = os.path.join(energon_locate, "megatron", "energon") + dst_dir = os.path.join(log_dir, "FlagScale", "megatron", "megatron") + exec_cmd = f"cp -r {src_dir} {dst_dir}/" + + logfile = os.path.join(install_logdir, "energon_copy.log.txt") + with open(logfile, 'w') as f: + p = subprocess.Popen(exec_cmd, shell=True, stdout=f, stderr=subprocess.STDOUT) + p.wait() + f.close() + + +def replace_yamls(scale_home, config_module, args): + scale_conf_dir = os.path.join(scale_home, getattr(config_module, "scale_conf_dir")) + dist_yaml = os.path.join(scale_conf_dir, getattr(config_module, "configyaml")) + with open(dist_yaml, 'r') as f: + dist_data = yaml.safe_load(f) + + try: + dist_data["experiment"]["exp_dir"] = os.path.join(scale_home, "outputs_llava1.5") + hosts = args.hosts.split(",") + dist_data["experiment"]["runner"]["nnodes"] = len(hosts) + hostfile = os.path.join(scale_home, "hostfile") + with open(hostfile, 'w') as f: + for host in hosts: + slots = dist_data["experiment"]["runner"]["nproc_per_node"] + chiptype = getattr(config_module, "flagscale_chip_type") + f.write(f"{host} slots={slots} type={chiptype}\n") + dist_data["experiment"]["runner"]["hostfile"] = hostfile + except Exception as e: + print(e) + print("You're using an illegal config.yaml in flagscale. You must fix it") + + print(dist_data) + + train_yaml = os.path.join(scale_conf_dir, getattr(config_module, "trainyaml")) + + with open(train_yaml, 'r') as f: + train_data = yaml.safe_load(f) + + try: + train_data["system"]["checkpoint"]["save_interval"] = 20 + train_data["system"]["checkpoint"]["pretrained_checkpoint"] = os.path.join(args.data_dir, "LLaVA_megatron", "vicuna_instruct_clip336_tp1_combined_mcore") + + train_data["model"]["train_iters"] = 5000 + train_data["model"].pop("img_embedding_idx", None) + train_data["data"]["data_path"] = os.path.join(scale_home, getattr(config_module, "datasetyaml")) + train_data["data"]["valid_path"] = os.path.join(scale_home, getattr(config_module, "datasetyaml")) + train_data["data"]["prompt_path"] = os.path.join(scale_home, "megatron/examples/multimodal/manual_prompts.json") + train_data["data"]["tokenizer"]["tokenizer_model"] = os.path.join(args.data_dir, "vicuna-7b-v1___5/tokenizer.model") + except Exception as e: + print("You're using an illegal trainllava.yaml in flagscale. You must fix it") + + + print(train_data) + + dataset_yaml = os.path.join(scale_home, getattr(config_module, "datasetyaml")) + + with open(dataset_yaml, 'r') as f: + dataset_data = yaml.safe_load(f) + + try: + llava_train_dir = os.path.join(args.data_dir, "LLaVA-Pretrain/wds") + dataset_data["splits"]["train"]["datasets"][0]["path"] = llava_train_dir + dataset_data["splits"]["val"]["datasets"][0]["path"] = llava_train_dir + except Exception as e: + print("You're using an illegal dataset.yaml in flagscale. You must fix it") + + print(dataset_data) + + with open(dist_yaml, 'w') as f: + yaml.safe_dump(dist_data, f) + + with open(train_yaml, 'w') as f: + yaml.safe_dump(train_data, f) + + with open(dataset_yaml, 'w') as f: + yaml.safe_dump(dataset_data, f) + + +if __name__ == "__main__": + args = parse_args() + print(args) + host = args.host_addr + hosts = args.hosts.split(",") + print(host, hosts) + + if host != hosts[0]: + exit(0) + + sys.path.append(os.path.dirname(args.flagperf_config_file)) + config_file = os.path.basename(args.flagperf_config_file).split('.')[0] + + module = import_module(config_file) + + scale_download_cmd = getattr(module, 'scale_download_cmd') + scale_install_cmd = getattr(module, 'scale_install_cmd') + energon_locate_cmd = getattr(module, 'energon_locate_cmd') + + install_scale(scale_download_cmd, args.log_dir, scale_install_cmd, energon_locate_cmd) + + scale_home = os.path.join(args.log_dir, "FlagScale") + replace_yamls(scale_home, module, args) + + scale_conf_dir = getattr(module, "scale_conf_dir") + configyaml = getattr(module, "configyaml") + configname = os.path.splitext(os.path.basename(configyaml)) + exec_cmd = f"cd {scale_home}; python3 run.py --config-path {scale_conf_dir} --config-name {configname}" + + print(exec_cmd) + exit(0) + with open(os.path.join(args.log_dir, "flagscale_main.log.txt"), 'w') as f: + p = subprocess.Popen(exec_cmd, shell=True, stdout=f, stderr=subprocess.STDOUT) + p.wait() diff --git a/training/nvidia/docker_image/flagscale_2409/Dockerfile b/training/nvidia/docker_image/flagscale_2409/Dockerfile new file mode 100644 index 000000000..45e28b6f8 --- /dev/null +++ b/training/nvidia/docker_image/flagscale_2409/Dockerfile @@ -0,0 +1,4 @@ +FROM base-harbor.platform-sz.jingneng-inner.ac.cn/airs-user/6907316d-94d9-469a-b481-1bdf0bfe2287_9f3b64c6-acad-4186-8693-864997cc7e10_aoyulong/flagscale:20240522120728 +RUN /bin/bash -c "pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple" +RUN /bin/bash -c "uname -a" +RUN /bin/bash -c alias python3=python diff --git a/training/nvidia/docker_image/flagscale_2409/flagscale_2409_install.sh b/training/nvidia/docker_image/flagscale_2409/flagscale_2409_install.sh new file mode 100644 index 000000000..a9bf588e2 --- /dev/null +++ b/training/nvidia/docker_image/flagscale_2409/flagscale_2409_install.sh @@ -0,0 +1 @@ +#!/bin/bash diff --git a/training/nvidia/llava1.5_7b-flagscale/config/config_H100x1x8.py b/training/nvidia/llava1.5_7b-flagscale/config/config_H100x1x8.py new file mode 100644 index 000000000..9fdd25c64 --- /dev/null +++ b/training/nvidia/llava1.5_7b-flagscale/config/config_H100x1x8.py @@ -0,0 +1,15 @@ +# this cmd should install scale at . is set by flagperf.training.benchmarks.llava1.5_7b.flagscale.run_pretraining.py +scale_download_cmd = "cd ; git clone https://github.com/FlagOpen/FlagScale.git; cd FlagScale; git checkout 604f79b" + +# NV need nothing because all requirements have been established in base docker image. vendor can do anything related here +scale_install_cmd = "" + +# locate energon. the copy from energon_install_path to flagscale/megatron/ is done by flagperf...run_pretraining.py +energon_locate_cmd = r"pip show megatron-energon | grep Location | awk -F: '{print $2}' | xargs" + +scale_conf_dir = "examples/llava/conf" +configyaml = "config.yaml" +trainyaml = "train/train_llava1.5_7b.yaml" +datasetyaml = "megatron/examples/multimodal/pretrain_dataset.yaml" +flagscale_chip_type = "H100" + diff --git a/training/nvidia/llava1.5_7b-flagscale/config/requirements.txt b/training/nvidia/llava1.5_7b-flagscale/config/requirements.txt new file mode 100644 index 000000000..4f0d1d961 --- /dev/null +++ b/training/nvidia/llava1.5_7b-flagscale/config/requirements.txt @@ -0,0 +1 @@ +megatron-energon==2.2.0 diff --git a/training/run_benchmarks/config/cluster_conf.py b/training/run_benchmarks/config/cluster_conf.py index 0723c086c..9c35adb93 100644 --- a/training/run_benchmarks/config/cluster_conf.py +++ b/training/run_benchmarks/config/cluster_conf.py @@ -1,7 +1,7 @@ '''Cluster configs''' # Hosts to run the benchmark. Each item is an IP address or a hostname. -HOSTS = ["10.1.2.2", "10.1.2.3", "10.1.2.4"] +HOSTS = ["10.1.2.155"] # Hosts port to run the tensorflow distribution_strategy = 'multi_worker_mirrored' HOSTS_PORTS = ["2222"] @@ -10,4 +10,4 @@ MASTER_PORT = "29501" # ssh connection port -SSH_PORT = "22" \ No newline at end of file +SSH_PORT = "22" diff --git a/training/run_benchmarks/config/test_conf.py b/training/run_benchmarks/config/test_conf.py index 47455d7cd..919b6decc 100755 --- a/training/run_benchmarks/config/test_conf.py +++ b/training/run_benchmarks/config/test_conf.py @@ -37,12 +37,12 @@ ACCE_VISIBLE_DEVICE_ENV_NAME = "CUDA_VISIBLE_DEVICES" # Set pip source, which will be used in preparing envs in container -PIP_SOURCE = "https://mirror.baidu.com/pypi/simple" +PIP_SOURCE = "https://pypi.tuna.tsinghua.edu.cn/simple" # The path that flagperf deploy in the cluster. # Users must set FLAGPERF_PATH to where flagperf deploy # You can assume the preset "/home/FlagPerf/training" points to Null -FLAGPERF_PATH = "/home/FlagPerf/training" +FLAGPERF_PATH = "/home/shihonghao/xlcscale/FlagPerf/training" # Set log path on the host here. FLAGPERF_LOG_PATH = FLAGPERF_PATH + "/result/" @@ -68,7 +68,7 @@ # "glm:pytorch_1.8:A100:1:8:1": "/raid/home_datasets_ckpt/glm/train/", # "cpm:pytorch_1.8:A100:1:8:1": "/raid/home_datasets_ckpt/cpm/train/", - + "llava1.5_7b:flagscale_2409:H100:1:8:1": "/raid/dataset/LLAVA" # "llava1.5_7b:deepspeed-torch:A800:1:8:1": "/raid/dataset/LLAVA/", #"llama2_7b_finetune:pytorch_2.0.1:A100:1:1:1": "/raid/dataset/llama2_finetune/", #"aquila2_7b_finetune:flagscale:A800:1:8:1": "/raid/dataset/aquila2_7b_finetune", diff --git a/training/run_benchmarks/flagscale/start_flagscale_task.py b/training/run_benchmarks/flagscale/start_flagscale_task.py index 46fd1e8b2..432e8dc3a 100644 --- a/training/run_benchmarks/flagscale/start_flagscale_task.py +++ b/training/run_benchmarks/flagscale/start_flagscale_task.py @@ -48,6 +48,10 @@ def parse_args(): type=int, required=True, help="how many processes will run on each host.") + parser.add_argument("--hosts", + type=str, + required=True, + help="hosts to run the testcase.") parser.add_argument("--vendor", type=str, @@ -120,6 +124,8 @@ def main(): exec_cmd = "cd " + os.path.dirname(train_script_path) + ";" exec_cmd = exec_cmd + "python run_pretraining.py" exec_cmd = exec_cmd + " --world_size=" + str(task_args.nproc) + exec_cmd = exec_cmd + " --hosts=" + task_args.hosts + exec_cmd = exec_cmd + " --host_addr=" + task_args.host_addr exec_cmd = exec_cmd + " --vendor=" + task_args.vendor exec_cmd = exec_cmd + " --data_dir=" + task_args.data_dir exec_cmd = exec_cmd + " --log_dir=" + task_log_dir @@ -127,6 +133,7 @@ def main(): task_log_file = os.path.join(task_log_dir, "rank0.out.log") + START_LOGGER.info(exec_cmd) with open(task_log_file, "w") as f: p = subprocess.Popen(exec_cmd, shell=True, From f7f3f40e28faba649c72303ca12647a53f6a492c Mon Sep 17 00:00:00 2001 From: shh2000 <13820618441@163.com> Date: Wed, 21 Aug 2024 15:17:20 +0800 Subject: [PATCH 2/6] fix --- training/benchmarks/llava1.5_7b/flagscale/run_pretraining.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/training/benchmarks/llava1.5_7b/flagscale/run_pretraining.py b/training/benchmarks/llava1.5_7b/flagscale/run_pretraining.py index 56203bb6d..52af7ec30 100644 --- a/training/benchmarks/llava1.5_7b/flagscale/run_pretraining.py +++ b/training/benchmarks/llava1.5_7b/flagscale/run_pretraining.py @@ -74,7 +74,7 @@ def replace_yamls(scale_home, config_module, args): dist_data = yaml.safe_load(f) try: - dist_data["experiment"]["exp_dir"] = os.path.join(scale_home, "outputs_llava1.5") + dist_data["experiment"]["exp_dir"] = os.path.join(args.log_dir, "outputs_llava1.5") hosts = args.hosts.split(",") dist_data["experiment"]["runner"]["nnodes"] = len(hosts) hostfile = os.path.join(scale_home, "hostfile") From 01bcf3290aa8dd88b827ee7104b42498e0437096 Mon Sep 17 00:00:00 2001 From: shh2000 <13820618441@163.com> Date: Thu, 22 Aug 2024 15:34:13 +0800 Subject: [PATCH 3/6] upd path --- .../llava1.5_7b/flagscale/run_pretraining.py | 36 +++++++++---------- .../config/config_H100x1x8.py | 14 +++++--- 2 files changed, 25 insertions(+), 25 deletions(-) diff --git a/training/benchmarks/llava1.5_7b/flagscale/run_pretraining.py b/training/benchmarks/llava1.5_7b/flagscale/run_pretraining.py index 52af7ec30..f822bc35a 100644 --- a/training/benchmarks/llava1.5_7b/flagscale/run_pretraining.py +++ b/training/benchmarks/llava1.5_7b/flagscale/run_pretraining.py @@ -24,9 +24,9 @@ def parse_args(): return args -def install_scale(scale_download_cmd, log_dir, scale_install_cmd, energon_locate_cmd, debug_mode=False): +def install_scale(module, log_dir, debug_mode=False): if not debug_mode: - exec_cmd = scale_download_cmd.replace('', log_dir) + exec_cmd = getattr(module, "scale_download_cmd") print(exec_cmd) install_logdir = os.path.join(log_dir, "install_logs") @@ -38,14 +38,14 @@ def install_scale(scale_download_cmd, log_dir, scale_install_cmd, energon_locate p.wait() f.close() - exec_cmd = f"cd {log_dir}; {scale_install_cmd}" + exec_cmd = getattr(module, "scale_install_cmd") logfile = os.path.join(install_logdir, "scale_install.log.txt") with open(logfile, 'w') as f: p = subprocess.Popen(exec_cmd, shell=True, stdout=f, stderr=subprocess.STDOUT) p.wait() f.close() - exec_cmd = energon_locate_cmd + exec_cmd = getattr(module, "energon_locate_cmd") logfile = os.path.join(install_logdir, "energon_locate.log.txt") with open(logfile, 'w') as f: p = subprocess.Popen(exec_cmd, shell=True, stdout=f, stderr=subprocess.STDOUT) @@ -57,7 +57,7 @@ def install_scale(scale_download_cmd, log_dir, scale_install_cmd, energon_locate print(energon_locate) src_dir = os.path.join(energon_locate, "megatron", "energon") - dst_dir = os.path.join(log_dir, "FlagScale", "megatron", "megatron") + dst_dir = os.path.join(getattr(module, "scale_home"), "megatron", "megatron") exec_cmd = f"cp -r {src_dir} {dst_dir}/" logfile = os.path.join(install_logdir, "energon_copy.log.txt") @@ -68,8 +68,8 @@ def install_scale(scale_download_cmd, log_dir, scale_install_cmd, energon_locate def replace_yamls(scale_home, config_module, args): - scale_conf_dir = os.path.join(scale_home, getattr(config_module, "scale_conf_dir")) - dist_yaml = os.path.join(scale_conf_dir, getattr(config_module, "configyaml")) + scale_conf_dir = getattr(config_module, "scale_conf_dir") + dist_yaml = getattr(config_module, "configyaml") with open(dist_yaml, 'r') as f: dist_data = yaml.safe_load(f) @@ -90,7 +90,7 @@ def replace_yamls(scale_home, config_module, args): print(dist_data) - train_yaml = os.path.join(scale_conf_dir, getattr(config_module, "trainyaml")) + train_yaml = getattr(config_module, "trainyaml") with open(train_yaml, 'r') as f: train_data = yaml.safe_load(f) @@ -101,9 +101,9 @@ def replace_yamls(scale_home, config_module, args): train_data["model"]["train_iters"] = 5000 train_data["model"].pop("img_embedding_idx", None) - train_data["data"]["data_path"] = os.path.join(scale_home, getattr(config_module, "datasetyaml")) - train_data["data"]["valid_path"] = os.path.join(scale_home, getattr(config_module, "datasetyaml")) - train_data["data"]["prompt_path"] = os.path.join(scale_home, "megatron/examples/multimodal/manual_prompts.json") + train_data["data"]["data_path"] = getattr(config_module, "datasetyaml") + train_data["data"]["valid_path"] = getattr(config_module, "datasetyaml") + train_data["data"]["prompt_path"] = getattr(config_module, "prompt") train_data["data"]["tokenizer"]["tokenizer_model"] = os.path.join(args.data_dir, "vicuna-7b-v1___5/tokenizer.model") except Exception as e: print("You're using an illegal trainllava.yaml in flagscale. You must fix it") @@ -111,7 +111,7 @@ def replace_yamls(scale_home, config_module, args): print(train_data) - dataset_yaml = os.path.join(scale_home, getattr(config_module, "datasetyaml")) + dataset_yaml = getattr(config_module, "datasetyaml") with open(dataset_yaml, 'r') as f: dataset_data = yaml.safe_load(f) @@ -149,23 +149,19 @@ def replace_yamls(scale_home, config_module, args): config_file = os.path.basename(args.flagperf_config_file).split('.')[0] module = import_module(config_file) + print(module) + scale_home = getattr(module, "scale_home") - scale_download_cmd = getattr(module, 'scale_download_cmd') - scale_install_cmd = getattr(module, 'scale_install_cmd') - energon_locate_cmd = getattr(module, 'energon_locate_cmd') + install_scale(module, args.log_dir) - install_scale(scale_download_cmd, args.log_dir, scale_install_cmd, energon_locate_cmd) - - scale_home = os.path.join(args.log_dir, "FlagScale") replace_yamls(scale_home, module, args) scale_conf_dir = getattr(module, "scale_conf_dir") configyaml = getattr(module, "configyaml") - configname = os.path.splitext(os.path.basename(configyaml)) + configname = os.path.splitext(os.path.basename(configyaml))[0] exec_cmd = f"cd {scale_home}; python3 run.py --config-path {scale_conf_dir} --config-name {configname}" print(exec_cmd) - exit(0) with open(os.path.join(args.log_dir, "flagscale_main.log.txt"), 'w') as f: p = subprocess.Popen(exec_cmd, shell=True, stdout=f, stderr=subprocess.STDOUT) p.wait() diff --git a/training/nvidia/llava1.5_7b-flagscale/config/config_H100x1x8.py b/training/nvidia/llava1.5_7b-flagscale/config/config_H100x1x8.py index 9fdd25c64..542dfc16a 100644 --- a/training/nvidia/llava1.5_7b-flagscale/config/config_H100x1x8.py +++ b/training/nvidia/llava1.5_7b-flagscale/config/config_H100x1x8.py @@ -1,5 +1,8 @@ +scale_parent = "/workspace" +scale_home = f"{scale_parent}/FlagScale" + # this cmd should install scale at . is set by flagperf.training.benchmarks.llava1.5_7b.flagscale.run_pretraining.py -scale_download_cmd = "cd ; git clone https://github.com/FlagOpen/FlagScale.git; cd FlagScale; git checkout 604f79b" +scale_download_cmd = f"cd {scale_parent}; git clone https://github.com/FlagOpen/FlagScale.git; cd FlagScale; git checkout 604f79b" # NV need nothing because all requirements have been established in base docker image. vendor can do anything related here scale_install_cmd = "" @@ -7,9 +10,10 @@ # locate energon. the copy from energon_install_path to flagscale/megatron/ is done by flagperf...run_pretraining.py energon_locate_cmd = r"pip show megatron-energon | grep Location | awk -F: '{print $2}' | xargs" -scale_conf_dir = "examples/llava/conf" -configyaml = "config.yaml" -trainyaml = "train/train_llava1.5_7b.yaml" -datasetyaml = "megatron/examples/multimodal/pretrain_dataset.yaml" +scale_conf_dir = f"{scale_home}/examples/llava/conf" +configyaml = f"{scale_conf_dir}/config.yaml" +trainyaml = f"{scale_conf_dir}/train/train_llava1.5_7b.yaml" +datasetyaml = f"{scale_home}/megatron/examples/multimodal/pretrain_dataset.yaml" +prompt = f"{scale_home}/megatron/examples/multimodal/manual_prompts.json" flagscale_chip_type = "H100" From 1228e219a0600bcfaf15f200b689e1b10ebc7860 Mon Sep 17 00:00:00 2001 From: shh2000 <13820618441@163.com> Date: Thu, 22 Aug 2024 15:42:24 +0800 Subject: [PATCH 4/6] add --- training/benchmarks/llava1.5_7b/flagscale/run_pretraining.py | 1 + .../nvidia/llava1.5_7b-flagscale/config/config_H100x1x8.py | 2 +- training/run_benchmarks/flagscale/start_flagscale_task.py | 3 --- 3 files changed, 2 insertions(+), 4 deletions(-) diff --git a/training/benchmarks/llava1.5_7b/flagscale/run_pretraining.py b/training/benchmarks/llava1.5_7b/flagscale/run_pretraining.py index f822bc35a..3d04a40ac 100644 --- a/training/benchmarks/llava1.5_7b/flagscale/run_pretraining.py +++ b/training/benchmarks/llava1.5_7b/flagscale/run_pretraining.py @@ -77,6 +77,7 @@ def replace_yamls(scale_home, config_module, args): dist_data["experiment"]["exp_dir"] = os.path.join(args.log_dir, "outputs_llava1.5") hosts = args.hosts.split(",") dist_data["experiment"]["runner"]["nnodes"] = len(hosts) + dist_data["experiment"]["runner"]["ssh_port"] = getattr(config_module, "flagscale_ssh_port") hostfile = os.path.join(scale_home, "hostfile") with open(hostfile, 'w') as f: for host in hosts: diff --git a/training/nvidia/llava1.5_7b-flagscale/config/config_H100x1x8.py b/training/nvidia/llava1.5_7b-flagscale/config/config_H100x1x8.py index 542dfc16a..1f1324f19 100644 --- a/training/nvidia/llava1.5_7b-flagscale/config/config_H100x1x8.py +++ b/training/nvidia/llava1.5_7b-flagscale/config/config_H100x1x8.py @@ -16,4 +16,4 @@ datasetyaml = f"{scale_home}/megatron/examples/multimodal/pretrain_dataset.yaml" prompt = f"{scale_home}/megatron/examples/multimodal/manual_prompts.json" flagscale_chip_type = "H100" - +flagscale_ssh_port = 60128 diff --git a/training/run_benchmarks/flagscale/start_flagscale_task.py b/training/run_benchmarks/flagscale/start_flagscale_task.py index 432e8dc3a..4ec02b196 100644 --- a/training/run_benchmarks/flagscale/start_flagscale_task.py +++ b/training/run_benchmarks/flagscale/start_flagscale_task.py @@ -1,8 +1,5 @@ #!/usr/bin/env python3 # -*- coding: UTF-8 -*- -'''This script is called in container to execute the real training task. - Support pytorch DDP only. -''' import os import sys import subprocess From 9c7a238af841af8148e4b2be4dfb2d390a943500 Mon Sep 17 00:00:00 2001 From: shh2000 <13820618441@163.com> Date: Fri, 23 Aug 2024 14:40:29 +0800 Subject: [PATCH 5/6] final --- .../llava1.5_7b/flagscale/run_pretraining.py | 36 ++++++++++++++++++- ...{config_H100x1x8.py => config_H100x4x8.py} | 2 ++ .../run_benchmarks/config/cluster_conf.py | 2 +- training/run_benchmarks/config/test_conf.py | 4 +-- 4 files changed, 40 insertions(+), 4 deletions(-) rename training/nvidia/llava1.5_7b-flagscale/config/{config_H100x1x8.py => config_H100x4x8.py} (87%) diff --git a/training/benchmarks/llava1.5_7b/flagscale/run_pretraining.py b/training/benchmarks/llava1.5_7b/flagscale/run_pretraining.py index 3d04a40ac..5ceb63d1c 100644 --- a/training/benchmarks/llava1.5_7b/flagscale/run_pretraining.py +++ b/training/benchmarks/llava1.5_7b/flagscale/run_pretraining.py @@ -4,6 +4,7 @@ import sys from importlib import import_module import yaml +import time def parse_args(): @@ -97,7 +98,7 @@ def replace_yamls(scale_home, config_module, args): train_data = yaml.safe_load(f) try: - train_data["system"]["checkpoint"]["save_interval"] = 20 + train_data["system"]["checkpoint"]["save_interval"] = 1000 train_data["system"]["checkpoint"]["pretrained_checkpoint"] = os.path.join(args.data_dir, "LLaVA_megatron", "vicuna_instruct_clip336_tp1_combined_mcore") train_data["model"]["train_iters"] = 5000 @@ -166,3 +167,36 @@ def replace_yamls(scale_home, config_module, args): with open(os.path.join(args.log_dir, "flagscale_main.log.txt"), 'w') as f: p = subprocess.Popen(exec_cmd, shell=True, stdout=f, stderr=subprocess.STDOUT) p.wait() + + timestamp_log_host = hosts[-1] + timestamp_log_noderank = len(hosts) - 1 + + timestamp_log_file = os.path.join(args.log_dir, "outputs_llava1.5", "logs", "host_" + str(timestamp_log_noderank) + "_" + timestamp_log_host + ".output") + + info_line = [] + while True: + try: + with open(timestamp_log_file, 'r') as f: + lines = f.readlines() + for line in lines: + if "elapsed time per iteration" in line: + info_line.append(line) + except Exception as e: + print("Maybe some errors") + if len(info_line) == 5000: + break + time.sleep(300) + + infos = [] + for line in info_line: + info = line.split("|")[2] + steptime = info.split(":")[1] + stepsecond = float(steptime) / 1000 + infos.append(stepsecond) + print(infos) + + ave_steptime = sum(infos[1:]) / len(infos[1:]) + tps = 2048 * 256 / ave_steptime / args.world_size + mfu = tps * 7E9 * 6 / getattr(module, "flops") + print(f"MFU: {mfu}") + diff --git a/training/nvidia/llava1.5_7b-flagscale/config/config_H100x1x8.py b/training/nvidia/llava1.5_7b-flagscale/config/config_H100x4x8.py similarity index 87% rename from training/nvidia/llava1.5_7b-flagscale/config/config_H100x1x8.py rename to training/nvidia/llava1.5_7b-flagscale/config/config_H100x4x8.py index 1f1324f19..0db97e2da 100644 --- a/training/nvidia/llava1.5_7b-flagscale/config/config_H100x1x8.py +++ b/training/nvidia/llava1.5_7b-flagscale/config/config_H100x4x8.py @@ -1,3 +1,4 @@ +# scale_parent must under FlagPerf/ or data_dir/, otherwise you cannot mount it into baremetal, therefore cannot use shared storage scale_parent = "/workspace" scale_home = f"{scale_parent}/FlagScale" @@ -17,3 +18,4 @@ prompt = f"{scale_home}/megatron/examples/multimodal/manual_prompts.json" flagscale_chip_type = "H100" flagscale_ssh_port = 60128 +flops = 989E12 diff --git a/training/run_benchmarks/config/cluster_conf.py b/training/run_benchmarks/config/cluster_conf.py index 9c35adb93..be628e197 100644 --- a/training/run_benchmarks/config/cluster_conf.py +++ b/training/run_benchmarks/config/cluster_conf.py @@ -1,7 +1,7 @@ '''Cluster configs''' # Hosts to run the benchmark. Each item is an IP address or a hostname. -HOSTS = ["10.1.2.155"] +HOSTS = ["10.1.2.2", "10.1.2.3", "10.1.2.4"] # Hosts port to run the tensorflow distribution_strategy = 'multi_worker_mirrored' HOSTS_PORTS = ["2222"] diff --git a/training/run_benchmarks/config/test_conf.py b/training/run_benchmarks/config/test_conf.py index 919b6decc..485f170cf 100755 --- a/training/run_benchmarks/config/test_conf.py +++ b/training/run_benchmarks/config/test_conf.py @@ -42,7 +42,7 @@ # The path that flagperf deploy in the cluster. # Users must set FLAGPERF_PATH to where flagperf deploy # You can assume the preset "/home/FlagPerf/training" points to Null -FLAGPERF_PATH = "/home/shihonghao/xlcscale/FlagPerf/training" +FLAGPERF_PATH = "/home/FlagPerf/training" # Set log path on the host here. FLAGPERF_LOG_PATH = FLAGPERF_PATH + "/result/" @@ -68,7 +68,7 @@ # "glm:pytorch_1.8:A100:1:8:1": "/raid/home_datasets_ckpt/glm/train/", # "cpm:pytorch_1.8:A100:1:8:1": "/raid/home_datasets_ckpt/cpm/train/", - "llava1.5_7b:flagscale_2409:H100:1:8:1": "/raid/dataset/LLAVA" + #"llava1.5_7b:flagscale_2409:H100:4:8:1": "/workspace/data_dir" # "llava1.5_7b:deepspeed-torch:A800:1:8:1": "/raid/dataset/LLAVA/", #"llama2_7b_finetune:pytorch_2.0.1:A100:1:1:1": "/raid/dataset/llama2_finetune/", #"aquila2_7b_finetune:flagscale:A800:1:8:1": "/raid/dataset/aquila2_7b_finetune", From 1e16104dfe719ebeb8765dd1c3cf781352370ebf Mon Sep 17 00:00:00 2001 From: shh2000 <13820618441@163.com> Date: Fri, 23 Aug 2024 14:45:59 +0800 Subject: [PATCH 6/6] fix --- .../llava1.5_7b/flagscale/run_pretraining.py | 82 +++++++++++++------ .../config/config_H100x4x8.py | 5 ++ training/run_benchmarks/config/test_conf.py | 9 +- 3 files changed, 65 insertions(+), 31 deletions(-) diff --git a/training/benchmarks/llava1.5_7b/flagscale/run_pretraining.py b/training/benchmarks/llava1.5_7b/flagscale/run_pretraining.py index 5ceb63d1c..29697fe5c 100644 --- a/training/benchmarks/llava1.5_7b/flagscale/run_pretraining.py +++ b/training/benchmarks/llava1.5_7b/flagscale/run_pretraining.py @@ -35,21 +35,30 @@ def install_scale(module, log_dir, debug_mode=False): logfile = os.path.join(install_logdir, "scale_download.log.txt") with open(logfile, 'w') as f: - p = subprocess.Popen(exec_cmd, shell=True, stdout=f, stderr=subprocess.STDOUT) + p = subprocess.Popen(exec_cmd, + shell=True, + stdout=f, + stderr=subprocess.STDOUT) p.wait() f.close() exec_cmd = getattr(module, "scale_install_cmd") logfile = os.path.join(install_logdir, "scale_install.log.txt") with open(logfile, 'w') as f: - p = subprocess.Popen(exec_cmd, shell=True, stdout=f, stderr=subprocess.STDOUT) + p = subprocess.Popen(exec_cmd, + shell=True, + stdout=f, + stderr=subprocess.STDOUT) p.wait() f.close() - + exec_cmd = getattr(module, "energon_locate_cmd") logfile = os.path.join(install_logdir, "energon_locate.log.txt") with open(logfile, 'w') as f: - p = subprocess.Popen(exec_cmd, shell=True, stdout=f, stderr=subprocess.STDOUT) + p = subprocess.Popen(exec_cmd, + shell=True, + stdout=f, + stderr=subprocess.STDOUT) p.wait() f.close() @@ -58,12 +67,16 @@ def install_scale(module, log_dir, debug_mode=False): print(energon_locate) src_dir = os.path.join(energon_locate, "megatron", "energon") - dst_dir = os.path.join(getattr(module, "scale_home"), "megatron", "megatron") + dst_dir = os.path.join(getattr(module, "scale_home"), "megatron", + "megatron") exec_cmd = f"cp -r {src_dir} {dst_dir}/" - + logfile = os.path.join(install_logdir, "energon_copy.log.txt") with open(logfile, 'w') as f: - p = subprocess.Popen(exec_cmd, shell=True, stdout=f, stderr=subprocess.STDOUT) + p = subprocess.Popen(exec_cmd, + shell=True, + stdout=f, + stderr=subprocess.STDOUT) p.wait() f.close() @@ -75,10 +88,12 @@ def replace_yamls(scale_home, config_module, args): dist_data = yaml.safe_load(f) try: - dist_data["experiment"]["exp_dir"] = os.path.join(args.log_dir, "outputs_llava1.5") + dist_data["experiment"]["exp_dir"] = os.path.join( + args.log_dir, "outputs_llava1.5") hosts = args.hosts.split(",") dist_data["experiment"]["runner"]["nnodes"] = len(hosts) - dist_data["experiment"]["runner"]["ssh_port"] = getattr(config_module, "flagscale_ssh_port") + dist_data["experiment"]["runner"]["ssh_port"] = getattr( + config_module, "flagscale_ssh_port") hostfile = os.path.join(scale_home, "hostfile") with open(hostfile, 'w') as f: for host in hosts: @@ -88,7 +103,9 @@ def replace_yamls(scale_home, config_module, args): dist_data["experiment"]["runner"]["hostfile"] = hostfile except Exception as e: print(e) - print("You're using an illegal config.yaml in flagscale. You must fix it") + print( + "You're using an illegal config.yaml in flagscale. You must fix it" + ) print(dist_data) @@ -99,32 +116,41 @@ def replace_yamls(scale_home, config_module, args): try: train_data["system"]["checkpoint"]["save_interval"] = 1000 - train_data["system"]["checkpoint"]["pretrained_checkpoint"] = os.path.join(args.data_dir, "LLaVA_megatron", "vicuna_instruct_clip336_tp1_combined_mcore") + train_data["system"]["checkpoint"][ + "pretrained_checkpoint"] = os.path.join( + args.data_dir, "LLaVA_megatron", + "vicuna_instruct_clip336_tp1_combined_mcore") - train_data["model"]["train_iters"] = 5000 + train_data["model"]["train_iters"] = getattr(config_module, "steps") train_data["model"].pop("img_embedding_idx", None) train_data["data"]["data_path"] = getattr(config_module, "datasetyaml") - train_data["data"]["valid_path"] = getattr(config_module, "datasetyaml") + train_data["data"]["valid_path"] = getattr(config_module, + "datasetyaml") train_data["data"]["prompt_path"] = getattr(config_module, "prompt") - train_data["data"]["tokenizer"]["tokenizer_model"] = os.path.join(args.data_dir, "vicuna-7b-v1___5/tokenizer.model") + train_data["data"]["tokenizer"]["tokenizer_model"] = os.path.join( + args.data_dir, "vicuna-7b-v1___5/tokenizer.model") except Exception as e: - print("You're using an illegal trainllava.yaml in flagscale. You must fix it") - + print( + "You're using an illegal trainllava.yaml in flagscale. You must fix it" + ) print(train_data) dataset_yaml = getattr(config_module, "datasetyaml") - + with open(dataset_yaml, 'r') as f: dataset_data = yaml.safe_load(f) try: llava_train_dir = os.path.join(args.data_dir, "LLaVA-Pretrain/wds") - dataset_data["splits"]["train"]["datasets"][0]["path"] = llava_train_dir + dataset_data["splits"]["train"]["datasets"][0][ + "path"] = llava_train_dir dataset_data["splits"]["val"]["datasets"][0]["path"] = llava_train_dir except Exception as e: - print("You're using an illegal dataset.yaml in flagscale. You must fix it") - + print( + "You're using an illegal dataset.yaml in flagscale. You must fix it" + ) + print(dataset_data) with open(dist_yaml, 'w') as f: @@ -143,7 +169,7 @@ def replace_yamls(scale_home, config_module, args): host = args.host_addr hosts = args.hosts.split(",") print(host, hosts) - + if host != hosts[0]: exit(0) @@ -162,16 +188,21 @@ def replace_yamls(scale_home, config_module, args): configyaml = getattr(module, "configyaml") configname = os.path.splitext(os.path.basename(configyaml))[0] exec_cmd = f"cd {scale_home}; python3 run.py --config-path {scale_conf_dir} --config-name {configname}" - + print(exec_cmd) with open(os.path.join(args.log_dir, "flagscale_main.log.txt"), 'w') as f: - p = subprocess.Popen(exec_cmd, shell=True, stdout=f, stderr=subprocess.STDOUT) + p = subprocess.Popen(exec_cmd, + shell=True, + stdout=f, + stderr=subprocess.STDOUT) p.wait() timestamp_log_host = hosts[-1] timestamp_log_noderank = len(hosts) - 1 - timestamp_log_file = os.path.join(args.log_dir, "outputs_llava1.5", "logs", "host_" + str(timestamp_log_noderank) + "_" + timestamp_log_host + ".output") + timestamp_log_file = os.path.join( + args.log_dir, "outputs_llava1.5", "logs", "host_" + + str(timestamp_log_noderank) + "_" + timestamp_log_host + ".output") info_line = [] while True: @@ -183,7 +214,7 @@ def replace_yamls(scale_home, config_module, args): info_line.append(line) except Exception as e: print("Maybe some errors") - if len(info_line) == 5000: + if len(info_line) == getattr(module, "steps"): break time.sleep(300) @@ -199,4 +230,3 @@ def replace_yamls(scale_home, config_module, args): tps = 2048 * 256 / ave_steptime / args.world_size mfu = tps * 7E9 * 6 / getattr(module, "flops") print(f"MFU: {mfu}") - diff --git a/training/nvidia/llava1.5_7b-flagscale/config/config_H100x4x8.py b/training/nvidia/llava1.5_7b-flagscale/config/config_H100x4x8.py index 0db97e2da..fa05ff97a 100644 --- a/training/nvidia/llava1.5_7b-flagscale/config/config_H100x4x8.py +++ b/training/nvidia/llava1.5_7b-flagscale/config/config_H100x4x8.py @@ -16,6 +16,11 @@ trainyaml = f"{scale_conf_dir}/train/train_llava1.5_7b.yaml" datasetyaml = f"{scale_home}/megatron/examples/multimodal/pretrain_dataset.yaml" prompt = f"{scale_home}/megatron/examples/multimodal/manual_prompts.json" + +# flagscale's requirements flagscale_chip_type = "H100" flagscale_ssh_port = 60128 flops = 989E12 + +# for llava's algorithm +steps = 5000 diff --git a/training/run_benchmarks/config/test_conf.py b/training/run_benchmarks/config/test_conf.py index 485f170cf..5c68f500f 100755 --- a/training/run_benchmarks/config/test_conf.py +++ b/training/run_benchmarks/config/test_conf.py @@ -85,14 +85,14 @@ # "resnet50:pytorch_1.8:A100:1:8:1": "/raid/dataset/ImageNet_1k_2012/", # "mask_rcnn:pytorch_1.8:A100:1:8:1": "/raid/dataset/maskrcnn/coco2017", # "dlrm:pytorch_1.10:A100:1:8:1": "/raid/dataset/criteo_1TB_click_logs/binary_dataset/", - + # "wav2vec2:pytorch_1.13:A100:1:8:1": "/raid/dataset/wav2vec2_data/LibriSpeech", # "WaveGlow:pytorch_1.13:A100:1:8:1": "/raid/dataset/LJSpeech/", # "resnet50:tensorflow2:A100:1:8:1": "/raid/dataset/ImageNet2012/tf_records/", # "moflow:pytorch_1.13:A100:1:8:1": "/raid/dataset/MoFlow/data/", # "distilbert:pytorch_1.12:A100:1:8:1": "/raid/dataset/distilbert/", - + # "transformer:pytorch_1.13:A100:1:8:1": "/raid/dataset/transformer/wmt14_en_de_joined_dict", # "swin_transformer:pytorch_1.8:A100:1:8:1": "/raid/dataset/ImageNet_1k_2012/", # "transformer_xl:pytorch_1.8:A100:1:8:1": "/raid/dataset/transformer_xl/", @@ -102,7 +102,7 @@ # "bert_hf:pytorch_1.13:A100:1:8:1": "/raid/dataset/bert_hf_train", # "longformer:pytorch_1.12:A100:1:8:1": "/raid/dataset/longformer_train/", # "detr:pytorch_1.13:A100:1:8:1": "/raid/dataset/detr/coco2017/", - + # "llama2_7b:deepspeed:A100:1:8:1": "/raid/dataset/llama2_7b_pretrain", # "aquila2_7b:flagscale:A100:1:8:1": "/raid/dataset/aquila2_7b_pretrain", # "llama2_70B:megatron:H800:4:8:1": "/raid/dataset/llama2_70B_pretrain", @@ -123,7 +123,7 @@ # "gpt3_13B:paddle_2.5.1:TP2PP1SH1SP4A10040G:1:8:1":"/raid/dataset/gpt-3/" # "gpt3_13B:paddle_2.5.1:TP2PP1SH2SP4A10040G:1:8:1":"/raid/dataset/gpt-3/" # "gpt3_13B:paddle_2.5.1:TP2PP4SH1SP1A10040G:1:8:1":"/raid/dataset/gpt-3/" - + # "qwen1.5_MoE:megatron_pai:A800:1:8:1":"/raid/datasets/qwen1.5_MoE/" # "mixtral_8x7B:megatron_core060:H100:4:8:1": "/raid/datasets/mistral" @@ -200,5 +200,4 @@ #"gpt3_13B:paddle_2.6.0:TP2PP1SH2SP4C50040G:1:8:1":"/raid/data_set/data-gpt3" #"gpt3_13B:paddle_2.6.0:TP1PP1SH2SP8C50080G:1:8:1":"/raid/data_set/data-gpt3" # "qwen1.5_MoE:megatron_pai:C500:1:8:1":"/raid/datasets/qwen1.5_MoE/" - }