From 405eb94c60ca5b461a35584dfb57cc2b8176f1ca Mon Sep 17 00:00:00 2001
From: shh2000 <13820618441@163.com>
Date: Wed, 21 Aug 2024 15:02:28 +0800
Subject: [PATCH 1/6] init scale

---
 .../llava1.5_7b/flagscale/run_pretraining.py  | 171 ++++++++++++++++++
 .../docker_image/flagscale_2409/Dockerfile    |   4 +
 .../flagscale_2409/flagscale_2409_install.sh  |   1 +
 .../config/config_H100x1x8.py                 |  15 ++
 .../config/requirements.txt                   |   1 +
 .../run_benchmarks/config/cluster_conf.py     |   4 +-
 training/run_benchmarks/config/test_conf.py   |   6 +-
 .../flagscale/start_flagscale_task.py         |   7 +
 8 files changed, 204 insertions(+), 5 deletions(-)
 create mode 100644 training/benchmarks/llava1.5_7b/flagscale/run_pretraining.py
 create mode 100644 training/nvidia/docker_image/flagscale_2409/Dockerfile
 create mode 100644 training/nvidia/docker_image/flagscale_2409/flagscale_2409_install.sh
 create mode 100644 training/nvidia/llava1.5_7b-flagscale/config/config_H100x1x8.py
 create mode 100644 training/nvidia/llava1.5_7b-flagscale/config/requirements.txt

diff --git a/training/benchmarks/llava1.5_7b/flagscale/run_pretraining.py b/training/benchmarks/llava1.5_7b/flagscale/run_pretraining.py
new file mode 100644
index 000000000..56203bb6d
--- /dev/null
+++ b/training/benchmarks/llava1.5_7b/flagscale/run_pretraining.py
@@ -0,0 +1,171 @@
+import subprocess
+from argparse import ArgumentParser
+import os
+import sys
+from importlib import import_module
+import yaml
+
+
+def parse_args():
+    '''Parse the launcher args: --world_size, --vendor, --data_dir, --hosts,
+       --host_addr, --log_dir and --flagperf_config_file (all required).
+       Unrecognized args are not an error; they are kept in args.unknown_args.
+    '''
+    parser = ArgumentParser(description="flagscale main python")
+    parser.add_argument("--world_size", type=int, required=True)
+    parser.add_argument("--vendor", type=str, required=True)
+    parser.add_argument("--data_dir", type=str, required=True)
+    parser.add_argument("--hosts", type=str, required=True)
+    parser.add_argument("--host_addr", type=str, required=True)
+    parser.add_argument("--log_dir", type=str, required=True)
+    parser.add_argument("--flagperf_config_file", type=str, required=True)
+    args, unknown_args = parser.parse_known_args()
+    args.unknown_args = unknown_args
+    return args
+
+
+def install_scale(scale_download_cmd, log_dir, scale_install_cmd, energon_locate_cmd, debug_mode=False):
+    if not debug_mode:
+        exec_cmd = scale_download_cmd.replace('<scale_home>', log_dir)
+        print(exec_cmd)
+
+        install_logdir = os.path.join(log_dir, "install_logs")
+        os.makedirs(install_logdir)
+
+        logfile = os.path.join(install_logdir, "scale_download.log.txt")
+        with open(logfile, 'w') as f:
+            p = subprocess.Popen(exec_cmd, shell=True, stdout=f, stderr=subprocess.STDOUT)
+        p.wait()
+        f.close()
+
+        exec_cmd = f"cd {log_dir}; {scale_install_cmd}"
+        logfile = os.path.join(install_logdir, "scale_install.log.txt")
+        with open(logfile, 'w') as f:
+            p = subprocess.Popen(exec_cmd, shell=True, stdout=f, stderr=subprocess.STDOUT)
+        p.wait()
+        f.close()
+        
+        exec_cmd = energon_locate_cmd
+        logfile = os.path.join(install_logdir, "energon_locate.log.txt")
+        with open(logfile, 'w') as f:
+            p = subprocess.Popen(exec_cmd, shell=True, stdout=f, stderr=subprocess.STDOUT)
+        p.wait()
+        f.close()
+
+        with open(logfile, 'r') as f:
+            energon_locate = f.readline().replace('\n', '')
+        print(energon_locate)
+
+        src_dir = os.path.join(energon_locate, "megatron", "energon")
+        dst_dir = os.path.join(log_dir, "FlagScale", "megatron", "megatron")
+        exec_cmd = f"cp -r {src_dir} {dst_dir}/"
+        
+        logfile = os.path.join(install_logdir, "energon_copy.log.txt")
+        with open(logfile, 'w') as f:
+            p = subprocess.Popen(exec_cmd, shell=True, stdout=f, stderr=subprocess.STDOUT)
+        p.wait()
+        f.close()
+
+
+def replace_yamls(scale_home, config_module, args):
+    scale_conf_dir = os.path.join(scale_home, getattr(config_module, "scale_conf_dir"))
+    dist_yaml = os.path.join(scale_conf_dir, getattr(config_module, "configyaml"))
+    with open(dist_yaml, 'r') as f:
+        dist_data = yaml.safe_load(f)
+
+    try:
+        dist_data["experiment"]["exp_dir"] = os.path.join(scale_home, "outputs_llava1.5")
+        hosts = args.hosts.split(",")
+        dist_data["experiment"]["runner"]["nnodes"] = len(hosts)
+        hostfile = os.path.join(scale_home, "hostfile")
+        with open(hostfile, 'w') as f:
+            for host in hosts:
+                slots = dist_data["experiment"]["runner"]["nproc_per_node"]
+                chiptype = getattr(config_module, "flagscale_chip_type")
+                f.write(f"{host} slots={slots} type={chiptype}\n")
+        dist_data["experiment"]["runner"]["hostfile"] = hostfile
+    except Exception as e:
+        print(e)
+        print("You're using an illegal config.yaml in flagscale. You must fix it")
+
+    print(dist_data)
+
+    train_yaml = os.path.join(scale_conf_dir, getattr(config_module, "trainyaml"))
+
+    with open(train_yaml, 'r') as f:
+        train_data = yaml.safe_load(f)
+
+    try:
+        train_data["system"]["checkpoint"]["save_interval"] = 20
+        train_data["system"]["checkpoint"]["pretrained_checkpoint"] = os.path.join(args.data_dir, "LLaVA_megatron", "vicuna_instruct_clip336_tp1_combined_mcore")
+
+        train_data["model"]["train_iters"] = 5000
+        train_data["model"].pop("img_embedding_idx", None)
+        train_data["data"]["data_path"] = os.path.join(scale_home, getattr(config_module, "datasetyaml"))
+        train_data["data"]["valid_path"] = os.path.join(scale_home, getattr(config_module, "datasetyaml"))
+        train_data["data"]["prompt_path"] = os.path.join(scale_home, "megatron/examples/multimodal/manual_prompts.json")
+        train_data["data"]["tokenizer"]["tokenizer_model"] = os.path.join(args.data_dir, "vicuna-7b-v1___5/tokenizer.model")
+    except Exception as e:
+        print("You're using an illegal trainllava.yaml in flagscale. You must fix it")
+
+
+    print(train_data)
+
+    dataset_yaml = os.path.join(scale_home, getattr(config_module, "datasetyaml"))
+    
+    with open(dataset_yaml, 'r') as f:
+        dataset_data = yaml.safe_load(f)
+
+    try:
+        llava_train_dir = os.path.join(args.data_dir, "LLaVA-Pretrain/wds")
+        dataset_data["splits"]["train"]["datasets"][0]["path"] = llava_train_dir
+        dataset_data["splits"]["val"]["datasets"][0]["path"] = llava_train_dir
+    except Exception as e:
+        print("You're using an illegal dataset.yaml in flagscale. You must fix it")
+    
+    print(dataset_data)
+
+    with open(dist_yaml, 'w') as f:
+        yaml.safe_dump(dist_data, f)
+
+    with open(train_yaml, 'w') as f:
+        yaml.safe_dump(train_data, f)
+
+    with open(dataset_yaml, 'w') as f:
+        yaml.safe_dump(dataset_data, f)
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    print(args)
+    host = args.host_addr
+    hosts = args.hosts.split(",")
+    print(host, hosts)
+    
+    if host != hosts[0]:
+        exit(0)
+
+    sys.path.append(os.path.dirname(args.flagperf_config_file))
+    config_file = os.path.basename(args.flagperf_config_file).split('.')[0]
+
+    module = import_module(config_file)
+
+    scale_download_cmd = getattr(module, 'scale_download_cmd')
+    scale_install_cmd = getattr(module, 'scale_install_cmd')
+    energon_locate_cmd = getattr(module, 'energon_locate_cmd')
+
+    install_scale(scale_download_cmd, args.log_dir, scale_install_cmd, energon_locate_cmd)
+
+    scale_home = os.path.join(args.log_dir, "FlagScale")
+    replace_yamls(scale_home, module, args)
+
+    scale_conf_dir = getattr(module, "scale_conf_dir")
+    configyaml = getattr(module, "configyaml")
+    configname = os.path.splitext(os.path.basename(configyaml))
+    exec_cmd = f"cd {scale_home}; python3 run.py --config-path {scale_conf_dir} --config-name {configname}"
+    
+    print(exec_cmd)
+    exit(0)
+    with open(os.path.join(args.log_dir, "flagscale_main.log.txt"), 'w') as f:
+        p = subprocess.Popen(exec_cmd, shell=True, stdout=f, stderr=subprocess.STDOUT)
+        p.wait()
diff --git a/training/nvidia/docker_image/flagscale_2409/Dockerfile b/training/nvidia/docker_image/flagscale_2409/Dockerfile
new file mode 100644
index 000000000..45e28b6f8
--- /dev/null
+++ b/training/nvidia/docker_image/flagscale_2409/Dockerfile
@@ -0,0 +1,4 @@
+FROM base-harbor.platform-sz.jingneng-inner.ac.cn/airs-user/6907316d-94d9-469a-b481-1bdf0bfe2287_9f3b64c6-acad-4186-8693-864997cc7e10_aoyulong/flagscale:20240522120728
+RUN /bin/bash -c "pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple"
+RUN /bin/bash -c "uname -a"
+RUN /bin/bash -c alias python3=python
diff --git a/training/nvidia/docker_image/flagscale_2409/flagscale_2409_install.sh b/training/nvidia/docker_image/flagscale_2409/flagscale_2409_install.sh
new file mode 100644
index 000000000..a9bf588e2
--- /dev/null
+++ b/training/nvidia/docker_image/flagscale_2409/flagscale_2409_install.sh
@@ -0,0 +1 @@
+#!/bin/bash
diff --git a/training/nvidia/llava1.5_7b-flagscale/config/config_H100x1x8.py b/training/nvidia/llava1.5_7b-flagscale/config/config_H100x1x8.py
new file mode 100644
index 000000000..9fdd25c64
--- /dev/null
+++ b/training/nvidia/llava1.5_7b-flagscale/config/config_H100x1x8.py
@@ -0,0 +1,15 @@
+# this cmd should install scale at <scale_home>. <scale_home> is set by flagperf.training.benchmarks.llava1.5_7b.flagscale.run_pretraining.py
+scale_download_cmd = "cd <scale_home>; git clone https://github.com/FlagOpen/FlagScale.git; cd FlagScale; git checkout 604f79b"
+
+# NV need nothing because all requirements have been established in base docker image. vendor can do anything related here
+scale_install_cmd = ""
+
+# locate energon. the copy from energon_install_path to flagscale/megatron/ is done by flagperf...run_pretraining.py
+energon_locate_cmd = r"pip show megatron-energon | grep Location | awk -F: '{print $2}' | xargs"
+
+scale_conf_dir = "examples/llava/conf"
+configyaml = "config.yaml"
+trainyaml = "train/train_llava1.5_7b.yaml"
+datasetyaml = "megatron/examples/multimodal/pretrain_dataset.yaml"
+flagscale_chip_type = "H100"
+
diff --git a/training/nvidia/llava1.5_7b-flagscale/config/requirements.txt b/training/nvidia/llava1.5_7b-flagscale/config/requirements.txt
new file mode 100644
index 000000000..4f0d1d961
--- /dev/null
+++ b/training/nvidia/llava1.5_7b-flagscale/config/requirements.txt
@@ -0,0 +1 @@
+megatron-energon==2.2.0
diff --git a/training/run_benchmarks/config/cluster_conf.py b/training/run_benchmarks/config/cluster_conf.py
index 0723c086c..9c35adb93 100644
--- a/training/run_benchmarks/config/cluster_conf.py
+++ b/training/run_benchmarks/config/cluster_conf.py
@@ -1,7 +1,7 @@
 '''Cluster configs'''
 
 # Hosts to run the benchmark. Each item is an IP address or a hostname.
-HOSTS = ["10.1.2.2", "10.1.2.3", "10.1.2.4"]
+HOSTS = ["10.1.2.155"]
 
 # Hosts port to run the tensorflow distribution_strategy = 'multi_worker_mirrored'
 HOSTS_PORTS = ["2222"]
@@ -10,4 +10,4 @@
 MASTER_PORT = "29501"
 
 # ssh connection port
-SSH_PORT = "22"
\ No newline at end of file
+SSH_PORT = "22"
diff --git a/training/run_benchmarks/config/test_conf.py b/training/run_benchmarks/config/test_conf.py
index 47455d7cd..919b6decc 100755
--- a/training/run_benchmarks/config/test_conf.py
+++ b/training/run_benchmarks/config/test_conf.py
@@ -37,12 +37,12 @@
 ACCE_VISIBLE_DEVICE_ENV_NAME = "CUDA_VISIBLE_DEVICES"
 
 # Set pip source, which will be used in preparing envs in container
-PIP_SOURCE = "https://mirror.baidu.com/pypi/simple"
+PIP_SOURCE = "https://pypi.tuna.tsinghua.edu.cn/simple"
 
 # The path that flagperf deploy in the cluster.
 # Users must set FLAGPERF_PATH to where flagperf deploy
 # You can assume the preset "/home/FlagPerf/training" points to Null
-FLAGPERF_PATH = "/home/FlagPerf/training"
+FLAGPERF_PATH = "/home/shihonghao/xlcscale/FlagPerf/training"
 # Set log path on the host here.
 FLAGPERF_LOG_PATH = FLAGPERF_PATH + "/result/"
 
@@ -68,7 +68,7 @@
     # "glm:pytorch_1.8:A100:1:8:1": "/raid/home_datasets_ckpt/glm/train/",
     # "cpm:pytorch_1.8:A100:1:8:1": "/raid/home_datasets_ckpt/cpm/train/",
 
-
+    "llava1.5_7b:flagscale_2409:H100:1:8:1": "/raid/dataset/LLAVA"
     # "llava1.5_7b:deepspeed-torch:A800:1:8:1": "/raid/dataset/LLAVA/",
     #"llama2_7b_finetune:pytorch_2.0.1:A100:1:1:1": "/raid/dataset/llama2_finetune/",
     #"aquila2_7b_finetune:flagscale:A800:1:8:1": "/raid/dataset/aquila2_7b_finetune",
diff --git a/training/run_benchmarks/flagscale/start_flagscale_task.py b/training/run_benchmarks/flagscale/start_flagscale_task.py
index 46fd1e8b2..432e8dc3a 100644
--- a/training/run_benchmarks/flagscale/start_flagscale_task.py
+++ b/training/run_benchmarks/flagscale/start_flagscale_task.py
@@ -48,6 +48,10 @@ def parse_args():
                         type=int,
                         required=True,
                         help="how many processes will run on each host.")
+    parser.add_argument("--hosts",
+                        type=str,
+                        required=True,
+                        help="hosts to run the testcase.")
 
     parser.add_argument("--vendor",
                         type=str,
@@ -120,6 +124,8 @@ def main():
     exec_cmd = "cd " + os.path.dirname(train_script_path) + ";"
     exec_cmd = exec_cmd + "python run_pretraining.py"
     exec_cmd = exec_cmd + " --world_size=" + str(task_args.nproc)
+    exec_cmd = exec_cmd + " --hosts=" + task_args.hosts
+    exec_cmd = exec_cmd + " --host_addr=" + task_args.host_addr
     exec_cmd = exec_cmd + " --vendor=" + task_args.vendor
     exec_cmd = exec_cmd + " --data_dir=" + task_args.data_dir
     exec_cmd = exec_cmd + " --log_dir=" + task_log_dir
@@ -127,6 +133,7 @@ def main():
 
     task_log_file = os.path.join(task_log_dir, "rank0.out.log")
 
+    START_LOGGER.info(exec_cmd)
     with open(task_log_file, "w") as f:
         p = subprocess.Popen(exec_cmd,
                              shell=True,

From f7f3f40e28faba649c72303ca12647a53f6a492c Mon Sep 17 00:00:00 2001
From: shh2000 <13820618441@163.com>
Date: Wed, 21 Aug 2024 15:17:20 +0800
Subject: [PATCH 2/6] fix

---
 training/benchmarks/llava1.5_7b/flagscale/run_pretraining.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/training/benchmarks/llava1.5_7b/flagscale/run_pretraining.py b/training/benchmarks/llava1.5_7b/flagscale/run_pretraining.py
index 56203bb6d..52af7ec30 100644
--- a/training/benchmarks/llava1.5_7b/flagscale/run_pretraining.py
+++ b/training/benchmarks/llava1.5_7b/flagscale/run_pretraining.py
@@ -74,7 +74,7 @@ def replace_yamls(scale_home, config_module, args):
         dist_data = yaml.safe_load(f)
 
     try:
-        dist_data["experiment"]["exp_dir"] = os.path.join(scale_home, "outputs_llava1.5")
+        dist_data["experiment"]["exp_dir"] = os.path.join(args.log_dir, "outputs_llava1.5")
         hosts = args.hosts.split(",")
         dist_data["experiment"]["runner"]["nnodes"] = len(hosts)
         hostfile = os.path.join(scale_home, "hostfile")

From 01bcf3290aa8dd88b827ee7104b42498e0437096 Mon Sep 17 00:00:00 2001
From: shh2000 <13820618441@163.com>
Date: Thu, 22 Aug 2024 15:34:13 +0800
Subject: [PATCH 3/6] upd path

---
 .../llava1.5_7b/flagscale/run_pretraining.py  | 36 +++++++++----------
 .../config/config_H100x1x8.py                 | 14 +++++---
 2 files changed, 25 insertions(+), 25 deletions(-)

diff --git a/training/benchmarks/llava1.5_7b/flagscale/run_pretraining.py b/training/benchmarks/llava1.5_7b/flagscale/run_pretraining.py
index 52af7ec30..f822bc35a 100644
--- a/training/benchmarks/llava1.5_7b/flagscale/run_pretraining.py
+++ b/training/benchmarks/llava1.5_7b/flagscale/run_pretraining.py
@@ -24,9 +24,9 @@ def parse_args():
     return args
 
 
-def install_scale(scale_download_cmd, log_dir, scale_install_cmd, energon_locate_cmd, debug_mode=False):
+def install_scale(module, log_dir, debug_mode=False):
     if not debug_mode:
-        exec_cmd = scale_download_cmd.replace('<scale_home>', log_dir)
+        exec_cmd = getattr(module, "scale_download_cmd")
         print(exec_cmd)
 
         install_logdir = os.path.join(log_dir, "install_logs")
@@ -38,14 +38,14 @@ def install_scale(scale_download_cmd, log_dir, scale_install_cmd, energon_locate
         p.wait()
         f.close()
 
-        exec_cmd = f"cd {log_dir}; {scale_install_cmd}"
+        exec_cmd = getattr(module, "scale_install_cmd")
         logfile = os.path.join(install_logdir, "scale_install.log.txt")
         with open(logfile, 'w') as f:
             p = subprocess.Popen(exec_cmd, shell=True, stdout=f, stderr=subprocess.STDOUT)
         p.wait()
         f.close()
         
-        exec_cmd = energon_locate_cmd
+        exec_cmd = getattr(module, "energon_locate_cmd")
         logfile = os.path.join(install_logdir, "energon_locate.log.txt")
         with open(logfile, 'w') as f:
             p = subprocess.Popen(exec_cmd, shell=True, stdout=f, stderr=subprocess.STDOUT)
@@ -57,7 +57,7 @@ def install_scale(scale_download_cmd, log_dir, scale_install_cmd, energon_locate
         print(energon_locate)
 
         src_dir = os.path.join(energon_locate, "megatron", "energon")
-        dst_dir = os.path.join(log_dir, "FlagScale", "megatron", "megatron")
+        dst_dir = os.path.join(getattr(module, "scale_home"), "megatron", "megatron")
         exec_cmd = f"cp -r {src_dir} {dst_dir}/"
         
         logfile = os.path.join(install_logdir, "energon_copy.log.txt")
@@ -68,8 +68,8 @@ def install_scale(scale_download_cmd, log_dir, scale_install_cmd, energon_locate
 
 
 def replace_yamls(scale_home, config_module, args):
-    scale_conf_dir = os.path.join(scale_home, getattr(config_module, "scale_conf_dir"))
-    dist_yaml = os.path.join(scale_conf_dir, getattr(config_module, "configyaml"))
+    scale_conf_dir = getattr(config_module, "scale_conf_dir")
+    dist_yaml = getattr(config_module, "configyaml")
     with open(dist_yaml, 'r') as f:
         dist_data = yaml.safe_load(f)
 
@@ -90,7 +90,7 @@ def replace_yamls(scale_home, config_module, args):
 
     print(dist_data)
 
-    train_yaml = os.path.join(scale_conf_dir, getattr(config_module, "trainyaml"))
+    train_yaml = getattr(config_module, "trainyaml")
 
     with open(train_yaml, 'r') as f:
         train_data = yaml.safe_load(f)
@@ -101,9 +101,9 @@ def replace_yamls(scale_home, config_module, args):
 
         train_data["model"]["train_iters"] = 5000
         train_data["model"].pop("img_embedding_idx", None)
-        train_data["data"]["data_path"] = os.path.join(scale_home, getattr(config_module, "datasetyaml"))
-        train_data["data"]["valid_path"] = os.path.join(scale_home, getattr(config_module, "datasetyaml"))
-        train_data["data"]["prompt_path"] = os.path.join(scale_home, "megatron/examples/multimodal/manual_prompts.json")
+        train_data["data"]["data_path"] = getattr(config_module, "datasetyaml")
+        train_data["data"]["valid_path"] = getattr(config_module, "datasetyaml")
+        train_data["data"]["prompt_path"] = getattr(config_module, "prompt")
         train_data["data"]["tokenizer"]["tokenizer_model"] = os.path.join(args.data_dir, "vicuna-7b-v1___5/tokenizer.model")
     except Exception as e:
         print("You're using an illegal trainllava.yaml in flagscale. You must fix it")
@@ -111,7 +111,7 @@ def replace_yamls(scale_home, config_module, args):
 
     print(train_data)
 
-    dataset_yaml = os.path.join(scale_home, getattr(config_module, "datasetyaml"))
+    dataset_yaml = getattr(config_module, "datasetyaml")
     
     with open(dataset_yaml, 'r') as f:
         dataset_data = yaml.safe_load(f)
@@ -149,23 +149,19 @@ def replace_yamls(scale_home, config_module, args):
     config_file = os.path.basename(args.flagperf_config_file).split('.')[0]
 
     module = import_module(config_file)
+    print(module)
+    scale_home = getattr(module, "scale_home")
 
-    scale_download_cmd = getattr(module, 'scale_download_cmd')
-    scale_install_cmd = getattr(module, 'scale_install_cmd')
-    energon_locate_cmd = getattr(module, 'energon_locate_cmd')
+    install_scale(module, args.log_dir)
 
-    install_scale(scale_download_cmd, args.log_dir, scale_install_cmd, energon_locate_cmd)
-
-    scale_home = os.path.join(args.log_dir, "FlagScale")
     replace_yamls(scale_home, module, args)
 
     scale_conf_dir = getattr(module, "scale_conf_dir")
     configyaml = getattr(module, "configyaml")
-    configname = os.path.splitext(os.path.basename(configyaml))
+    configname = os.path.splitext(os.path.basename(configyaml))[0]
     exec_cmd = f"cd {scale_home}; python3 run.py --config-path {scale_conf_dir} --config-name {configname}"
     
     print(exec_cmd)
-    exit(0)
     with open(os.path.join(args.log_dir, "flagscale_main.log.txt"), 'w') as f:
         p = subprocess.Popen(exec_cmd, shell=True, stdout=f, stderr=subprocess.STDOUT)
         p.wait()
diff --git a/training/nvidia/llava1.5_7b-flagscale/config/config_H100x1x8.py b/training/nvidia/llava1.5_7b-flagscale/config/config_H100x1x8.py
index 9fdd25c64..542dfc16a 100644
--- a/training/nvidia/llava1.5_7b-flagscale/config/config_H100x1x8.py
+++ b/training/nvidia/llava1.5_7b-flagscale/config/config_H100x1x8.py
@@ -1,5 +1,8 @@
+scale_parent = "/workspace"
+scale_home = f"{scale_parent}/FlagScale"
+
 # this cmd should install scale at <scale_home>. <scale_home> is set by flagperf.training.benchmarks.llava1.5_7b.flagscale.run_pretraining.py
-scale_download_cmd = "cd <scale_home>; git clone https://github.com/FlagOpen/FlagScale.git; cd FlagScale; git checkout 604f79b"
+scale_download_cmd = f"cd {scale_parent}; git clone https://github.com/FlagOpen/FlagScale.git; cd FlagScale; git checkout 604f79b"
 
 # NV need nothing because all requirements have been established in base docker image. vendor can do anything related here
 scale_install_cmd = ""
@@ -7,9 +10,10 @@
 # locate energon. the copy from energon_install_path to flagscale/megatron/ is done by flagperf...run_pretraining.py
 energon_locate_cmd = r"pip show megatron-energon | grep Location | awk -F: '{print $2}' | xargs"
 
-scale_conf_dir = "examples/llava/conf"
-configyaml = "config.yaml"
-trainyaml = "train/train_llava1.5_7b.yaml"
-datasetyaml = "megatron/examples/multimodal/pretrain_dataset.yaml"
+scale_conf_dir = f"{scale_home}/examples/llava/conf"
+configyaml = f"{scale_conf_dir}/config.yaml"
+trainyaml = f"{scale_conf_dir}/train/train_llava1.5_7b.yaml"
+datasetyaml = f"{scale_home}/megatron/examples/multimodal/pretrain_dataset.yaml"
+prompt = f"{scale_home}/megatron/examples/multimodal/manual_prompts.json"
 flagscale_chip_type = "H100"
 

From 1228e219a0600bcfaf15f200b689e1b10ebc7860 Mon Sep 17 00:00:00 2001
From: shh2000 <13820618441@163.com>
Date: Thu, 22 Aug 2024 15:42:24 +0800
Subject: [PATCH 4/6] add

---
 training/benchmarks/llava1.5_7b/flagscale/run_pretraining.py   | 1 +
 .../nvidia/llava1.5_7b-flagscale/config/config_H100x1x8.py     | 2 +-
 training/run_benchmarks/flagscale/start_flagscale_task.py      | 3 ---
 3 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/training/benchmarks/llava1.5_7b/flagscale/run_pretraining.py b/training/benchmarks/llava1.5_7b/flagscale/run_pretraining.py
index f822bc35a..3d04a40ac 100644
--- a/training/benchmarks/llava1.5_7b/flagscale/run_pretraining.py
+++ b/training/benchmarks/llava1.5_7b/flagscale/run_pretraining.py
@@ -77,6 +77,7 @@ def replace_yamls(scale_home, config_module, args):
         dist_data["experiment"]["exp_dir"] = os.path.join(args.log_dir, "outputs_llava1.5")
         hosts = args.hosts.split(",")
         dist_data["experiment"]["runner"]["nnodes"] = len(hosts)
+        dist_data["experiment"]["runner"]["ssh_port"] = getattr(config_module, "flagscale_ssh_port")
         hostfile = os.path.join(scale_home, "hostfile")
         with open(hostfile, 'w') as f:
             for host in hosts:
diff --git a/training/nvidia/llava1.5_7b-flagscale/config/config_H100x1x8.py b/training/nvidia/llava1.5_7b-flagscale/config/config_H100x1x8.py
index 542dfc16a..1f1324f19 100644
--- a/training/nvidia/llava1.5_7b-flagscale/config/config_H100x1x8.py
+++ b/training/nvidia/llava1.5_7b-flagscale/config/config_H100x1x8.py
@@ -16,4 +16,4 @@
 datasetyaml = f"{scale_home}/megatron/examples/multimodal/pretrain_dataset.yaml"
 prompt = f"{scale_home}/megatron/examples/multimodal/manual_prompts.json"
 flagscale_chip_type = "H100"
-
+flagscale_ssh_port = 60128
diff --git a/training/run_benchmarks/flagscale/start_flagscale_task.py b/training/run_benchmarks/flagscale/start_flagscale_task.py
index 432e8dc3a..4ec02b196 100644
--- a/training/run_benchmarks/flagscale/start_flagscale_task.py
+++ b/training/run_benchmarks/flagscale/start_flagscale_task.py
@@ -1,8 +1,5 @@
 #!/usr/bin/env python3
 # -*- coding: UTF-8 -*-
-'''This script is called in container to execute the real training task.
-   Support pytorch DDP only.
-'''
 import os
 import sys
 import subprocess

From 9c7a238af841af8148e4b2be4dfb2d390a943500 Mon Sep 17 00:00:00 2001
From: shh2000 <13820618441@163.com>
Date: Fri, 23 Aug 2024 14:40:29 +0800
Subject: [PATCH 5/6] final

---
 .../llava1.5_7b/flagscale/run_pretraining.py  | 36 ++++++++++++++++++-
 ...{config_H100x1x8.py => config_H100x4x8.py} |  2 ++
 .../run_benchmarks/config/cluster_conf.py     |  2 +-
 training/run_benchmarks/config/test_conf.py   |  4 +--
 4 files changed, 40 insertions(+), 4 deletions(-)
 rename training/nvidia/llava1.5_7b-flagscale/config/{config_H100x1x8.py => config_H100x4x8.py} (87%)

diff --git a/training/benchmarks/llava1.5_7b/flagscale/run_pretraining.py b/training/benchmarks/llava1.5_7b/flagscale/run_pretraining.py
index 3d04a40ac..5ceb63d1c 100644
--- a/training/benchmarks/llava1.5_7b/flagscale/run_pretraining.py
+++ b/training/benchmarks/llava1.5_7b/flagscale/run_pretraining.py
@@ -4,6 +4,7 @@
 import sys
 from importlib import import_module
 import yaml
+import time
 
 
 def parse_args():
@@ -97,7 +98,7 @@ def replace_yamls(scale_home, config_module, args):
         train_data = yaml.safe_load(f)
 
     try:
-        train_data["system"]["checkpoint"]["save_interval"] = 20
+        train_data["system"]["checkpoint"]["save_interval"] = 1000
         train_data["system"]["checkpoint"]["pretrained_checkpoint"] = os.path.join(args.data_dir, "LLaVA_megatron", "vicuna_instruct_clip336_tp1_combined_mcore")
 
         train_data["model"]["train_iters"] = 5000
@@ -166,3 +167,36 @@ def replace_yamls(scale_home, config_module, args):
     with open(os.path.join(args.log_dir, "flagscale_main.log.txt"), 'w') as f:
         p = subprocess.Popen(exec_cmd, shell=True, stdout=f, stderr=subprocess.STDOUT)
         p.wait()
+
+    timestamp_log_host = hosts[-1]
+    timestamp_log_noderank = len(hosts) - 1
+
+    timestamp_log_file = os.path.join(args.log_dir, "outputs_llava1.5", "logs", "host_" + str(timestamp_log_noderank) + "_" + timestamp_log_host + ".output")
+
+    info_line = []
+    while True:
+        try:
+            with open(timestamp_log_file, 'r') as f:
+                lines = f.readlines()
+                for line in lines:
+                    if "elapsed time per iteration" in line:
+                        info_line.append(line)
+        except Exception as e:
+            print("Maybe some errors")
+        if len(info_line) == 5000:
+            break
+        time.sleep(300)
+
+    infos = []
+    for line in info_line:
+        info = line.split("|")[2]
+        steptime = info.split(":")[1]
+        stepsecond = float(steptime) / 1000
+        infos.append(stepsecond)
+    print(infos)
+
+    ave_steptime = sum(infos[1:]) / len(infos[1:])
+    tps = 2048 * 256 / ave_steptime / args.world_size
+    mfu = tps * 7E9 * 6 / getattr(module, "flops")
+    print(f"MFU: {mfu}")
+
diff --git a/training/nvidia/llava1.5_7b-flagscale/config/config_H100x1x8.py b/training/nvidia/llava1.5_7b-flagscale/config/config_H100x4x8.py
similarity index 87%
rename from training/nvidia/llava1.5_7b-flagscale/config/config_H100x1x8.py
rename to training/nvidia/llava1.5_7b-flagscale/config/config_H100x4x8.py
index 1f1324f19..0db97e2da 100644
--- a/training/nvidia/llava1.5_7b-flagscale/config/config_H100x1x8.py
+++ b/training/nvidia/llava1.5_7b-flagscale/config/config_H100x4x8.py
@@ -1,3 +1,4 @@
+# scale_parent must under FlagPerf/ or data_dir/, otherwise you cannot mount it into baremetal, therefore cannot use shared storage
 scale_parent = "/workspace"
 scale_home = f"{scale_parent}/FlagScale"
 
@@ -17,3 +18,4 @@
 prompt = f"{scale_home}/megatron/examples/multimodal/manual_prompts.json"
 flagscale_chip_type = "H100"
 flagscale_ssh_port = 60128
+flops = 989E12
diff --git a/training/run_benchmarks/config/cluster_conf.py b/training/run_benchmarks/config/cluster_conf.py
index 9c35adb93..be628e197 100644
--- a/training/run_benchmarks/config/cluster_conf.py
+++ b/training/run_benchmarks/config/cluster_conf.py
@@ -1,7 +1,7 @@
 '''Cluster configs'''
 
 # Hosts to run the benchmark. Each item is an IP address or a hostname.
-HOSTS = ["10.1.2.155"]
+HOSTS = ["10.1.2.2", "10.1.2.3", "10.1.2.4"]
 
 # Hosts port to run the tensorflow distribution_strategy = 'multi_worker_mirrored'
 HOSTS_PORTS = ["2222"]
diff --git a/training/run_benchmarks/config/test_conf.py b/training/run_benchmarks/config/test_conf.py
index 919b6decc..485f170cf 100755
--- a/training/run_benchmarks/config/test_conf.py
+++ b/training/run_benchmarks/config/test_conf.py
@@ -42,7 +42,7 @@
 # The path that flagperf deploy in the cluster.
 # Users must set FLAGPERF_PATH to where flagperf deploy
 # You can assume the preset "/home/FlagPerf/training" points to Null
-FLAGPERF_PATH = "/home/shihonghao/xlcscale/FlagPerf/training"
+FLAGPERF_PATH = "/home/FlagPerf/training"
 # Set log path on the host here.
 FLAGPERF_LOG_PATH = FLAGPERF_PATH + "/result/"
 
@@ -68,7 +68,7 @@
     # "glm:pytorch_1.8:A100:1:8:1": "/raid/home_datasets_ckpt/glm/train/",
     # "cpm:pytorch_1.8:A100:1:8:1": "/raid/home_datasets_ckpt/cpm/train/",
 
-    "llava1.5_7b:flagscale_2409:H100:1:8:1": "/raid/dataset/LLAVA"
+    #"llava1.5_7b:flagscale_2409:H100:4:8:1": "/workspace/data_dir"
     # "llava1.5_7b:deepspeed-torch:A800:1:8:1": "/raid/dataset/LLAVA/",
     #"llama2_7b_finetune:pytorch_2.0.1:A100:1:1:1": "/raid/dataset/llama2_finetune/",
     #"aquila2_7b_finetune:flagscale:A800:1:8:1": "/raid/dataset/aquila2_7b_finetune",

From 1e16104dfe719ebeb8765dd1c3cf781352370ebf Mon Sep 17 00:00:00 2001
From: shh2000 <13820618441@163.com>
Date: Fri, 23 Aug 2024 14:45:59 +0800
Subject: [PATCH 6/6] fix

---
 .../llava1.5_7b/flagscale/run_pretraining.py  | 82 +++++++++++++------
 .../config/config_H100x4x8.py                 |  5 ++
 training/run_benchmarks/config/test_conf.py   |  9 +-
 3 files changed, 65 insertions(+), 31 deletions(-)

diff --git a/training/benchmarks/llava1.5_7b/flagscale/run_pretraining.py b/training/benchmarks/llava1.5_7b/flagscale/run_pretraining.py
index 5ceb63d1c..29697fe5c 100644
--- a/training/benchmarks/llava1.5_7b/flagscale/run_pretraining.py
+++ b/training/benchmarks/llava1.5_7b/flagscale/run_pretraining.py
@@ -35,21 +35,30 @@ def install_scale(module, log_dir, debug_mode=False):
 
         logfile = os.path.join(install_logdir, "scale_download.log.txt")
         with open(logfile, 'w') as f:
-            p = subprocess.Popen(exec_cmd, shell=True, stdout=f, stderr=subprocess.STDOUT)
+            p = subprocess.Popen(exec_cmd,
+                                 shell=True,
+                                 stdout=f,
+                                 stderr=subprocess.STDOUT)
         p.wait()
         f.close()
 
         exec_cmd = getattr(module, "scale_install_cmd")
         logfile = os.path.join(install_logdir, "scale_install.log.txt")
         with open(logfile, 'w') as f:
-            p = subprocess.Popen(exec_cmd, shell=True, stdout=f, stderr=subprocess.STDOUT)
+            p = subprocess.Popen(exec_cmd,
+                                 shell=True,
+                                 stdout=f,
+                                 stderr=subprocess.STDOUT)
         p.wait()
         f.close()
-        
+
         exec_cmd = getattr(module, "energon_locate_cmd")
         logfile = os.path.join(install_logdir, "energon_locate.log.txt")
         with open(logfile, 'w') as f:
-            p = subprocess.Popen(exec_cmd, shell=True, stdout=f, stderr=subprocess.STDOUT)
+            p = subprocess.Popen(exec_cmd,
+                                 shell=True,
+                                 stdout=f,
+                                 stderr=subprocess.STDOUT)
         p.wait()
         f.close()
 
@@ -58,12 +67,16 @@ def install_scale(module, log_dir, debug_mode=False):
         print(energon_locate)
 
         src_dir = os.path.join(energon_locate, "megatron", "energon")
-        dst_dir = os.path.join(getattr(module, "scale_home"), "megatron", "megatron")
+        dst_dir = os.path.join(getattr(module, "scale_home"), "megatron",
+                               "megatron")
         exec_cmd = f"cp -r {src_dir} {dst_dir}/"
-        
+
         logfile = os.path.join(install_logdir, "energon_copy.log.txt")
         with open(logfile, 'w') as f:
-            p = subprocess.Popen(exec_cmd, shell=True, stdout=f, stderr=subprocess.STDOUT)
+            p = subprocess.Popen(exec_cmd,
+                                 shell=True,
+                                 stdout=f,
+                                 stderr=subprocess.STDOUT)
         p.wait()
         f.close()
 
@@ -75,10 +88,12 @@ def replace_yamls(scale_home, config_module, args):
         dist_data = yaml.safe_load(f)
 
     try:
-        dist_data["experiment"]["exp_dir"] = os.path.join(args.log_dir, "outputs_llava1.5")
+        dist_data["experiment"]["exp_dir"] = os.path.join(
+            args.log_dir, "outputs_llava1.5")
         hosts = args.hosts.split(",")
         dist_data["experiment"]["runner"]["nnodes"] = len(hosts)
-        dist_data["experiment"]["runner"]["ssh_port"] = getattr(config_module, "flagscale_ssh_port")
+        dist_data["experiment"]["runner"]["ssh_port"] = getattr(
+            config_module, "flagscale_ssh_port")
         hostfile = os.path.join(scale_home, "hostfile")
         with open(hostfile, 'w') as f:
             for host in hosts:
@@ -88,7 +103,9 @@ def replace_yamls(scale_home, config_module, args):
         dist_data["experiment"]["runner"]["hostfile"] = hostfile
     except Exception as e:
         print(e)
-        print("You're using an illegal config.yaml in flagscale. You must fix it")
+        print(
+            "You're using an illegal config.yaml in flagscale. You must fix it"
+        )
 
     print(dist_data)
 
@@ -99,32 +116,41 @@ def replace_yamls(scale_home, config_module, args):
 
     try:
         train_data["system"]["checkpoint"]["save_interval"] = 1000
-        train_data["system"]["checkpoint"]["pretrained_checkpoint"] = os.path.join(args.data_dir, "LLaVA_megatron", "vicuna_instruct_clip336_tp1_combined_mcore")
+        train_data["system"]["checkpoint"][
+            "pretrained_checkpoint"] = os.path.join(
+                args.data_dir, "LLaVA_megatron",
+                "vicuna_instruct_clip336_tp1_combined_mcore")
 
-        train_data["model"]["train_iters"] = 5000
+        train_data["model"]["train_iters"] = getattr(config_module, "steps")
         train_data["model"].pop("img_embedding_idx", None)
         train_data["data"]["data_path"] = getattr(config_module, "datasetyaml")
-        train_data["data"]["valid_path"] = getattr(config_module, "datasetyaml")
+        train_data["data"]["valid_path"] = getattr(config_module,
+                                                   "datasetyaml")
         train_data["data"]["prompt_path"] = getattr(config_module, "prompt")
-        train_data["data"]["tokenizer"]["tokenizer_model"] = os.path.join(args.data_dir, "vicuna-7b-v1___5/tokenizer.model")
+        train_data["data"]["tokenizer"]["tokenizer_model"] = os.path.join(
+            args.data_dir, "vicuna-7b-v1___5/tokenizer.model")
     except Exception as e:
-        print("You're using an illegal trainllava.yaml in flagscale. You must fix it")
-
+        print(
+            "You're using an illegal trainllava.yaml in flagscale. You must fix it"
+        )
 
     print(train_data)
 
     dataset_yaml = getattr(config_module, "datasetyaml")
-    
+
     with open(dataset_yaml, 'r') as f:
         dataset_data = yaml.safe_load(f)
 
     try:
         llava_train_dir = os.path.join(args.data_dir, "LLaVA-Pretrain/wds")
-        dataset_data["splits"]["train"]["datasets"][0]["path"] = llava_train_dir
+        dataset_data["splits"]["train"]["datasets"][0][
+            "path"] = llava_train_dir
         dataset_data["splits"]["val"]["datasets"][0]["path"] = llava_train_dir
     except Exception as e:
-        print("You're using an illegal dataset.yaml in flagscale. You must fix it")
-    
+        print(
+            "You're using an illegal dataset.yaml in flagscale. You must fix it"
+        )
+
     print(dataset_data)
 
     with open(dist_yaml, 'w') as f:
@@ -143,7 +169,7 @@ def replace_yamls(scale_home, config_module, args):
     host = args.host_addr
     hosts = args.hosts.split(",")
     print(host, hosts)
-    
+
     if host != hosts[0]:
         exit(0)
 
@@ -162,16 +188,21 @@ def replace_yamls(scale_home, config_module, args):
     configyaml = getattr(module, "configyaml")
     configname = os.path.splitext(os.path.basename(configyaml))[0]
     exec_cmd = f"cd {scale_home}; python3 run.py --config-path {scale_conf_dir} --config-name {configname}"
-    
+
     print(exec_cmd)
     with open(os.path.join(args.log_dir, "flagscale_main.log.txt"), 'w') as f:
-        p = subprocess.Popen(exec_cmd, shell=True, stdout=f, stderr=subprocess.STDOUT)
+        p = subprocess.Popen(exec_cmd,
+                             shell=True,
+                             stdout=f,
+                             stderr=subprocess.STDOUT)
         p.wait()
 
     timestamp_log_host = hosts[-1]
     timestamp_log_noderank = len(hosts) - 1
 
-    timestamp_log_file = os.path.join(args.log_dir, "outputs_llava1.5", "logs", "host_" + str(timestamp_log_noderank) + "_" + timestamp_log_host + ".output")
+    timestamp_log_file = os.path.join(
+        args.log_dir, "outputs_llava1.5", "logs", "host_" +
+        str(timestamp_log_noderank) + "_" + timestamp_log_host + ".output")
 
     info_line = []
     while True:
@@ -183,7 +214,7 @@ def replace_yamls(scale_home, config_module, args):
                         info_line.append(line)
         except Exception as e:
             print("Maybe some errors")
-        if len(info_line) == 5000:
+        if len(info_line) == getattr(module, "steps"):
             break
         time.sleep(300)
 
@@ -199,4 +230,3 @@ def replace_yamls(scale_home, config_module, args):
     tps = 2048 * 256 / ave_steptime / args.world_size
     mfu = tps * 7E9 * 6 / getattr(module, "flops")
     print(f"MFU: {mfu}")
-
diff --git a/training/nvidia/llava1.5_7b-flagscale/config/config_H100x4x8.py b/training/nvidia/llava1.5_7b-flagscale/config/config_H100x4x8.py
index 0db97e2da..fa05ff97a 100644
--- a/training/nvidia/llava1.5_7b-flagscale/config/config_H100x4x8.py
+++ b/training/nvidia/llava1.5_7b-flagscale/config/config_H100x4x8.py
@@ -16,6 +16,11 @@
 trainyaml = f"{scale_conf_dir}/train/train_llava1.5_7b.yaml"
 datasetyaml = f"{scale_home}/megatron/examples/multimodal/pretrain_dataset.yaml"
 prompt = f"{scale_home}/megatron/examples/multimodal/manual_prompts.json"
+
+# flagscale runner settings; flops is the divisor used in the MFU calculation
 flagscale_chip_type = "H100"
 flagscale_ssh_port = 60128
 flops = 989E12
+
+# number of training iterations for llava (written to train_iters and used to detect completion)
+steps = 5000
diff --git a/training/run_benchmarks/config/test_conf.py b/training/run_benchmarks/config/test_conf.py
index 485f170cf..5c68f500f 100755
--- a/training/run_benchmarks/config/test_conf.py
+++ b/training/run_benchmarks/config/test_conf.py
@@ -85,14 +85,14 @@
     # "resnet50:pytorch_1.8:A100:1:8:1": "/raid/dataset/ImageNet_1k_2012/",
     # "mask_rcnn:pytorch_1.8:A100:1:8:1": "/raid/dataset/maskrcnn/coco2017",
     # "dlrm:pytorch_1.10:A100:1:8:1": "/raid/dataset/criteo_1TB_click_logs/binary_dataset/",
-    
+
     # "wav2vec2:pytorch_1.13:A100:1:8:1": "/raid/dataset/wav2vec2_data/LibriSpeech",
     # "WaveGlow:pytorch_1.13:A100:1:8:1": "/raid/dataset/LJSpeech/",
     # "resnet50:tensorflow2:A100:1:8:1": "/raid/dataset/ImageNet2012/tf_records/",
     # "moflow:pytorch_1.13:A100:1:8:1": "/raid/dataset/MoFlow/data/",
 
     # "distilbert:pytorch_1.12:A100:1:8:1": "/raid/dataset/distilbert/",
-    
+
     # "transformer:pytorch_1.13:A100:1:8:1": "/raid/dataset/transformer/wmt14_en_de_joined_dict",
     # "swin_transformer:pytorch_1.8:A100:1:8:1": "/raid/dataset/ImageNet_1k_2012/",
     # "transformer_xl:pytorch_1.8:A100:1:8:1": "/raid/dataset/transformer_xl/",
@@ -102,7 +102,7 @@
     # "bert_hf:pytorch_1.13:A100:1:8:1": "/raid/dataset/bert_hf_train",
     # "longformer:pytorch_1.12:A100:1:8:1": "/raid/dataset/longformer_train/",
     # "detr:pytorch_1.13:A100:1:8:1": "/raid/dataset/detr/coco2017/",
-    
+
     # "llama2_7b:deepspeed:A100:1:8:1": "/raid/dataset/llama2_7b_pretrain",
     # "aquila2_7b:flagscale:A100:1:8:1": "/raid/dataset/aquila2_7b_pretrain",
     # "llama2_70B:megatron:H800:4:8:1": "/raid/dataset/llama2_70B_pretrain",
@@ -123,7 +123,7 @@
     # "gpt3_13B:paddle_2.5.1:TP2PP1SH1SP4A10040G:1:8:1":"/raid/dataset/gpt-3/"
     # "gpt3_13B:paddle_2.5.1:TP2PP1SH2SP4A10040G:1:8:1":"/raid/dataset/gpt-3/"
     # "gpt3_13B:paddle_2.5.1:TP2PP4SH1SP1A10040G:1:8:1":"/raid/dataset/gpt-3/"
-    
+
     # "qwen1.5_MoE:megatron_pai:A800:1:8:1":"/raid/datasets/qwen1.5_MoE/"
     # "mixtral_8x7B:megatron_core060:H100:4:8:1": "/raid/datasets/mistral"
 
@@ -200,5 +200,4 @@
     #"gpt3_13B:paddle_2.6.0:TP2PP1SH2SP4C50040G:1:8:1":"/raid/data_set/data-gpt3"
     #"gpt3_13B:paddle_2.6.0:TP1PP1SH2SP8C50080G:1:8:1":"/raid/data_set/data-gpt3"
     # "qwen1.5_MoE:megatron_pai:C500:1:8:1":"/raid/datasets/qwen1.5_MoE/"
-    
 }