diff --git a/training/kunlunxin/docker_image/flagscale_llava/Dockerfile b/training/kunlunxin/docker_image/flagscale_llava/Dockerfile
new file mode 100644
index 000000000..c91f0328b
--- /dev/null
+++ b/training/kunlunxin/docker_image/flagscale_llava/Dockerfile
@@ -0,0 +1,14 @@
+FROM zhiyuan_flagscale_llava:newest
+RUN /bin/bash -c "pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple"
+RUN /bin/bash -c "uname -a"
+RUN /bin/bash -c alias python3=python
+ENV PATH=/root/miniconda/envs/python39_torch201_cuda/bin:$PATH
+
+RUN printf '#!/bin/bash\nservice ssh restart\nexec "$@"\n' > /docker-start.sh
+
+RUN chmod +x /docker-start.sh
+RUN apt-get update && apt-get install -y openssh-server && rm -rf /var/lib/apt/lists/*
+ENTRYPOINT ["sh", "/docker-start.sh"]
+
+# THIS PART WILL BE REPLACED IF YOU PASS SOME OTHER COMMAND TO docker RUN
+CMD while true; do echo "default arg" && sleep 1; done
diff --git a/training/kunlunxin/docker_image/flagscale_llava/flagscale_llava_install.sh b/training/kunlunxin/docker_image/flagscale_llava/flagscale_llava_install.sh
new file mode 100644
index 000000000..e94f50418
--- /dev/null
+++ b/training/kunlunxin/docker_image/flagscale_llava/flagscale_llava_install.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+
+set -x
+
+# cd /opt/xpytorch && bash xpytorch-cp39-torch201-ubuntu2004-x64.run
+
+CUDART_DUMMY_REGISTER=1 python -m torch_xmlir --doctor &> /tmp/xpytorch.version.out
+CUDART_DUMMY_REGISTER=1 python -c "import torch; print(torch.rand(512, 128).cuda())" &> /tmp/xpytorch.test.out
+/etc/init.d/ssh restart
diff --git a/training/kunlunxin/llava1.5_7b-flagscale/README.md b/training/kunlunxin/llava1.5_7b-flagscale/README.md
new file mode 100644
index 000000000..d44c78c53
--- /dev/null
+++ b/training/kunlunxin/llava1.5_7b-flagscale/README.md
@@ -0,0 +1 @@
+此测例为FlagScale相关项目测例
diff --git a/training/kunlunxin/llava1.5_7b-flagscale/config/config_R300px4x8.py b/training/kunlunxin/llava1.5_7b-flagscale/config/config_R300px4x8.py
new file mode 100644
index 000000000..9c05f133c
--- /dev/null
+++ b/training/kunlunxin/llava1.5_7b-flagscale/config/config_R300px4x8.py
@@ -0,0 +1,27 @@
+# scale_parent must under FlagPerf/ or data_dir/, otherwise you cannot mount it into baremetal, therefore cannot use shared storage
+scale_parent = "/share"
+scale_home = f"{scale_parent}/FlagScale"
+
+# this cmd should install scale at . is set by flagperf.training.benchmarks.llava1.5_7b.flagscale.run_pretraining.py
+scale_download_cmd = f"cd {scale_parent}; git clone https://github.com/FlagOpen/FlagScale.git; cd FlagScale; git checkout 085811f"
+
+# NV need nothing because all requirements have been established in base docker image. vendor can do anything related here
+scale_install_cmd = ""
+
+# locate energon. the copy from energon_install_path to flagscale/megatron/ is done by flagperf...run_pretraining.py
+energon_locate_cmd = r"pip show megatron-energon | grep Location | awk -F: '{print $2}' | xargs"
+
+scale_conf_dir = f"{scale_home}/examples/llava/conf"
+configyaml = f"{scale_conf_dir}/config.yaml"
+trainyaml = f"{scale_conf_dir}/train/train_llava1.5_7b.yaml"
+datasetyaml = f"{scale_home}/megatron/examples/multimodal/pretrain_dataset.yaml"
+prompt = f"{scale_home}/megatron/examples/multimodal/manual_prompts.json"
+
+cmds = {"before_start": "source ~/.bashrc"}
+# flagscale's requirements
+flagscale_chip_type = "R300p"
+flagscale_ssh_port = 4323
+flops = 999
+
+# for llava's algorithm
+steps = 30
diff --git a/training/kunlunxin/llava1.5_7b-flagscale/config/requirements.txt b/training/kunlunxin/llava1.5_7b-flagscale/config/requirements.txt
new file mode 100644
index 000000000..4f0d1d961
--- /dev/null
+++ b/training/kunlunxin/llava1.5_7b-flagscale/config/requirements.txt
@@ -0,0 +1 @@
+megatron-energon==2.2.0
diff --git a/training/kunlunxin/llava1.5_7b_continuetrain-flagscale/README.md b/training/kunlunxin/llava1.5_7b_continuetrain-flagscale/README.md
new file mode 100644
index 000000000..d44c78c53
--- /dev/null
+++ b/training/kunlunxin/llava1.5_7b_continuetrain-flagscale/README.md
@@ -0,0 +1 @@
+此测例为FlagScale相关项目测例
diff --git a/training/kunlunxin/llava1.5_7b_continuetrain-flagscale/config/config_R300px4x8.py b/training/kunlunxin/llava1.5_7b_continuetrain-flagscale/config/config_R300px4x8.py
new file mode 100644
index 000000000..be3e3a80a
--- /dev/null
+++ b/training/kunlunxin/llava1.5_7b_continuetrain-flagscale/config/config_R300px4x8.py
@@ -0,0 +1,27 @@
+# scale_parent must under FlagPerf/ or data_dir/, otherwise you cannot mount it into baremetal, therefore cannot use shared storage
+scale_parent = "/share"
+scale_home = f"{scale_parent}/FlagScale"
+
+# this cmd should install scale at . is set by flagperf.training.benchmarks.llava1.5_7b.flagscale.run_pretraining.py
+scale_download_cmd = f"cd {scale_parent}; git clone https://github.com/FlagOpen/FlagScale.git; cd FlagScale; git checkout 085811f"
+
+# NV need nothing because all requirements have been established in base docker image. vendor can do anything related here
+scale_install_cmd = ""
+
+# locate energon. the copy from energon_install_path to flagscale/megatron/ is done by flagperf...run_pretraining.py
+energon_locate_cmd = r"pip show megatron-energon | grep Location | awk -F: '{print $2}' | xargs"
+
+scale_conf_dir = f"{scale_home}/examples/llava/conf"
+configyaml = f"{scale_conf_dir}/config.yaml"
+trainyaml = f"{scale_conf_dir}/train/train_llava1.5_7b.yaml"
+datasetyaml = f"{scale_home}/megatron/examples/multimodal/pretrain_dataset.yaml"
+prompt = f"{scale_home}/megatron/examples/multimodal/manual_prompts.json"
+
+cmds = {"before_start": "source ~/.bashrc"}
+# flagscale's requirements
+flagscale_chip_type = "R300p"
+flagscale_ssh_port = 4323
+flops = 999
+
+# for llava's algorithm
+steps = 5000
diff --git a/training/kunlunxin/llava1.5_7b_continuetrain-flagscale/config/requirements.txt b/training/kunlunxin/llava1.5_7b_continuetrain-flagscale/config/requirements.txt
new file mode 100644
index 000000000..4f0d1d961
--- /dev/null
+++ b/training/kunlunxin/llava1.5_7b_continuetrain-flagscale/config/requirements.txt
@@ -0,0 +1 @@
+megatron-energon==2.2.0
diff --git a/training/run_benchmarks/config/test_conf.py b/training/run_benchmarks/config/test_conf.py
index aa7ebaea3..8c3a81998 100755
--- a/training/run_benchmarks/config/test_conf.py
+++ b/training/run_benchmarks/config/test_conf.py
@@ -133,6 +133,8 @@
     # "mixtral_8x7B:megatron_core060:H100:4:8:1": "/raid/datasets/mistral"
 
     # kunlunxin cases
+    #"llava1.5_7b:flagscale_llava:R300p:4:8:1": "/workspace/data_dir",
+    #"llava1.5_7b_continuetrain:flagscale_llava:R300p:4:8:1": "/workspace/data_dir",
     # "gpt2:pytorch:R300:1:8:1": "/raid/dataset/gpt2",
     # "resnet50:pytorch:R300:1:8:1": "/raid/dataset/ImageNet_1k_2012/",
     # "mask_rcnn:pytorch:R300:1:8:1": "/raid/dataset/coco2017/",