[KUNLUNXIN] case config: llama3-8b, chatglm3-6b, interconnect (#757)
* [KUNLUNXIN] case config: llama3-8b, chatglm3-6b, interconnect-MPI_interserver

* [KUNLUNXIN] minor fix for llama3-8b, remove cpu-initialization

---------

Co-authored-by: w4yne <[email protected]>
w4yne authored Oct 6, 2024
1 parent 189d15b commit ea906ae
Showing 11 changed files with 356 additions and 299 deletions.
@@ -14,8 +14,7 @@ TOOL=all_reduce
 LOG=_${TOOL}.log.${RANDOM}.$$
 PERF=/opt/xccl/perf/${TOOL}
 
-# FIXME: hard code hostname, need graceful impl.
-if [[ w"$HOSTNAME" != w"p-perf-kunlun-01" ]]; then
+if [[ w"$NODERANK" != w"0" ]]; then
 echo "launch mpirun only on first node, exiting.\n"
 exit
 fi
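
This shell hunk replaces a hard-coded hostname test with a NODERANK test, so mpirun is launched from exactly one node regardless of how the cluster machines are named. A minimal sketch of the same gate in Python, assuming NODERANK is exported by the FlagPerf launcher on every node:

import os
import sys

# Only the node with NODERANK == 0 proceeds to launch mpirun; every
# other node exits immediately, mirroring the shell test above.
if os.environ.get("NODERANK", "0") != "0":
    print("launch mpirun only on first node, exiting.")
    sys.exit(0)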
7 changes: 7 additions & 0 deletions training/benchmarks/chatglm3_6b/deepspeed/run_pretraining.py
@@ -84,6 +84,11 @@ def get_deepspeed_engine(args, model_config_dir, flashattn):
         model.gradient_checkpointing_enable()
     model_engine, _, _, _ = deepspeed.initialize(
         args=args, model=model, model_parameters=model.parameters())
+
+    if args.load_ckpt_dir is not None:
+        print(f"Loading ckpt from dir: {args.data_dir}/{args.load_ckpt_dir}")
+        model_engine.load_checkpoint(os.path.join(args.data_dir, args.load_ckpt_dir))
+
     return model_engine
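
The added lines restore weights into the engine before training whenever the case config supplies load_ckpt_dir. A hedged sketch of what that call does; the helper below is illustrative, not part of the repo:

import os

def load_pretrain_ckpt(model_engine, data_dir, load_ckpt_dir):
    # Resolve the directory the same way the hunk above does.
    ckpt_dir = os.path.join(data_dir, load_ckpt_dir)
    # DeepSpeed's load_checkpoint returns (load_path, client_state);
    # load_path is None when nothing loadable is found under ckpt_dir.
    load_path, client_state = model_engine.load_checkpoint(ckpt_dir)
    if load_path is None:
        print(f"no checkpoint found under {ckpt_dir}, training from scratch")
    return load_path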


@@ -113,6 +118,8 @@ def get_metric(texts):
     flashattn = getattr(module, 'flashattn')
     gradient_checkpointing_enable = getattr(module, 'gradient_checkpointing_enable', False)
     args.gradient_checkpointing_enable = gradient_checkpointing_enable
+    load_ckpt_dir = getattr(module, 'load_ckpt_dir', None)
+    args.load_ckpt_dir = load_ckpt_dir
 
     deepspeed.init_distributed()
     model_engine = get_deepspeed_engine(args, os.path.join("chatglm3_6b_hf"),
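
The case config is imported as a plain Python module, and optional keys are read with getattr plus a default, so older configs without load_ckpt_dir keep working. A minimal sketch of that pattern, with a hypothetical loader function:

import importlib.util

def read_case_config(path):
    # Execute the config file as a module; its top-level assignments
    # (seqlength, theoryflops, load_ckpt_dir, ...) become attributes.
    spec = importlib.util.spec_from_file_location("case_config", path)
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return {
        "flashattn": getattr(module, "flashattn"),
        "gradient_checkpointing_enable": getattr(module, "gradient_checkpointing_enable", False),
        "load_ckpt_dir": getattr(module, "load_ckpt_dir", None),  # optional, new in this commit
    }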
@@ -1,7 +1,8 @@
 seqlength = 4096
 batchsize = 8
 datafilename = "openwebtext_chatglm3_100M.npy"
-theoryflops = 128000000000000.0
+theoryflops = 999000000000000.0
 epochs = 1
 flashattn = True
-gradient_checkpointing_enable = True
+gradient_checkpointing_enable = True
+load_ckpt_dir = "step0-ckpt"
9 changes: 9 additions & 0 deletions training/kunlunxin/docker_image/megatron_core060/Dockerfile
@@ -0,0 +1,9 @@
+FROM klx-flagperf-mcore060:latest
+RUN /bin/bash -c "pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple"
+RUN /bin/bash -c "uname -a"
+RUN /bin/bash -c alias python3=python
+ENV PATH /root/miniconda/envs/python38_torch201_cuda/bin:$PATH
+
+RUN pip3 install loguru
+RUN pip3 install schedule
+RUN pip3 install munch
@@ -0,0 +1,11 @@
+#!/bin/bash
+
+set -x
+
+pip install schedule
+
+# xpytorch install
+cd /opt/xpytorch && bash xpytorch-cp38-torch201-ubuntu2004-x64.run
+CUDART_DUMMY_REGISTER=1 python -m torch_xmlir --doctor &> /tmp/xpytorch.version.out
+CUDART_DUMMY_REGISTER=1 python -c "import torch; print(torch.rand(512, 128).cuda())" &> /tmp/xpytorch.test.out
+
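The new install script ends with two smoke tests run under CUDART_DUMMY_REGISTER=1. A hedged Python sketch of the same check driven via subprocess; the command and environment variable come from the script above, everything else is illustrative:

import os
import subprocess

# Re-run the tiny tensor op from the script with the
# CUDART_DUMMY_REGISTER=1 environment the XPU runtime expects.
env = dict(os.environ, CUDART_DUMMY_REGISTER="1")
result = subprocess.run(
    ["python", "-c", "import torch; print(torch.rand(512, 128).cuda())"],
    env=env, capture_output=True, text=True,
)
print(result.stdout or result.stderr)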
