Add docker image for BERT e2e training task (#454)
* Add python training script, requirements.txt (dependencies), and Dockerfile for the e2e BERT training task
* Add GitHub Action to build the bert-testing image on PR
* Add default values for MASTER_ADDR and MASTER_PORT and include them in the Docker environment
Showing 4 changed files with 277 additions and 0 deletions.
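The MASTER_ADDR and MASTER_PORT defaults baked into the image are what the training script's process-group setup relies on. The snippet below is a minimal illustration (not part of the commit) of how those environment variables drive the NCCL rendezvous; the fallback values mirror the ENV lines in the Dockerfile, and the OMPI_COMM_WORLD_* variables are assumed to be injected by the MPI launcher, as in train.py.

import os

import torch.distributed as dist

# Fallbacks mirror the ENV defaults in the Dockerfile below; override them at
# container run time (e.g. MASTER_ADDR=<head-node-address>) for multi-node jobs.
master_addr = os.environ.get("MASTER_ADDR", "127.0.0.1")
master_port = os.environ.get("MASTER_PORT", "12355")

# Rank and world size are taken from the MPI launcher, as in train.py below.
rank = int(os.environ["OMPI_COMM_WORLD_RANK"])
world_size = int(os.environ["OMPI_COMM_WORLD_SIZE"])

dist.init_process_group(
    "nccl",
    init_method=f"tcp://{master_addr}:{master_port}",
    rank=rank,
    world_size=world_size,
)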
Dockerfile
@@ -0,0 +1,131 @@
# Use the NVIDIA CUDA devel image as the parent image
FROM nvidia/cuda:12.5.0-devel-ubuntu22.04

# Set environment variable to disable interactive prompts
ENV DEBIAN_FRONTEND=noninteractive

# Set default values for MASTER_ADDR and MASTER_PORT
ENV MASTER_ADDR=127.0.0.1
ENV MASTER_PORT=12355

# Set default number of GPUs per node
ENV NUM_GPUS_PER_NODE=8

# Python dependency version numbers
ARG PYTHON=python3.10
ARG PYTHON_VERSION=3.10.12
ARG PIP=pip3

RUN apt-get update \
    && apt-get upgrade -y \
    && apt-get install -y --no-install-recommends \
    build-essential \
    ca-certificates \
    cmake \
    curl \
    emacs \
    git \
    jq \
    libopencv-dev \
    software-properties-common \
    wget \
    unzip \
    vim \
    pkg-config \
    gdb \
    lcov \
    libbz2-dev \
    zlib1g-dev \
    openssl \
    libssl-dev \
    libsqlite3-dev \
    libgdbm-dev \
    libc6-dev \
    libbz2-dev \
    libncurses-dev \
    tk-dev \
    libffi-dev \
    libcap-dev \
    gnupg2 \
    gpg-agent \
    && rm -rf /var/lib/apt/lists/* \
    && apt-get clean

# Install Python
RUN wget -q https://www.python.org/ftp/python/$PYTHON_VERSION/Python-$PYTHON_VERSION.tgz \
    && tar -xzf Python-$PYTHON_VERSION.tgz \
    && cd Python-$PYTHON_VERSION \
    && ./configure --enable-shared --prefix=/usr/local \
    && make -j $(nproc) && make install \
    && cd .. && rm -rf ../Python-$PYTHON_VERSION* \
    && ln -s /usr/local/bin/pip3 /usr/bin/pip \
    && ln -s /usr/local/bin/$PYTHON /usr/local/bin/python \
    && ${PIP} --no-cache-dir install --upgrade \
    pip \
    setuptools

# Set the working directory in the container
WORKDIR /app

# Copy only the necessary files into the container at /app
COPY train.py /app/
COPY requirements.txt /app/

# Install any needed packages specified in requirements.txt
RUN python -m pip install --upgrade pip && \
    pip install --no-cache-dir -r requirements.txt

ARG EFA_INSTALLER_VERSION=latest
# 1.7.4+ is required to enforce proper EFA function with OFI_NCCL_DISABLE_GDR_REQUIRED_CHECK=0
ARG AWS_OFI_NCCL_VERSION=1.9.1
ARG NCCL_TESTS_VERSION=master

# Install necessary dependencies and remove old ones
RUN apt-get update -y && \
    apt-get remove -y --allow-change-held-packages \
    libmlx5-1 ibverbs-utils libibverbs-dev libibverbs1 libnccl2 libnccl-dev && \
    rm -rf /opt/hpcx /usr/local/mpi /usr/local/ucx /etc/ld.so.conf.d/hpcx.conf && \
    ldconfig && \
    DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \
    sudo git gcc vim kmod openssh-client openssh-server build-essential \
    wget curl autoconf libtool gdb automake python3-distutils cmake \
    apt-utils devscripts debhelper libsubunit-dev check pkg-config libhwloc-dev

# SSH configuration
RUN mkdir -p /var/run/sshd && \
    sed -i 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' /etc/ssh/ssh_config && \
    echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config && \
    sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config

# Set environment variables for OpenMPI and CUDA
ENV LD_LIBRARY_PATH /opt/amazon/openmpi/lib64:/opt/amazon/openmpi/lib:/opt/amazon/efa/lib64:/opt/aws-ofi-nccl/install/lib:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64:/usr/local/lib/:/usr/lib64:/usr/lib/x86_64-linux-gnu/:$LD_LIBRARY_PATH
ENV PATH /usr/local/cuda/bin:/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:/usr/sbin:/usr/bin:/usr/local/bin:$PATH

# Install EFA
RUN cd $HOME \
    && curl -O https://efa-installer.amazonaws.com/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz \
    && tar -xf $HOME/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz \
    && cd aws-efa-installer \
    && ./efa_installer.sh -y -g -d --skip-kmod --skip-limit-conf --no-verify \
    && rm -rf $HOME/aws-efa-installer

# Install NCCL
RUN apt-key del 7fa2af80 && \
    curl -L -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/cuda-keyring_1.0-1_all.deb && \
    dpkg -i cuda-keyring_1.0-1_all.deb && \
    sudo apt install -y libnccl2=2.18.5-1+cuda12.2 libnccl-dev=2.18.5-1+cuda12.2

# Install AWS-OFI-NCCL plugin
RUN export OPAL_PREFIX="" && \
    git clone https://github.com/aws/aws-ofi-nccl.git /opt/aws-ofi-nccl && \
    cd /opt/aws-ofi-nccl && \
    git checkout v${AWS_OFI_NCCL_VERSION}-aws && \
    ./autogen.sh && \
    ./configure --prefix=/opt/aws-ofi-nccl/install --with-libfabric=/opt/amazon/efa/ --with-cuda=/usr/local/cuda --with-mpi=/opt/amazon/openmpi/ && \
    make && make install

# Set environment variables for NCCL and clean up
ENV NCCL_PROTO simple
RUN rm -rf /var/lib/apt/lists/*
# Ensure NCCL library is found first
ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
requirements.txt
@@ -0,0 +1,3 @@
torch==2.3
transformers==4.29
numpy==1.23
train.py
@@ -0,0 +1,138 @@
import os
import time
import torch
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP
from transformers import BertForPreTraining, BertTokenizer
from torch.utils.data import DataLoader, TensorDataset
import numpy as np


def create_dummy_data(tokenizer, num_samples=100, max_length=128):
    # Create dummy input data
    sentences = [
        "This is a dummy sentence number {}".format(i) for i in range(num_samples)
    ]
    tokenized_inputs = tokenizer(
        sentences,
        max_length=max_length,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )
    labels = tokenized_inputs.input_ids.detach().clone()

    # MLM task: randomly mask some tokens
    mlm_probability = 0.15
    input_ids, labels = mask_tokens(
        tokenized_inputs.input_ids, tokenizer, mlm_probability
    )

    # NSP task: create dummy pairs
    next_sentence_labels = torch.randint(0, 2, (num_samples,))

    return TensorDataset(
        input_ids, tokenized_inputs.attention_mask, labels, next_sentence_labels
    )


def mask_tokens(inputs, tokenizer, mlm_probability):
    labels = inputs.clone()
    probability_matrix = torch.full(labels.shape, mlm_probability)
    special_tokens_mask = [
        tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True)
        for val in labels.tolist()
    ]
    probability_matrix.masked_fill_(
        torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0
    )
    masked_indices = torch.bernoulli(probability_matrix).bool()
    labels[~masked_indices] = -100  # We only compute loss on masked tokens

    inputs[masked_indices] = tokenizer.convert_tokens_to_ids(tokenizer.mask_token)

    return inputs, labels


def setup(rank, world_size, local_rank):
    master_addr = os.environ["MASTER_ADDR"]
    master_port = os.environ["MASTER_PORT"]
    dist.init_process_group(
        "nccl",
        init_method=f"tcp://{master_addr}:{master_port}",
        rank=rank,
        world_size=world_size,
    )
    torch.cuda.set_device(local_rank)
    print(f"Process {rank} initialized, using GPU {local_rank}")


def cleanup():
    dist.destroy_process_group()


def train_bert(rank, world_size, local_rank, model, tokenizer):
    setup(rank, world_size, local_rank)

    model = model.to(local_rank)
    ddp_model = DDP(model, device_ids=[local_rank])

    dataset = create_dummy_data(tokenizer)
    train_dataloader = DataLoader(dataset, batch_size=8)

    optimizer = torch.optim.AdamW(ddp_model.parameters(), lr=0.001)
    criterion = torch.nn.CrossEntropyLoss()

    start_time = time.time()

    for epoch in range(1):  # Short run for testing
        ddp_model.train()
        for batch in train_dataloader:
            optimizer.zero_grad()
            inputs, masks, labels, next_sentence_labels = batch
            inputs, masks, labels, next_sentence_labels = (
                inputs.to(local_rank),
                masks.to(local_rank),
                labels.to(local_rank),
                next_sentence_labels.to(local_rank),
            )
            outputs = ddp_model(
                input_ids=inputs,
                attention_mask=masks,
                labels=labels,
                next_sentence_label=next_sentence_labels,
            )
            loss = outputs.loss
            loss.backward()
            optimizer.step()

    end_time = time.time()
    training_time = end_time - start_time
    throughput = len(dataset) / training_time

    print(f"Process {rank} - Training time: {training_time:.2f} seconds")
    print(f"Process {rank} - Throughput: {throughput:.2f} samples/second")

    cleanup()


def main():
    rank = int(os.environ["OMPI_COMM_WORLD_RANK"])
    world_size = int(os.environ["OMPI_COMM_WORLD_SIZE"])

    num_gpus_per_node = int(os.environ["NUM_GPUS_PER_NODE"])
    local_rank = rank % num_gpus_per_node

    print(f"Process started for rank {rank} with local rank {local_rank}")

    # Pre-download model and tokenizer
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    model = BertForPreTraining.from_pretrained("bert-base-uncased")

    print(f"successfully downloaded model and tokenizer for rank: {rank}")

    train_bert(rank, world_size, local_rank, model, tokenizer)


if __name__ == "__main__":
    main()
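train.py assumes the OpenMPI rank variables are present, so outside of an mpirun launch a quick single-process smoke test can set them by hand before calling main(). This is a hypothetical usage sketch, not part of the commit; the values below are assumptions for a one-GPU run.

import os

# OMPI_COMM_WORLD_* are normally injected by the MPI launcher; NUM_GPUS_PER_NODE,
# MASTER_ADDR, and MASTER_PORT normally come from the image ENV defaults.
os.environ.setdefault("OMPI_COMM_WORLD_RANK", "0")
os.environ.setdefault("OMPI_COMM_WORLD_SIZE", "1")
os.environ.setdefault("NUM_GPUS_PER_NODE", "1")
os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
os.environ.setdefault("MASTER_PORT", "12355")

import train  # the script above, copied to /app in the image

train.main()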