Add docker image for BERT e2e training task (#454)
* Add Python training script, requirements.txt (dependencies), and Dockerfile for the e2e BERT training task

* Add GitHub Action to build the bert-training image on PR

* Add default values for MASTER_ADDR and MASTER_PORT and include them in the Docker env
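
These defaults are intended for single-node testing; a purely illustrative override at container start (the address and image tag below are placeholders, not part of this commit) might look like:

docker run --gpus all -e MASTER_ADDR=10.0.0.1 -e MASTER_PORT=12355 -e NUM_GPUS_PER_NODE=8 bert-training:latest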
mattcjo authored Aug 1, 2024
1 parent 0d54419 commit b133519
Showing 4 changed files with 277 additions and 0 deletions.
5 changes: 5 additions & 0 deletions .github/workflows/ci.yaml
@@ -31,3 +31,8 @@ jobs:
steps:
- uses: actions/checkout@v3
- run: docker build --file e2e2/test/images/bert-inference/Dockerfile e2e2/test/images/bert-inference
build-bert-training:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- run: docker build --file e2e2/test/images/bert-training/Dockerfile e2e2/test/images/bert-training
131 changes: 131 additions & 0 deletions e2e2/test/images/bert-training/Dockerfile
@@ -0,0 +1,131 @@
# Use the NVIDIA CUDA runtime as a parent image
FROM nvidia/cuda:12.5.0-devel-ubuntu22.04

# Set environment variable to disable interactive prompts
ENV DEBIAN_FRONTEND=noninteractive

# Set default values for MASTER_ADDR and MASTER_PORT
ENV MASTER_ADDR=127.0.0.1
ENV MASTER_PORT=12355

# Set default number of GPUs per node
ENV NUM_GPUS_PER_NODE=8
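# Note: these defaults (localhost master, 8 GPUs per node) suit single-node smoke tests and are
# expected to be overridden at runtime (e.g. docker run -e flags or pod environment) for multi-node runs.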

# Python dependency version numbers
ARG PYTHON=python3.10
ARG PYTHON_VERSION=3.10.12
ARG PIP=pip3

RUN apt-get update \
&& apt-get upgrade -y \
&& apt-get install -y --no-install-recommends \
build-essential \
ca-certificates \
cmake \
curl \
emacs \
git \
jq \
libopencv-dev \
software-properties-common \
wget \
unzip \
vim \
pkg-config \
gdb \
lcov \
libbz2-dev \
zlib1g-dev \
openssl \
libssl-dev \
libsqlite3-dev \
libgdbm-dev \
libc6-dev \
libncurses-dev \
tk-dev \
libffi-dev \
libcap-dev \
gnupg2 \
gpg-agent \
&& rm -rf /var/lib/apt/lists/* \
&& apt-get clean

# Install Python
RUN wget -q https://www.python.org/ftp/python/$PYTHON_VERSION/Python-$PYTHON_VERSION.tgz \
&& tar -xzf Python-$PYTHON_VERSION.tgz \
&& cd Python-$PYTHON_VERSION \
&& ./configure --enable-shared --prefix=/usr/local \
&& make -j $(nproc) && make install \
&& cd .. && rm -rf ../Python-$PYTHON_VERSION* \
&& ln -s /usr/local/bin/pip3 /usr/bin/pip \
&& ln -s /usr/local/bin/$PYTHON /usr/local/bin/python \
&& ${PIP} --no-cache-dir install --upgrade \
pip \
setuptools

# Set the working directory in the container
WORKDIR /app

# Copy only the necessary files into the container at /app
COPY train.py /app/
COPY requirements.txt /app/

# Install any needed packages specified in requirements.txt
RUN python -m pip install --upgrade pip && \
pip install --no-cache-dir -r requirements.txt

ARG EFA_INSTALLER_VERSION=latest
# 1.7.4+ is required so that EFA functions correctly with OFI_NCCL_DISABLE_GDR_REQUIRED_CHECK=0
ARG AWS_OFI_NCCL_VERSION=1.9.1
ARG NCCL_TESTS_VERSION=master

# Install necessary dependencies and remove old ones
RUN apt-get update -y && \
apt-get remove -y --allow-change-held-packages \
libmlx5-1 ibverbs-utils libibverbs-dev libibverbs1 libnccl2 libnccl-dev && \
rm -rf /opt/hpcx /usr/local/mpi /usr/local/ucx /etc/ld.so.conf.d/hpcx.conf && \
ldconfig && \
DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \
sudo git gcc vim kmod openssh-client openssh-server build-essential \
wget curl autoconf libtool gdb automake python3-distutils cmake \
apt-utils devscripts debhelper libsubunit-dev check pkg-config libhwloc-dev

# SSH configuration
RUN mkdir -p /var/run/sshd && \
sed -i 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' /etc/ssh/ssh_config && \
echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config && \
sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config

# Set environment variables for OpenMPI and CUDA
ENV LD_LIBRARY_PATH=/opt/amazon/openmpi/lib64:/opt/amazon/openmpi/lib:/opt/amazon/efa/lib64:/opt/aws-ofi-nccl/install/lib:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64:/usr/local/lib/:/usr/lib64:/usr/lib/x86_64-linux-gnu/:$LD_LIBRARY_PATH
ENV PATH=/usr/local/cuda/bin:/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:/usr/sbin:/usr/bin:/usr/local/bin:$PATH

# Install EFA
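# The installer flags below skip host-level changes that do not apply inside a container image:
# --skip-kmod (no kernel module build), --skip-limit-conf (no limits.conf changes), and
# --no-verify (skip the post-install verification step).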
RUN cd $HOME \
&& curl -O https://efa-installer.amazonaws.com/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz \
&& tar -xf $HOME/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz \
&& cd aws-efa-installer \
&& ./efa_installer.sh -y -g -d --skip-kmod --skip-limit-conf --no-verify \
&& rm -rf $HOME/aws-efa-installer

# Install NCCL
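# NVIDIA rotated its repository signing keys; drop the old key (7fa2af80) and install the current
# cuda-keyring package before pinning a specific NCCL version.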
RUN apt-key del 7fa2af80 && \
curl -L -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/cuda-keyring_1.0-1_all.deb && \
dpkg -i cuda-keyring_1.0-1_all.deb && \
apt-get update && apt-get install -y libnccl2=2.18.5-1+cuda12.2 libnccl-dev=2.18.5-1+cuda12.2

# Install AWS-OFI-NCCL plugin
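# aws-ofi-nccl implements NCCL's network plugin API on top of libfabric, allowing NCCL traffic
# to run over EFA.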
RUN export OPAL_PREFIX="" && \
git clone https://github.com/aws/aws-ofi-nccl.git /opt/aws-ofi-nccl && \
cd /opt/aws-ofi-nccl && \
git checkout v${AWS_OFI_NCCL_VERSION}-aws && \
./autogen.sh && \
./configure --prefix=/opt/aws-ofi-nccl/install --with-libfabric=/opt/amazon/efa/ --with-cuda=/usr/local/cuda --with-mpi=/opt/amazon/openmpi/ && \
make && make install

# Set environment variables for NCCL and clean up
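# NCCL_PROTO=simple forces NCCL's simple protocol, a setting commonly used with EFA and the
# aws-ofi-nccl plugin.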
ENV NCCL_PROTO=simple
RUN rm -rf /var/lib/apt/lists/*
# Ensure NCCL library is found first
ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
3 changes: 3 additions & 0 deletions e2e2/test/images/bert-training/requirements.txt
@@ -0,0 +1,3 @@
torch==2.3
transformers==4.29
numpy==1.23
138 changes: 138 additions & 0 deletions e2e2/test/images/bert-training/train.py
@@ -0,0 +1,138 @@
import os
import time
import torch
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP
from transformers import BertForPreTraining, BertTokenizer
from torch.utils.data import DataLoader, TensorDataset
import numpy as np


def create_dummy_data(tokenizer, num_samples=100, max_length=128):
# Create dummy input data
sentences = [
"This is a dummy sentence number {}".format(i) for i in range(num_samples)
]
tokenized_inputs = tokenizer(
sentences,
max_length=max_length,
padding="max_length",
truncation=True,
return_tensors="pt",
)
labels = tokenized_inputs.input_ids.detach().clone()

# MLM task: randomly mask some tokens
mlm_probability = 0.15
input_ids, labels = mask_tokens(
tokenized_inputs.input_ids, tokenizer, mlm_probability
)

# NSP task: create dummy pairs
next_sentence_labels = torch.randint(0, 2, (num_samples,))

return TensorDataset(
input_ids, tokenized_inputs.attention_mask, labels, next_sentence_labels
)


def mask_tokens(inputs, tokenizer, mlm_probability):
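    # BERT-style MLM masking, simplified: tokens are selected with probability mlm_probability and
    # always replaced by [MASK] (no 80/10/10 split as in the original BERT recipe).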
labels = inputs.clone()
probability_matrix = torch.full(labels.shape, mlm_probability)
special_tokens_mask = [
tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True)
for val in labels.tolist()
]
probability_matrix.masked_fill_(
torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0
)
masked_indices = torch.bernoulli(probability_matrix).bool()
labels[~masked_indices] = -100 # We only compute loss on masked tokens

inputs[masked_indices] = tokenizer.convert_tokens_to_ids(tokenizer.mask_token)

return inputs, labels


def setup(rank, world_size, local_rank):
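    # MASTER_ADDR / MASTER_PORT come from the environment (defaults are baked into the image's ENV)
    # and define the TCP rendezvous endpoint for the NCCL process group.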
master_addr = os.environ["MASTER_ADDR"]
master_port = os.environ["MASTER_PORT"]
dist.init_process_group(
"nccl",
init_method=f"tcp://{master_addr}:{master_port}",
rank=rank,
world_size=world_size,
)
torch.cuda.set_device(local_rank)
print(f"Process {rank} initialized, using GPU {local_rank}")


def cleanup():
dist.destroy_process_group()


def train_bert(rank, world_size, local_rank, model, tokenizer):
setup(rank, world_size, local_rank)

model = model.to(local_rank)
ddp_model = DDP(model, device_ids=[local_rank])

dataset = create_dummy_data(tokenizer)
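    # No DistributedSampler is used here: every rank iterates the same synthetic batches, which is
    # acceptable for a short end-to-end smoke test.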
train_dataloader = DataLoader(dataset, batch_size=8)

optimizer = torch.optim.AdamW(ddp_model.parameters(), lr=0.001)
    # Note: BertForPreTraining returns a combined MLM + NSP loss from its forward pass, so no
    # separate criterion is needed here.

start_time = time.time()

for epoch in range(1): # Short run for testing
ddp_model.train()
for batch in train_dataloader:
optimizer.zero_grad()
inputs, masks, labels, next_sentence_labels = batch
inputs, masks, labels, next_sentence_labels = (
inputs.to(local_rank),
masks.to(local_rank),
labels.to(local_rank),
next_sentence_labels.to(local_rank),
)
outputs = ddp_model(
input_ids=inputs,
attention_mask=masks,
labels=labels,
next_sentence_label=next_sentence_labels,
)
loss = outputs.loss
loss.backward()
optimizer.step()

end_time = time.time()
training_time = end_time - start_time
throughput = len(dataset) / training_time

print(f"Process {rank} - Training time: {training_time:.2f} seconds")
print(f"Process {rank} - Throughput: {throughput:.2f} samples/second")

cleanup()


def main():
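    # OMPI_COMM_WORLD_RANK / OMPI_COMM_WORLD_SIZE are exported by Open MPI's mpirun, so this
    # script is expected to be launched as one process per GPU under mpirun.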
rank = int(os.environ["OMPI_COMM_WORLD_RANK"])
world_size = int(os.environ["OMPI_COMM_WORLD_SIZE"])

num_gpus_per_node = int(os.environ["NUM_GPUS_PER_NODE"])
local_rank = rank % num_gpus_per_node

print(f"Process started for rank {rank} with local rank {local_rank}")

# Pre-download model and tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForPreTraining.from_pretrained("bert-base-uncased")

print(f"successfully downloaded model and tokenizer for rank: {rank}")

train_bert(rank, world_size, local_rank, model, tokenizer)


if __name__ == "__main__":
main()
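
For reference, a hypothetical single-node launch from inside the container (process count and flags shown for illustration only, not part of this commit):

mpirun --allow-run-as-root -np 8 python /app/train.py

MASTER_ADDR, MASTER_PORT, and NUM_GPUS_PER_NODE are taken from the image's environment defaults or can be overridden per cluster.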
