diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index d25612c07..b79a575b0 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -31,3 +31,8 @@ jobs:
     steps:
       - uses: actions/checkout@v3
       - run: docker build --file e2e2/test/images/bert-inference/Dockerfile e2e2/test/images/bert-inference
+  build-bert-training:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+      - run: docker build --file e2e2/test/images/bert-training/Dockerfile e2e2/test/images/bert-training
diff --git a/e2e2/test/images/bert-training/Dockerfile b/e2e2/test/images/bert-training/Dockerfile
new file mode 100644
index 000000000..6990d0778
--- /dev/null
+++ b/e2e2/test/images/bert-training/Dockerfile
@@ -0,0 +1,131 @@
+# Use the NVIDIA CUDA devel image as the parent image
+FROM nvidia/cuda:12.5.0-devel-ubuntu22.04
+
+# Set environment variable to disable interactive prompts
+ENV DEBIAN_FRONTEND=noninteractive
+
+# Set default values for MASTER_ADDR and MASTER_PORT
+ENV MASTER_ADDR=127.0.0.1
+ENV MASTER_PORT=12355
+
+# Set default number of GPUs per node
+ENV NUM_GPUS_PER_NODE=8
+
+# Python dependency version numbers
+ARG PYTHON=python3.10
+ARG PYTHON_VERSION=3.10.12
+ARG PIP=pip3
+
+RUN apt-get update \
+    && apt-get upgrade -y \
+    && apt-get install -y --no-install-recommends \
+    build-essential \
+    ca-certificates \
+    cmake \
+    curl \
+    emacs \
+    git \
+    jq \
+    libopencv-dev \
+    software-properties-common \
+    wget \
+    unzip \
+    vim \
+    pkg-config \
+    gdb \
+    lcov \
+    libbz2-dev \
+    zlib1g-dev \
+    openssl \
+    libssl-dev \
+    libsqlite3-dev \
+    libgdbm-dev \
+    libc6-dev \
+    libncurses-dev \
+    tk-dev \
+    libffi-dev \
+    libcap-dev \
+    gnupg2 \
+    gpg-agent \
+    && rm -rf /var/lib/apt/lists/* \
+    && apt-get clean
+
+# Install Python
+RUN wget -q https://www.python.org/ftp/python/$PYTHON_VERSION/Python-$PYTHON_VERSION.tgz \
+    && tar -xzf Python-$PYTHON_VERSION.tgz \
+    && cd Python-$PYTHON_VERSION \
+    && ./configure --enable-shared --prefix=/usr/local \
+    && make -j $(nproc) && make install \
+    && cd .. \
+    && rm -rf ../Python-$PYTHON_VERSION* \
+    && ln -s /usr/local/bin/pip3 /usr/bin/pip \
+    && ln -s /usr/local/bin/$PYTHON /usr/local/bin/python \
+    && ${PIP} --no-cache-dir install --upgrade \
+    pip \
+    setuptools
+
+# Set the working directory in the container
+WORKDIR /app
+
+# Copy only the necessary files into the container at /app
+COPY train.py /app/
+COPY requirements.txt /app/
+
+# Install any needed packages specified in requirements.txt
+RUN python -m pip install --upgrade pip && \
+    pip install --no-cache-dir -r requirements.txt
+
+ARG EFA_INSTALLER_VERSION=latest
+# 1.7.4+ is required to enforce proper EFA function with OFI_NCCL_DISABLE_GDR_REQUIRED_CHECK=0
+ARG AWS_OFI_NCCL_VERSION=1.9.1
+ARG NCCL_TESTS_VERSION=master
+
+# Install necessary dependencies and remove old ones
+RUN apt-get update -y && \
+    apt-get remove -y --allow-change-held-packages \
+    libmlx5-1 ibverbs-utils libibverbs-dev libibverbs1 libnccl2 libnccl-dev && \
+    rm -rf /opt/hpcx /usr/local/mpi /usr/local/ucx /etc/ld.so.conf.d/hpcx.conf && \
+    ldconfig && \
+    DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \
+    sudo git gcc vim kmod openssh-client openssh-server build-essential \
+    wget curl autoconf libtool gdb automake python3-distutils cmake \
+    apt-utils devscripts debhelper libsubunit-dev check pkg-config libhwloc-dev
+
+# SSH configuration
+RUN mkdir -p /var/run/sshd && \
+    sed -i 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' /etc/ssh/ssh_config && \
+    echo "    UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config && \
+    sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config
+
+# Set environment variables for OpenMPI and CUDA
+ENV LD_LIBRARY_PATH /opt/amazon/openmpi/lib64:/opt/amazon/openmpi/lib:/opt/amazon/efa/lib64:/opt/aws-ofi-nccl/install/lib:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64:/usr/local/lib/:/usr/lib64:/usr/lib/x86_64-linux-gnu/:$LD_LIBRARY_PATH
+ENV PATH /usr/local/cuda/bin:/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:/usr/sbin:/usr/bin:/usr/local/bin:$PATH
+
+# Install EFA
+RUN cd $HOME \
+    && curl -O https://efa-installer.amazonaws.com/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz \
+    && tar -xf $HOME/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz \
+    && cd aws-efa-installer \
+    && ./efa_installer.sh -y -g -d --skip-kmod --skip-limit-conf --no-verify \
+    && rm -rf $HOME/aws-efa-installer
+
+# Install NCCL
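+# (apt-key del removes NVIDIA's rotated repository signing key, 7fa2af80; the cuda-keyring package installs its current replacement)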
+RUN apt-key del 7fa2af80 && \
+    curl -L -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/cuda-keyring_1.0-1_all.deb && \
+    dpkg -i cuda-keyring_1.0-1_all.deb && \
+    sudo apt install -y libnccl2=2.18.5-1+cuda12.2 libnccl-dev=2.18.5-1+cuda12.2
+
+# Install AWS-OFI-NCCL plugin
+RUN export OPAL_PREFIX="" && \
+    git clone https://github.com/aws/aws-ofi-nccl.git /opt/aws-ofi-nccl && \
+    cd /opt/aws-ofi-nccl && \
+    git checkout v${AWS_OFI_NCCL_VERSION}-aws && \
+    ./autogen.sh && \
+    ./configure --prefix=/opt/aws-ofi-nccl/install --with-libfabric=/opt/amazon/efa/ --with-cuda=/usr/local/cuda --with-mpi=/opt/amazon/openmpi/ && \
+    make && make install
+
+# Set environment variables for NCCL and clean up
+ENV NCCL_PROTO simple
+RUN rm -rf /var/lib/apt/lists/*
+# Ensure NCCL library is found first
+ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
diff --git a/e2e2/test/images/bert-training/requirements.txt b/e2e2/test/images/bert-training/requirements.txt
new file mode 100644
index 000000000..a9831ed72
--- /dev/null
+++ b/e2e2/test/images/bert-training/requirements.txt
@@ -0,0 +1,3 @@
+torch==2.3
+transformers==4.29
+numpy==1.23
\ No newline at end of file
diff --git a/e2e2/test/images/bert-training/train.py b/e2e2/test/images/bert-training/train.py
new file mode 100644
index 000000000..0f9b5447d
--- /dev/null
+++ b/e2e2/test/images/bert-training/train.py
@@ -0,0 +1,140 @@
+import os
+import time
+import torch
+import torch.distributed as dist
+from torch.nn.parallel import DistributedDataParallel as DDP
+from transformers import BertForPreTraining, BertTokenizer
+from torch.utils.data import DataLoader, TensorDataset
+import numpy as np
+
+
+def create_dummy_data(tokenizer, num_samples=100, max_length=128):
+    # Create dummy input data
+    sentences = [
+        "This is a dummy sentence number {}".format(i) for i in range(num_samples)
+    ]
+    tokenized_inputs = tokenizer(
+        sentences,
+        max_length=max_length,
+        padding="max_length",
+        truncation=True,
+        return_tensors="pt",
+    )
+
+    # MLM task: randomly mask some tokens
+    mlm_probability = 0.15
+    input_ids, labels = mask_tokens(
+        tokenized_inputs.input_ids, tokenizer, mlm_probability
+    )
+
+    # NSP task: create dummy pairs
+    next_sentence_labels = torch.randint(0, 2, (num_samples,))
+
+    return TensorDataset(
+        input_ids, tokenized_inputs.attention_mask, labels, next_sentence_labels
+    )
+
+
+def mask_tokens(inputs, tokenizer, mlm_probability):
+    labels = inputs.clone()
+    probability_matrix = torch.full(labels.shape, mlm_probability)
+    special_tokens_mask = [
+        tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True)
+        for val in labels.tolist()
+    ]
+    probability_matrix.masked_fill_(
+        torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0
+    )
+    masked_indices = torch.bernoulli(probability_matrix).bool()
+    labels[~masked_indices] = -100  # We only compute loss on masked tokens
+
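+    # Note: this simplified test replaces every selected token with [MASK];
+    # the original BERT recipe uses an 80% [MASK] / 10% random / 10% keep split.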
+    inputs[masked_indices] = tokenizer.convert_tokens_to_ids(tokenizer.mask_token)
+
+    return inputs, labels
+
+
+def setup(rank, world_size, local_rank):
+    master_addr = os.environ["MASTER_ADDR"]
+    master_port = os.environ["MASTER_PORT"]
+    dist.init_process_group(
+        "nccl",
+        init_method=f"tcp://{master_addr}:{master_port}",
+        rank=rank,
+        world_size=world_size,
+    )
+    torch.cuda.set_device(local_rank)
+    print(f"Process {rank} initialized, using GPU {local_rank}")
+
+
+def cleanup():
+    dist.destroy_process_group()
+
+
+def train_bert(rank, world_size, local_rank, model, tokenizer):
+    setup(rank, world_size, local_rank)
+
+    model = model.to(local_rank)
+    ddp_model = DDP(model, device_ids=[local_rank])
+
+    dataset = create_dummy_data(tokenizer)
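+    # Note: no DistributedSampler is attached, so every rank iterates the full
+    # dummy dataset; that is acceptable for this throughput smoke test.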
+    train_dataloader = DataLoader(dataset, batch_size=8)
+
+    optimizer = torch.optim.AdamW(ddp_model.parameters(), lr=0.001)
+
+    start_time = time.time()
+
+    for epoch in range(1):  # Short run for testing
+        ddp_model.train()
+        for batch in train_dataloader:
+            optimizer.zero_grad()
+            inputs, masks, labels, next_sentence_labels = batch
+            inputs, masks, labels, next_sentence_labels = (
+                inputs.to(local_rank),
+                masks.to(local_rank),
+                labels.to(local_rank),
+                next_sentence_labels.to(local_rank),
+            )
+            outputs = ddp_model(
+                input_ids=inputs,
+                attention_mask=masks,
+                labels=labels,
+                next_sentence_label=next_sentence_labels,
+            )
+            loss = outputs.loss
+            loss.backward()
+            optimizer.step()
+
+    end_time = time.time()
+    training_time = end_time - start_time
+    throughput = len(dataset) / training_time
+
+    print(f"Process {rank} - Training time: {training_time:.2f} seconds")
+    print(f"Process {rank} - Throughput: {throughput:.2f} samples/second")
+
+    cleanup()
+
+
+def main():
+    rank = int(os.environ["OMPI_COMM_WORLD_RANK"])
+    world_size = int(os.environ["OMPI_COMM_WORLD_SIZE"])
+
+    num_gpus_per_node = int(os.environ["NUM_GPUS_PER_NODE"])
+    local_rank = rank % num_gpus_per_node
+
+    print(f"Process started for rank {rank} with local rank {local_rank}")
+
+    # Pre-download model and tokenizer
+    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
+    model = BertForPreTraining.from_pretrained("bert-base-uncased")
+
+    print(f"Successfully downloaded model and tokenizer for rank: {rank}")
+
+    train_bert(rank, world_size, local_rank, model, tokenizer)
+
+
+if __name__ == "__main__":
+    main()