Add docker image for BERT e2e training task #454

Merged
merged 19 commits into from
Aug 1, 2024
Changes from 18 commits
19 commits
af9fda0
Add python training script, requirements.txt (dependencies), and dock…
mattcjo Jun 26, 2024
104fa93
Add github action to build bert-testing image on PR
mattcjo Jun 26, 2024
477f672
Specify directory the BERT training image should be built in for the …
mattcjo Jun 26, 2024
fb7d18f
Add default values and include in docker env for MASTER_ADDR and MAST…
mattcjo Jun 27, 2024
b5aedc7
Slightly change env var value retrieval. Also ran a formatter to pret…
mattcjo Jun 27, 2024
7f9480b
Update bert training dockerfile to include amazon specific packages f…
mattcjo Jun 28, 2024
19613e1
Change Dockerfile.bert-training file name to just Dockerfile
mattcjo Jul 16, 2024
974da50
Update git workflow to use new Dockerfile path since the name was upd…
mattcjo Jul 16, 2024
5b4ae1a
Update Docker image to use Python version 3.10.12 and build from sour…
mattcjo Jul 16, 2024
6bc3ef4
Merge remote-tracking branch 'upstream/main'
mattcjo Jul 16, 2024
fa8d244
Remove extra line
mattcjo Jul 16, 2024
f87ba65
Had been setting MASTER_ADDR and MASTER_PORT env vars twice. Removed …
mattcjo Jul 18, 2024
7af6b13
Set each process to a GPU via local rank instead of overall rank
mattcjo Jul 18, 2024
1a3ad52
Merge remote-tracking branch 'upstream/main'
mattcjo Jul 18, 2024
1f5b1c9
Change comment describing section in dockerfile
mattcjo Jul 19, 2024
b67026c
Merge branch 'aws:main' into main
mattcjo Jul 23, 2024
4a8e0ec
parameterize number of gpus per node in Dockerfile and train.py
mattcjo Jul 23, 2024
60ddc02
Merge remote-tracking branch 'upstream/main'
mattcjo Jul 31, 2024
01d8270
formatting in train.py
mattcjo Jul 31, 2024
5 changes: 5 additions & 0 deletions .github/workflows/ci.yaml
@@ -31,3 +31,8 @@ jobs:
steps:
- uses: actions/checkout@v3
- run: docker build --file e2e2/test/images/bert-inference/Dockerfile e2e2/test/images/bert-inference
build-bert-training:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- run: docker build --file e2e2/test/images/bert-training/Dockerfile e2e2/test/images/bert-training
131 changes: 131 additions & 0 deletions e2e2/test/images/bert-training/Dockerfile
@@ -0,0 +1,131 @@
# Use the NVIDIA CUDA devel image as the parent image
FROM nvidia/cuda:12.5.0-devel-ubuntu22.04

# Set environment variable to disable interactive prompts
ENV DEBIAN_FRONTEND=noninteractive

# Set default values for MASTER_ADDR and MASTER_PORT
ENV MASTER_ADDR=127.0.0.1
ENV MASTER_PORT=12355

# Set default number of GPUs per node
ENV NUM_GPUS_PER_NODE=8

# Python dependency version numbers
ARG PYTHON=python3.10
ARG PYTHON_VERSION=3.10.12
ARG PIP=pip3

RUN apt-get update \
&& apt-get upgrade -y \
&& apt-get install -y --no-install-recommends \
build-essential \
ca-certificates \
cmake \
curl \
emacs \
git \
jq \
libopencv-dev \
software-properties-common \
wget \
unzip \
vim \
pkg-config \
gdb \
lcov \
libbz2-dev \
zlib1g-dev \
openssl \
libssl-dev \
libsqlite3-dev \
libgdbm-dev \
libc6-dev \
libncurses-dev \
tk-dev \
libffi-dev \
libcap-dev \
gnupg2 \
gpg-agent \
&& rm -rf /var/lib/apt/lists/* \
&& apt-get clean

# Install Python
RUN wget -q https://www.python.org/ftp/python/$PYTHON_VERSION/Python-$PYTHON_VERSION.tgz \
&& tar -xzf Python-$PYTHON_VERSION.tgz \
&& cd Python-$PYTHON_VERSION \
&& ./configure --enable-shared --prefix=/usr/local \
&& make -j $(nproc) && make install \
&& cd .. && rm -rf ../Python-$PYTHON_VERSION* \
&& ln -s /usr/local/bin/pip3 /usr/bin/pip \
&& ln -s /usr/local/bin/$PYTHON /usr/local/bin/python \
&& ${PIP} --no-cache-dir install --upgrade \
pip \
setuptools

# Set the working directory in the container
WORKDIR /app

# Copy only the necessary files into the container at /app
COPY train.py /app/
COPY requirements.txt /app/

# Install any needed packages specified in requirements.txt
RUN python -m pip install --upgrade pip && \
pip install --no-cache-dir -r requirements.txt

ARG EFA_INSTALLER_VERSION=latest
# 1.7.4+ is required to enforce proper EFA function with OFI_NCCL_DISABLE_GDR_REQUIRED_CHECK=0
ARG AWS_OFI_NCCL_VERSION=1.9.1
ARG NCCL_TESTS_VERSION=master

# Install necessary dependencies and remove old ones
RUN apt-get update -y && \
apt-get remove -y --allow-change-held-packages \
libmlx5-1 ibverbs-utils libibverbs-dev libibverbs1 libnccl2 libnccl-dev && \
rm -rf /opt/hpcx /usr/local/mpi /usr/local/ucx /etc/ld.so.conf.d/hpcx.conf && \
ldconfig && \
DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \
sudo git gcc vim kmod openssh-client openssh-server build-essential \
wget curl autoconf libtool gdb automake python3-distutils cmake \
apt-utils devscripts debhelper libsubunit-dev check pkg-config libhwloc-dev

# SSH configuration
RUN mkdir -p /var/run/sshd && \
sed -i 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' /etc/ssh/ssh_config && \
echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config && \
sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config

# Set environment variables for OpenMPI and CUDA
ENV LD_LIBRARY_PATH /opt/amazon/openmpi/lib64:/opt/amazon/openmpi/lib:/opt/amazon/efa/lib64:/opt/aws-ofi-nccl/install/lib:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64:/usr/local/lib/:/usr/lib64:/usr/lib/x86_64-linux-gnu/:$LD_LIBRARY_PATH
ENV PATH /usr/local/cuda/bin:/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:/usr/sbin:/usr/bin:/usr/local/bin:$PATH

# Install EFA
RUN cd $HOME \
&& curl -O https://efa-installer.amazonaws.com/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz \
&& tar -xf $HOME/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz \
&& cd aws-efa-installer \
&& ./efa_installer.sh -y -g -d --skip-kmod --skip-limit-conf --no-verify \
&& rm -rf $HOME/aws-efa-installer

# Install NCCL
RUN apt-key del 7fa2af80 && \
curl -L -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/cuda-keyring_1.0-1_all.deb && \
dpkg -i cuda-keyring_1.0-1_all.deb && \
sudo apt install libnccl2=2.18.5-1+cuda12.2 libnccl-dev=2.18.5-1+cuda12.2

# Install AWS-OFI-NCCL plugin
RUN export OPAL_PREFIX="" && \
git clone https://github.com/aws/aws-ofi-nccl.git /opt/aws-ofi-nccl && \
cd /opt/aws-ofi-nccl && \
git checkout v${AWS_OFI_NCCL_VERSION}-aws && \
./autogen.sh && \
./configure --prefix=/opt/aws-ofi-nccl/install --with-libfabric=/opt/amazon/efa/ --with-cuda=/usr/local/cuda --with-mpi=/opt/amazon/openmpi/ && \
make && make install

# Set environment variables for NCCL and clean up
ENV NCCL_PROTO simple
RUN rm -rf /var/lib/apt/lists/*
# Ensure NCCL library is found first
ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
3 changes: 3 additions & 0 deletions e2e2/test/images/bert-training/requirements.txt
@@ -0,0 +1,3 @@
torch==2.3
transformers==4.29
numpy==1.23
139 changes: 139 additions & 0 deletions e2e2/test/images/bert-training/train.py
@@ -0,0 +1,139 @@
import os
import time
import torch
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP
from transformers import BertForPreTraining, BertTokenizer
from torch.utils.data import DataLoader, TensorDataset
import numpy as np


def create_dummy_data(tokenizer, num_samples=100, max_length=128):
# Create dummy input data
sentences = [
"This is a dummy sentence number {}".format(i) for i in range(num_samples)
]
tokenized_inputs = tokenizer(
sentences,
max_length=max_length,
padding="max_length",
truncation=True,
return_tensors="pt",
)
labels = tokenized_inputs.input_ids.detach().clone()

# MLM task: randomly mask some tokens
mlm_probability = 0.15
input_ids, labels = mask_tokens(
tokenized_inputs.input_ids, tokenizer, mlm_probability
)

# NSP task: create dummy pairs
next_sentence_labels = torch.randint(0, 2, (num_samples,))

return TensorDataset(
input_ids, tokenized_inputs.attention_mask, labels, next_sentence_labels
)


def mask_tokens(inputs, tokenizer, mlm_probability):
labels = inputs.clone()
probability_matrix = torch.full(labels.shape, mlm_probability)
special_tokens_mask = [
tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True)
for val in labels.tolist()
]
probability_matrix.masked_fill_(
torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0
)
masked_indices = torch.bernoulli(probability_matrix).bool()
labels[~masked_indices] = -100 # We only compute loss on masked tokens

inputs[masked_indices] = tokenizer.convert_tokens_to_ids(tokenizer.mask_token)

return inputs, labels


def setup(rank, world_size, local_rank):
master_addr = os.environ["MASTER_ADDR"]
master_port = os.environ["MASTER_PORT"]
dist.init_process_group(
"nccl",
init_method=f"tcp://{master_addr}:{master_port}",
rank=rank,
world_size=world_size,
)
torch.cuda.set_device(local_rank)
print(f"Process {rank} initialized, using GPU {local_rank}")


def cleanup():
dist.destroy_process_group()


def train_bert(rank, world_size, local_rank, model, tokenizer):
setup(rank, world_size, local_rank)

model = model.to(local_rank)
ddp_model = DDP(model, device_ids=[local_rank])

dataset = create_dummy_data(tokenizer)
train_dataloader = DataLoader(dataset, batch_size=8)

optimizer = torch.optim.AdamW(ddp_model.parameters(), lr=0.001)
criterion = torch.nn.CrossEntropyLoss()

start_time = time.time()

for epoch in range(1): # Short run for testing
Contributor
Should we let the program read the epoch count from an environment variable or argument? This way we can allow larger instances (e.g. p5) to run more epochs without changing the code.

Contributor Author
What would be the purpose of having more epochs for larger instance sizes? Are you thinking about it purely from the perspective of wanting the tests to last the same amount of time for each instance type?

Contributor
Just some random thoughts. I was thinking we could run more epochs on larger instances to get more accurate performance data. Additionally, we could reuse this code for our future long-running tests (like soak tests).

Contributor Author
Gotcha... Yeah, I certainly appreciate the idea behind re-usability, but there's a good chance this current test isn't the best option for a soak test anyway.

As far as more epochs for larger instance types go, it depends on what your end goal is. For the tests we're running, and the metrics we're looking to gather, I don't see any benefit in doing this at this time.
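A minimal sketch of the reviewer's suggestion, assuming a hypothetical NUM_EPOCHS environment variable (not introduced by this PR) that defaults to the current single epoch:

import os

# Hypothetical: read the epoch count from the environment so larger
# instance types (e.g. p5) could run more epochs without a code change.
# NUM_EPOCHS is illustrative only and is not defined by this PR.
num_epochs = int(os.environ.get("NUM_EPOCHS", "1"))

for epoch in range(num_epochs):
    ...  # existing per-epoch training loop body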

ddp_model.train()
for batch in train_dataloader:
optimizer.zero_grad()
inputs, masks, labels, next_sentence_labels = batch
inputs, masks, labels, next_sentence_labels = (
inputs.to(local_rank),
masks.to(local_rank),
labels.to(local_rank),
next_sentence_labels.to(local_rank),
)
outputs = ddp_model(
input_ids=inputs,
attention_mask=masks,
labels=labels,
next_sentence_label=next_sentence_labels,
)
loss = outputs.loss
loss.backward()
optimizer.step()

end_time = time.time()
training_time = end_time - start_time
throughput = len(dataset) / training_time

print(f"Process {rank} - Training time: {training_time:.2f} seconds")
print(f"Process {rank} - Throughput: {throughput:.2f} samples/second")
Comment on lines +113 to +114
Contributor
Do we need to dump this output to disk so we can use it to upload to S3?

Contributor Author
@mattcjo mattcjo Jun 26, 2024
Potentially... I was also considering writing directly to S3, but was curious to hear others' perspectives. My intuition says writing to S3 is the long-term solution (once a stable schema is solidified), but short term just doing something like writing to disk or stdout might be the way to go.

Contributor
@ndbaker1 ndbaker1 Jun 26, 2024
I agree it can go to S3, CloudWatch, etc. once we know where this is going. We should also have this output printed, for sure.

Contributor
It should be fine to dump to disk for the short term, and you have enough flexibility to POC different long-term destinations.

Contributor Author
@Issacwww Are there any concerns/considerations with writing from the container to the host machine?

Contributor
Oh, good call out. This is on the tod worker, so dumping to disk is no different from stdout... stdout should be fine for now.
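A minimal sketch of the short-term approach discussed above; the helper name and the output path are hypothetical and not part of this PR:

import json

def report_metrics(rank, training_time, throughput, path="/app/results.json"):
    # Hypothetical helper: print the metrics, as the script already does,
    # and also dump them as JSON so a later step could upload them to S3
    # or CloudWatch once a destination is decided.
    metrics = {
        "rank": rank,
        "training_time_seconds": round(training_time, 2),
        "throughput_samples_per_second": round(throughput, 2),
    }
    print(json.dumps(metrics))
    with open(path, "w") as f:
        json.dump(metrics, f)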


cleanup()


def main():
rank = int(os.environ["OMPI_COMM_WORLD_RANK"])
world_size = int(os.environ["OMPI_COMM_WORLD_SIZE"])


num_gpus_per_node = int(os.environ["NUM_GPUS_PER_NODE"])
local_rank = rank % num_gpus_per_node

print(f"Process started for rank {rank} with local rank {local_rank}")

# Pre-download model and tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForPreTraining.from_pretrained("bert-base-uncased")

print(f"successfully downloaded model and tokenizer for rank: {rank}")

train_bert(rank, world_size, local_rank, model, tokenizer)


if __name__ == "__main__":
main()