From f5c18311fd938cd9df24d5693a52540648427202 Mon Sep 17 00:00:00 2001
From: Matt Johnson <71744796+mattcjo@users.noreply.github.com>
Date: Tue, 16 Jul 2024 09:51:27 -0700
Subject: [PATCH] Add docker image for BERT e2e inference task (#455)

* Add image for e2e bert inference testing, and all its dependencies

* Update git workflow with a new action to verify bert inference image builds

* Update Dockerfile name to not include prefix

* Update git workflow to account for changing of the Dockerfile name for bert inference

* Update bert-inference Dockerfile to install Python from source

* Update bert inference docker build to use relative path instead of absolute

* revert bert inference dockerfile path back to full path from relative
---
 .github/workflows/ci.yaml                    |   7 +-
 e2e2/test/images/bert-inference/Dockerfile   |  69 +++++++++++
 e2e2/test/images/bert-inference/infer.py     | 112 ++++++++++++++++++
 .../images/bert-inference/requirements.txt   |   3 +
 4 files changed, 190 insertions(+), 1 deletion(-)
 create mode 100644 e2e2/test/images/bert-inference/Dockerfile
 create mode 100644 e2e2/test/images/bert-inference/infer.py
 create mode 100644 e2e2/test/images/bert-inference/requirements.txt

diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index 926714489..d25612c07 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -25,4 +25,9 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v3
-      - run: docker build --file e2e2/test/images/nvidia/Dockerfile .
\ No newline at end of file
+      - run: docker build --file e2e2/test/images/nvidia/Dockerfile .
+  build-bert-inference:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+      - run: docker build --file e2e2/test/images/bert-inference/Dockerfile e2e2/test/images/bert-inference
diff --git a/e2e2/test/images/bert-inference/Dockerfile b/e2e2/test/images/bert-inference/Dockerfile
new file mode 100644
index 000000000..7433e33ce
--- /dev/null
+++ b/e2e2/test/images/bert-inference/Dockerfile
@@ -0,0 +1,69 @@
+# Use the NVIDIA CUDA devel image as the parent image
+FROM nvidia/cuda:12.5.0-devel-ubuntu22.04
+
+# Set environment variable to disable interactive prompts
+ENV DEBIAN_FRONTEND=noninteractive
+
+# Dependency version numbers
+ARG PYTHON=python3.10
+ARG PYTHON_VERSION=3.10.12
+ARG PIP=pip3
+
+RUN apt-get update \
+    && apt-get upgrade -y \
+    && apt-get install -y --no-install-recommends \
+    build-essential \
+    ca-certificates \
+    cmake \
+    curl \
+    emacs \
+    git \
+    jq \
+    libopencv-dev \
+    software-properties-common \
+    wget \
+    unzip \
+    vim \
+    pkg-config \
+    gdb \
+    lcov \
+    libbz2-dev \
+    zlib1g-dev \
+    openssl \
+    libssl-dev \
+    libsqlite3-dev \
+    libgdbm-dev \
+    libc6-dev \
+    libncurses-dev \
+    tk-dev \
+    libffi-dev \
+    libcap-dev \
+    gnupg2 \
+    gpg-agent \
+    && rm -rf /var/lib/apt/lists/* \
+    && apt-get clean
+
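+# Python is built from source below so the image controls the exact
+# interpreter version (PYTHON_VERSION) rather than taking whatever the
+# distro ships; --enable-shared additionally produces the libpython
+# shared library for native extensions that link against it.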
+# Install Python
+RUN wget -q https://www.python.org/ftp/python/$PYTHON_VERSION/Python-$PYTHON_VERSION.tgz \
+    && tar -xzf Python-$PYTHON_VERSION.tgz \
+    && cd Python-$PYTHON_VERSION \
+    && ./configure --enable-shared --prefix=/usr/local \
+    && make -j $(nproc) && make install \
+    && cd .. \
+    && rm -rf Python-$PYTHON_VERSION* \
+    && ln -s /usr/local/bin/pip3 /usr/bin/pip \
+    && ln -s /usr/local/bin/$PYTHON /usr/local/bin/python \
+    && ${PIP} --no-cache-dir install --upgrade \
+    pip \
+    setuptools
+
+# Set the working directory in the container
+WORKDIR /app
+
+# Copy only the necessary files into the container at /app
+COPY infer.py /app/
+COPY requirements.txt /app/
+
+# Install any needed packages specified in requirements.txt
+RUN python -m pip install --upgrade pip && \
+    pip install --no-cache-dir -r requirements.txt
diff --git a/e2e2/test/images/bert-inference/infer.py b/e2e2/test/images/bert-inference/infer.py
new file mode 100644
index 000000000..d417537e2
--- /dev/null
+++ b/e2e2/test/images/bert-inference/infer.py
@@ -0,0 +1,112 @@
+import os
+import time
+
+import torch
+from torch.utils.data import DataLoader, TensorDataset
+from transformers import BertForPreTraining, BertTokenizer
+
+
+def create_dummy_data(tokenizer, num_samples=100, max_length=128):
+    # Create dummy input data
+    sentences = [
+        "This is a dummy sentence number {}".format(i) for i in range(num_samples)
+    ]
+    tokenized_inputs = tokenizer(
+        sentences,
+        max_length=max_length,
+        padding="max_length",
+        truncation=True,
+        return_tensors="pt",
+    )
+
+    # MLM task: randomly mask some tokens so the inputs look like real
+    # masked-LM inputs (the returned labels are not needed for inference)
+    mlm_probability = 0.15
+    input_ids, _ = mask_tokens(
+        tokenized_inputs.input_ids, tokenizer, mlm_probability
+    )
+
+    # NSP task: create dummy next-sentence labels
+    next_sentence_labels = torch.randint(0, 2, (num_samples,))
+
+    return TensorDataset(
+        input_ids, tokenized_inputs.attention_mask, next_sentence_labels
+    )
+
+
+def mask_tokens(inputs, tokenizer, mlm_probability):
+    # Simplified BERT masking: every selected token becomes [MASK] (the usual
+    # 80/10/10 mask/random/keep split is skipped, which is fine for dummy data)
+    labels = inputs.clone()
+    probability_matrix = torch.full(labels.shape, mlm_probability)
+    special_tokens_mask = [
+        tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True)
+        for val in labels.tolist()
+    ]
+    probability_matrix.masked_fill_(
+        torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0
+    )
+    masked_indices = torch.bernoulli(probability_matrix).bool()
+    labels[~masked_indices] = -100  # We only compute loss on masked tokens
+
+    inputs[masked_indices] = tokenizer.convert_tokens_to_ids(tokenizer.mask_token)
+
+    return inputs, labels
+
+
+def run_inference(model, tokenizer, batch_size, mode):
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    model.to(device)
+    model.eval()
+
+    dataset = create_dummy_data(tokenizer)
+    dataloader = DataLoader(dataset, batch_size=batch_size)
+
+    total_time = 0
+    total_samples = 0
+    total_batches = len(dataloader)
+
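+    # Optional warm-up: a couple of untimed batches absorb one-off CUDA costs
+    # (context creation, cuDNN autotuning) so they do not skew the timed loop
+    with torch.no_grad():
+        for i, batch in enumerate(dataloader):
+            if i == 2:
+                break
+            model(
+                input_ids=batch[0].to(device),
+                attention_mask=batch[1].to(device),
+                next_sentence_label=batch[2].to(device),
+            )
+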
Exiting") + + print("GPU is available") + + # Pre-download model and tokenizer + tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") + model = BertForPreTraining.from_pretrained("bert-base-uncased") + + mode = os.environ.get("INFERENCE_MODE", "throughput").lower() + batch_size = 1 if mode == "latency" else 8 + + print(f"Running inference in {mode} mode with batch size {batch_size}") + run_inference(model, tokenizer, batch_size, mode) + + +if __name__ == "__main__": + main() diff --git a/e2e2/test/images/bert-inference/requirements.txt b/e2e2/test/images/bert-inference/requirements.txt new file mode 100644 index 000000000..a9831ed72 --- /dev/null +++ b/e2e2/test/images/bert-inference/requirements.txt @@ -0,0 +1,3 @@ +torch==2.3 +transformers==4.29 +numpy==1.23 \ No newline at end of file