From f5c18311fd938cd9df24d5693a52540648427202 Mon Sep 17 00:00:00 2001
From: Matt Johnson <71744796+mattcjo@users.noreply.github.com>
Date: Tue, 16 Jul 2024 09:51:27 -0700
Subject: [PATCH] Add docker image for BERT e2e inference task (#455)

* Add image for e2e bert inference testing, and all its dependencies

* Update git workflow with a new action to verify bert inference image builds

* Update Dockerfile name to not include prefix

* Update git workflow to account for changing of the Dockerfile name for bert inference

* Update bert-inference Dockerfile to install Python from source

* Update bert inference docker build to use relative path instead of absolute

* revert bert inference dockerfile path back to full path from relative
---
 .github/workflows/ci.yaml                    |   7 +-
 e2e2/test/images/bert-inference/Dockerfile   |  69 +++++++++++
 e2e2/test/images/bert-inference/infer.py     | 112 ++++++++++++++++++
 .../images/bert-inference/requirements.txt   |   3 +
 4 files changed, 190 insertions(+), 1 deletion(-)
 create mode 100644 e2e2/test/images/bert-inference/Dockerfile
 create mode 100644 e2e2/test/images/bert-inference/infer.py
 create mode 100644 e2e2/test/images/bert-inference/requirements.txt

diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index 926714489..d25612c07 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -25,4 +25,9 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v3
-      - run: docker build --file e2e2/test/images/nvidia/Dockerfile .
\ No newline at end of file
+      - run: docker build --file e2e2/test/images/nvidia/Dockerfile .
+  build-bert-inference:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+      - run: docker build --file e2e2/test/images/bert-inference/Dockerfile e2e2/test/images/bert-inference
diff --git a/e2e2/test/images/bert-inference/Dockerfile b/e2e2/test/images/bert-inference/Dockerfile
new file mode 100644
index 000000000..7433e33ce
--- /dev/null
+++ b/e2e2/test/images/bert-inference/Dockerfile
@@ -0,0 +1,69 @@
+# Use the NVIDIA CUDA devel image as the parent image
+FROM nvidia/cuda:12.5.0-devel-ubuntu22.04
+
+# Set environment variable to disable interactive prompts
+ENV DEBIAN_FRONTEND=noninteractive
+
+# Dependency version numbers
+ARG PYTHON=python3.10
+ARG PYTHON_VERSION=3.10.12
+ARG PIP=pip3
+
+RUN apt-get update \
+    && apt-get upgrade -y \
+    && apt-get install -y --no-install-recommends \
+    build-essential \
+    ca-certificates \
+    cmake \
+    curl \
+    emacs \
+    git \
+    jq \
+    libopencv-dev \
+    software-properties-common \
+    wget \
+    unzip \
+    vim \
+    pkg-config \
+    gdb \
+    lcov \
+    libbz2-dev \
+    zlib1g-dev \
+    openssl \
+    libssl-dev \
+    libsqlite3-dev \
+    libgdbm-dev \
+    libc6-dev \
+    libncurses-dev \
+    tk-dev \
+    libffi-dev \
+    libcap-dev \
+    gnupg2 \
+    gpg-agent \
+    && rm -rf /var/lib/apt/lists/* \
+    && apt-get clean
+
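+# Python is built from source below so the image controls the exact
+# interpreter version (PYTHON_VERSION) rather than taking whatever the
+# distro ships; --enable-shared additionally produces the libpython
+# shared library for native extensions that link against it.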
+# Install Python
+RUN wget -q https://www.python.org/ftp/python/$PYTHON_VERSION/Python-$PYTHON_VERSION.tgz \
+    && tar -xzf Python-$PYTHON_VERSION.tgz \
+    && cd Python-$PYTHON_VERSION \
+    && ./configure --enable-shared --prefix=/usr/local \
+    && make -j $(nproc) && make install \
+    && cd .. \
+    && rm -rf Python-$PYTHON_VERSION* \
+    && ln -s /usr/local/bin/pip3 /usr/bin/pip \
+    && ln -s /usr/local/bin/$PYTHON /usr/local/bin/python \
+    && ${PIP} --no-cache-dir install --upgrade \
+    pip \
+    setuptools
+
+# Set the working directory in the container
+WORKDIR /app
+
+# Copy only the necessary files into the container at /app
+COPY infer.py /app/
+COPY requirements.txt /app/
+
+# Install any needed packages specified in requirements.txt
+RUN python -m pip install --upgrade pip && \
+    pip install --no-cache-dir -r requirements.txt
diff --git a/e2e2/test/images/bert-inference/infer.py b/e2e2/test/images/bert-inference/infer.py
new file mode 100644
index 000000000..d417537e2
--- /dev/null
+++ b/e2e2/test/images/bert-inference/infer.py
@@ -0,0 +1,112 @@
+import os
+import time
+
+import torch
+from torch.utils.data import DataLoader, TensorDataset
+from transformers import BertForPreTraining, BertTokenizer
+
+
+def create_dummy_data(tokenizer, num_samples=100, max_length=128):
+    # Create dummy input data
+    sentences = [
+        "This is a dummy sentence number {}".format(i) for i in range(num_samples)
+    ]
+    tokenized_inputs = tokenizer(
+        sentences,
+        max_length=max_length,
+        padding="max_length",
+        truncation=True,
+        return_tensors="pt",
+    )
+
+    # MLM task: randomly mask some tokens so the inputs look like real
+    # masked-LM inputs (the returned labels are not needed for inference)
+    mlm_probability = 0.15
+    input_ids, _ = mask_tokens(
+        tokenized_inputs.input_ids, tokenizer, mlm_probability
+    )
+
+    # NSP task: create dummy next-sentence labels
+    next_sentence_labels = torch.randint(0, 2, (num_samples,))
+
+    return TensorDataset(
+        input_ids, tokenized_inputs.attention_mask, next_sentence_labels
+    )
+
+
+def mask_tokens(inputs, tokenizer, mlm_probability):
+    # Simplified BERT masking: every selected token becomes [MASK] (the usual
+    # 80/10/10 mask/random/keep split is skipped, which is fine for dummy data)
+    labels = inputs.clone()
+    probability_matrix = torch.full(labels.shape, mlm_probability)
+    special_tokens_mask = [
+        tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True)
+        for val in labels.tolist()
+    ]
+    probability_matrix.masked_fill_(
+        torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0
+    )
+    masked_indices = torch.bernoulli(probability_matrix).bool()
+    labels[~masked_indices] = -100  # We only compute loss on masked tokens
+
+    inputs[masked_indices] = tokenizer.convert_tokens_to_ids(tokenizer.mask_token)
+
+    return inputs, labels
+
+
+def run_inference(model, tokenizer, batch_size, mode):
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    model.to(device)
+    model.eval()
+
+    dataset = create_dummy_data(tokenizer)
+    dataloader = DataLoader(dataset, batch_size=batch_size)
+
+    total_time = 0
+    total_samples = 0
+    total_batches = len(dataloader)
+
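+    # Optional warm-up: a couple of untimed batches absorb one-off CUDA costs
+    # (context creation, cuDNN autotuning) so they do not skew the timed loop
+    with torch.no_grad():
+        for i, batch in enumerate(dataloader):
+            if i == 2:
+                break
+            model(
+                input_ids=batch[0].to(device),
+                attention_mask=batch[1].to(device),
+                next_sentence_label=batch[2].to(device),
+            )
+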
Exiting") + + print("GPU is available") + + # Pre-download model and tokenizer + tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") + model = BertForPreTraining.from_pretrained("bert-base-uncased") + + mode = os.environ.get("INFERENCE_MODE", "throughput").lower() + batch_size = 1 if mode == "latency" else 8 + + print(f"Running inference in {mode} mode with batch size {batch_size}") + run_inference(model, tokenizer, batch_size, mode) + + +if __name__ == "__main__": + main() diff --git a/e2e2/test/images/bert-inference/requirements.txt b/e2e2/test/images/bert-inference/requirements.txt new file mode 100644 index 000000000..a9831ed72 --- /dev/null +++ b/e2e2/test/images/bert-inference/requirements.txt @@ -0,0 +1,3 @@ +torch==2.3 +transformers==4.29 +numpy==1.23 \ No newline at end of file