-
Notifications
You must be signed in to change notification settings - Fork 82
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add docker image for BERT e2e inference task (#455)
* Add image for e2e BERT inference testing, and all its dependencies
* Update git workflow with a new action to verify that the BERT inference image builds
* Update Dockerfile name to not include prefix
* Update git workflow to account for the changed Dockerfile name for BERT inference
* Update bert-inference Dockerfile to install Python from source
* Update BERT inference docker build to use a relative path instead of an absolute one
* Revert bert-inference Dockerfile path from relative back to the full path
- Loading branch information
Showing
4 changed files
with
190 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,69 @@ | ||
# Use the NVIDIA CUDA development image (compiler + headers) as a parent image
FROM nvidia/cuda:12.5.0-devel-ubuntu22.04

# Set environment variable to disable interactive prompts
ENV DEBIAN_FRONTEND=noninteractive

# Dependency version numbers
ARG PYTHON=python3.10
ARG PYTHON_VERSION=3.10.12
ARG PIP=pip3

# System packages: build toolchain plus the headers CPython needs to build
# its optional modules (ssl, sqlite3, bz2, zlib, ffi, curses, tk, gdbm).
RUN apt-get update \
 && apt-get upgrade -y \
 && apt-get install -y --no-install-recommends \
    build-essential \
    ca-certificates \
    cmake \
    curl \
    emacs \
    git \
    jq \
    libopencv-dev \
    software-properties-common \
    wget \
    unzip \
    vim \
    pkg-config \
    gdb \
    lcov \
    libbz2-dev \
    zlib1g-dev \
    openssl \
    libssl-dev \
    libsqlite3-dev \
    libgdbm-dev \
    libc6-dev \
    libncurses-dev \
    tk-dev \
    libffi-dev \
    libcap-dev \
    gnupg2 \
    gpg-agent \
 && rm -rf /var/lib/apt/lists/* \
 && apt-get clean

# Install Python from source.
# - ldconfig refreshes the linker cache so the interpreter can resolve the
#   libpython shared object produced by --enable-shared.
# - The source tree and tarball are removed relative to the directory the
#   build started in, so cleanup does not depend on that directory being "/".
RUN wget -q https://www.python.org/ftp/python/$PYTHON_VERSION/Python-$PYTHON_VERSION.tgz \
 && tar -xzf Python-$PYTHON_VERSION.tgz \
 && cd Python-$PYTHON_VERSION \
 && ./configure --enable-shared --prefix=/usr/local \
 && make -j "$(nproc)" && make install \
 && ldconfig \
 && cd .. && rm -rf Python-$PYTHON_VERSION* \
 && ln -s /usr/local/bin/pip3 /usr/bin/pip \
 && ln -s /usr/local/bin/$PYTHON /usr/local/bin/python \
 && ${PIP} --no-cache-dir install --upgrade \
    pip \
    setuptools

# Set the working directory in the container
WORKDIR /app

# Copy the dependency manifest first so the (slow) dependency layer below is
# only rebuilt when requirements.txt changes, not on every edit to infer.py.
COPY requirements.txt /app/

# Install any needed packages specified in requirements.txt
RUN python -m pip install --no-cache-dir --upgrade pip && \
    python -m pip install --no-cache-dir -r requirements.txt

# Copy the inference script last; it changes most often.
COPY infer.py /app/
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,112 @@ | ||
import os | ||
import time | ||
import torch | ||
from transformers import BertForPreTraining, BertTokenizer | ||
from torch.utils.data import DataLoader, TensorDataset | ||
import numpy as np | ||
|
||
|
||
def create_dummy_data(tokenizer, num_samples=100, max_length=128):
    """Build a synthetic dataset for BERT pre-training-style inference.

    Args:
        tokenizer: Tokenizer used to encode the dummy sentences. It is called
            with ``padding="max_length"``, ``truncation=True`` and
            ``return_tensors="pt"``, and is also handed to ``mask_tokens``.
        num_samples: Number of dummy sentences to generate.
        max_length: Length each encoded sentence is padded/truncated to.

    Returns:
        A ``TensorDataset`` whose items are
        ``(masked input ids, attention mask, next-sentence label)``.
    """
    # Create dummy input data
    sentences = [
        f"This is a dummy sentence number {i}" for i in range(num_samples)
    ]
    tokenized_inputs = tokenizer(
        sentences,
        max_length=max_length,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )

    # MLM task: randomly mask some tokens. The MLM labels returned alongside
    # the masked ids are not stored in the dataset, so they are discarded.
    mlm_probability = 0.15
    input_ids, _ = mask_tokens(
        tokenized_inputs.input_ids, tokenizer, mlm_probability
    )

    # NSP task: one random binary label per sample
    next_sentence_labels = torch.randint(0, 2, (num_samples,))

    return TensorDataset(
        input_ids, tokenized_inputs.attention_mask, next_sentence_labels
    )
|
||
|
||
def mask_tokens(inputs, tokenizer, mlm_probability):
    """Randomly replace non-special tokens with the tokenizer's mask token.

    Args:
        inputs: Integer tensor of token ids; each row is passed to
            ``tokenizer.get_special_tokens_mask`` as a list of ids.
        tokenizer: Object providing ``get_special_tokens_mask``,
            ``convert_tokens_to_ids`` and ``mask_token``.
        mlm_probability: Probability with which each non-special token is
            selected for masking.

    Returns:
        A ``(masked_inputs, labels)`` tuple. ``masked_inputs`` is a copy of
        ``inputs`` with the selected positions set to the mask token id;
        ``labels`` holds the original ids at the selected positions and
        ``-100`` everywhere else. ``inputs`` itself is left unmodified.
    """
    labels = inputs.clone()
    # Work on a copy so the caller's tensor is not overwritten in place.
    masked_inputs = inputs.clone()

    # Explicit float dtype: torch.bernoulli needs floating-point
    # probabilities, and torch.full would infer an integer dtype from an
    # integer fill value.
    probability_matrix = torch.full(
        labels.shape,
        float(mlm_probability),
        dtype=torch.float,
        device=inputs.device,
    )
    special_tokens_mask = [
        tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True)
        for val in labels.tolist()
    ]
    # Special tokens must never be selected for masking.
    probability_matrix.masked_fill_(
        torch.tensor(
            special_tokens_mask, dtype=torch.bool, device=inputs.device
        ),
        value=0.0,
    )
    masked_indices = torch.bernoulli(probability_matrix).bool()
    labels[~masked_indices] = -100  # We only compute loss on masked tokens

    masked_inputs[masked_indices] = tokenizer.convert_tokens_to_ids(
        tokenizer.mask_token
    )

    return masked_inputs, labels
|
||
|
||
def run_inference(model, tokenizer, batch_size, mode):
    """Run the model over dummy data and print latency/throughput figures.

    Args:
        model: Model to benchmark; it is moved to the selected device, put in
            eval mode and called with ``input_ids``, ``attention_mask`` and
            ``next_sentence_label`` keyword arguments.
        tokenizer: Tokenizer forwarded to ``create_dummy_data``.
        batch_size: Batch size used by the ``DataLoader``.
        mode: Label printed in the report; it does not alter the computation
            performed here.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()

    dataset = create_dummy_data(tokenizer)
    dataloader = DataLoader(dataset, batch_size=batch_size)

    on_cuda = device.type == "cuda"
    total_time = 0.0
    total_batches = 0
    total_samples = 0

    with torch.no_grad():
        for inputs, masks, next_sentence_labels in dataloader:
            inputs = inputs.to(device)
            masks = masks.to(device)
            next_sentence_labels = next_sentence_labels.to(device)

            # CUDA kernels are launched asynchronously: wait for pending work
            # before starting the clock and for the forward pass to finish
            # before stopping it, otherwise only launch overhead is measured.
            if on_cuda:
                torch.cuda.synchronize()
            start_time = time.perf_counter()
            model(
                input_ids=inputs,
                attention_mask=masks,
                next_sentence_label=next_sentence_labels,
            )
            if on_cuda:
                torch.cuda.synchronize()
            total_time += time.perf_counter() - start_time

            total_batches += 1
            # Count real samples: the final batch may be smaller than
            # batch_size.
            total_samples += inputs.size(0)

    # Guard the divisions so an empty loader cannot raise ZeroDivisionError.
    avg_time_per_batch = total_time / total_batches if total_batches else 0.0
    throughput = total_samples / total_time if total_time > 0 else 0.0

    print(f"Inference Mode: {mode}")
    print(f"Average time per batch: {avg_time_per_batch:.4f} seconds")
    print(f"Throughput: {throughput:.2f} samples/second")
|
||
|
||
def main():
    """Entry point: check for a GPU, load BERT and run the benchmark.

    The ``INFERENCE_MODE`` environment variable selects the batch size:
    ``latency`` (case-insensitive) uses a batch size of 1; any other value,
    including the default ``throughput``, uses a batch size of 8.

    Raises:
        RuntimeError: If ``torch.cuda.is_available()`` reports no GPU.
    """
    # Verify GPU availability
    if not torch.cuda.is_available():
        raise RuntimeError("GPU is not available. Exiting")

    print("GPU is available")

    # Load the pretrained tokenizer and model
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    model = BertForPreTraining.from_pretrained("bert-base-uncased")

    mode = os.environ.get("INFERENCE_MODE", "throughput").lower()
    batch_size = 1 if mode == "latency" else 8

    print(f"Running inference in {mode} mode with batch size {batch_size}")
    run_inference(model, tokenizer, batch_size, mode)


if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
torch==2.3 | ||
transformers==4.29 | ||
numpy==1.23 |