Add TRT-LLM backend build to Triton (#6365) #6392

Merged · 2 commits · Oct 7, 2023
build.py — 83 changes: 77 additions & 6 deletions
@@ -26,6 +26,7 @@
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

import argparse
import importlib.util
import multiprocessing
import os
import os.path
@@ -36,6 +37,8 @@
import sys
from inspect import getsourcefile

import requests
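
Note: importlib.util and requests are hoisted to module scope in this change
because they are now shared by two Dockerfile-generation paths: the existing
fastertransformer block (which previously imported them inline) and the new
tensorrtllm block added further down.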

#
# Build Triton Inference Server.
#
@@ -74,7 +77,9 @@
"2023.0.0", # ORT OpenVINO
"2023.0.0", # Standalone OpenVINO
"2.4.7", # DCGM version
"py310_23.1.0-1", # Conda version.
"py310_23.1.0-1", # Conda version
"9.1.0.1", # TRT version for building TRT-LLM backend
"12.2", # CUDA version for building TRT-LLM backend
)
}
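
The two new tuple entries are consumed positionally later in this diff. A
minimal sketch of the lookups, assuming FLAGS.version is a key of
TRITON_VERSION_MAP (variable names here are illustrative, not from the PR):

trt_llm_trt_version = TRITON_VERSION_MAP[FLAGS.version][7]   # "9.1.0.1"
trt_llm_cuda_version = TRITON_VERSION_MAP[FLAGS.version][8]  # "12.2"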

@@ -564,6 +569,8 @@ def backend_cmake_args(images, components, be, install_dir, library_paths):
        args = fastertransformer_cmake_args()
    elif be == "tensorrt":
        args = tensorrt_cmake_args()
    elif be == "tensorrtllm":
        args = tensorrtllm_cmake_args(images)
    else:
        args = []

@@ -859,6 +866,39 @@ def fastertransformer_cmake_args():
    return cargs


def tensorrtllm_cmake_args(images):
    cargs = [
        cmake_backend_arg(
            "tensorrtllm",
            "TRT_LIB_DIR",
            None,
            "${TRT_ROOT}/targets/${ARCH}-linux-gnu/lib",
        ),
        cmake_backend_arg(
            "tensorrtllm", "TRT_INCLUDE_DIR", None, "${TRT_ROOT}/include"
        ),
        cmake_backend_arg(
            "tensorrtllm",
            "TRTLLM_BUILD_CONTAINER",
            None,
            images["base"],
        ),
        cmake_backend_arg(
            "tensorrtllm",
            "TENSORRT_VERSION",
            None,
            TRITON_VERSION_MAP[FLAGS.version][7],
        ),
        cmake_backend_arg(
            "tensorrtllm",
            "CUDA_VERSION",
            None,
            TRITON_VERSION_MAP[FLAGS.version][8],
        ),
    ]
    return cargs
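
Assuming cmake_backend_arg("tensorrtllm", name, None, value) renders a plain
-D<name>=<value> CMake flag (with the None type argument omitting any :TYPE
suffix), the list above would expand to roughly the following; the container
image entry is a placeholder for whatever images["base"] resolves to:

expected_flags = [
    "-DTRT_LIB_DIR=${TRT_ROOT}/targets/${ARCH}-linux-gnu/lib",
    "-DTRT_INCLUDE_DIR=${TRT_ROOT}/include",
    "-DTRTLLM_BUILD_CONTAINER=<images['base']>",
    "-DTENSORRT_VERSION=9.1.0.1",
    "-DCUDA_VERSION=12.2",
]

The ${TRT_ROOT} and ${ARCH} placeholders are resolved by the environment
variables exported in tensorrtllm_prebuild() further down.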


def install_dcgm_libraries(dcgm_version, target_machine):
    if dcgm_version == "":
        fail(
@@ -1237,10 +1277,6 @@ def dockerfile_prepare_container_linux(argmap, backends, enable_gpu, target_mach

if "fastertransformer" in backends:
be = "fastertransformer"
import importlib.util

import requests

url = "https://raw.githubusercontent.com/triton-inference-server/fastertransformer_backend/{}/docker/create_dockerfile_and_build.py".format(
backends[be]
)
@@ -1278,6 +1314,23 @@ def dockerfile_prepare_container_linux(argmap, backends, enable_gpu, target_mach
    pip3 install --upgrade numpy && \
    rm -rf /var/lib/apt/lists/*
"""
    # Add dependencies needed for tensorrtllm backend
    if "tensorrtllm" in backends:
        be = "tensorrtllm"
        # FIXME: Update the url
        url = "https://gitlab-master.nvidia.com/krish/tensorrtllm_backend/-/raw/{}/tools/gen_trtllm_dockerfile.py".format(
            backends[be]
        )

        response = requests.get(url)
        spec = importlib.util.spec_from_loader(
            "trtllm_buildscript", loader=None, origin=url
        )
        trtllm_buildscript = importlib.util.module_from_spec(spec)
        exec(response.content, trtllm_buildscript.__dict__)
        df += trtllm_buildscript.create_postbuild(
            argmap["TRT_LLM_TRT_VERSION"], argmap["TRT_LLM_CUDA_VERSION"]
        )
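
The fetch-and-exec pattern above mirrors the existing fastertransformer flow.
As a standalone sketch of the same importlib mechanics (load_remote_module is
a hypothetical helper, not a function in build.py):

import importlib.util

import requests


def load_remote_module(name, url):
    # Download a Python script and execute it inside a synthetic module so
    # that its top-level functions (e.g. create_postbuild) become callable.
    response = requests.get(url)
    response.raise_for_status()  # not in the PR; fail fast on HTTP errors
    spec = importlib.util.spec_from_loader(name, loader=None, origin=url)
    module = importlib.util.module_from_spec(spec)
    exec(response.content, module.__dict__)
    return module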

df += """
WORKDIR /opt/tritonserver
@@ -1441,6 +1494,8 @@ def create_build_dockerfiles(
        if FLAGS.version is None or FLAGS.version not in TRITON_VERSION_MAP
        else TRITON_VERSION_MAP[FLAGS.version][6],
    }
    dockerfileargmap["TRT_LLM_TRT_VERSION"] = TRITON_VERSION_MAP[FLAGS.version][7]
    dockerfileargmap["TRT_LLM_CUDA_VERSION"] = TRITON_VERSION_MAP[FLAGS.version][8]

    # For CPU-only image we need to copy some cuda libraries and dependencies
    # since we are using PyTorch and TensorFlow containers that
@@ -1726,6 +1781,12 @@ def core_build(
    cmake_script.blankln()


def tensorrtllm_prebuild(cmake_script):
    # Export the TRT_ROOT and ARCH environment variables referenced by the
    # TRT-LLM CMake arguments
    cmake_script.cmd("export TRT_ROOT=/usr/local/tensorrt")
    cmake_script.cmd("export ARCH=$(uname -m)")


def backend_build(
    be,
    cmake_script,
@@ -1746,7 +1807,16 @@
    cmake_script.comment()
    cmake_script.mkdir(build_dir)
    cmake_script.cwd(build_dir)
    # FIXME: Use GitHub repo
    if be == "tensorrtllm":
        cmake_script.gitclone(
            backend_repo(be), tag, be, "https://gitlab-master.nvidia.com/krish"
        )
    else:
        cmake_script.gitclone(backend_repo(be), tag, be, github_organization)

    if be == "tensorrtllm":
        tensorrtllm_prebuild(cmake_script)
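
Assuming backend_repo(be) follows the usual "<be>_backend" naming
(consistent with the tensorrtllm_backend URL in the Dockerfile section
above), the temporary branch clones the same GitLab project, while every
other backend keeps cloning from github_organization. An illustrative
reconstruction, not code from the PR:

clone_url = "{}/{}".format(
    "https://gitlab-master.nvidia.com/krish", backend_repo("tensorrtllm")
)  # -> https://gitlab-master.nvidia.com/krish/tensorrtllm_backend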

    cmake_script.mkdir(repo_build_dir)
    cmake_script.cwd(repo_build_dir)
@@ -1757,6 +1827,7 @@

    cmake_script.mkdir(os.path.join(install_dir, "backends"))
    cmake_script.rmdir(os.path.join(install_dir, "backends", be))

    cmake_script.cpdir(
        os.path.join(repo_install_dir, "backends", be),
        os.path.join(install_dir, "backends"),