Add TRT-LLM backend build to Triton (#6365) (#6392)
* Add TRT-LLM backend build to Triton (#6365)

* Add trtllm backend to build

* Temporarily adding version map for 23.07

* Fix build issue

* Update comment

* Comment out python binding changes

* Add post build

* Update trtllm backend naming

* Update TRTLLM base image

* Fix cmake arch

* Revert temp changes for python binding PR

* Address comment

* Move import to the top (#6395)

* Move import to the top

* pre commit format
krishung5 authored Oct 7, 2023
1 parent 2bf543b commit a33f257
Showing 1 changed file with 77 additions and 6 deletions.
83 changes: 77 additions & 6 deletions build.py
@@ -26,6 +26,7 @@
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

import argparse
import importlib.util
import multiprocessing
import os
import os.path
@@ -36,6 +37,8 @@
import sys
from inspect import getsourcefile

import requests

#
# Build Triton Inference Server.
#
@@ -74,7 +77,9 @@
"2023.0.0", # ORT OpenVINO
"2023.0.0", # Standalone OpenVINO
"2.4.7", # DCGM version
"py310_23.1.0-1", # Conda version.
"py310_23.1.0-1", # Conda version
"9.1.0.1", # TRT version for building TRT-LLM backend
"12.2", # CUDA version for building TRT-LLM backend
)
}

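A note on the two new tuple entries: TRITON_VERSION_MAP stores the per-release component versions positionally, so the TRT-LLM additions are read back by index further down in this change (index 7 for the TensorRT version, index 8 for the CUDA version). A minimal self-contained sketch of that lookup, using a hypothetical release key and placeholder entries:

version_map = {
    "2.99.0": (  # hypothetical release key
        "...", "...", "...", "...", "...", "...",  # indices 0-5: other component versions
        "py310_23.1.0-1",  # index 6: Conda version
        "9.1.0.1",  # index 7: TRT version for building TRT-LLM backend
        "12.2",  # index 8: CUDA version for building TRT-LLM backend
    )
}
trt_llm_trt_version = version_map["2.99.0"][7]  # "9.1.0.1"
trt_llm_cuda_version = version_map["2.99.0"][8]  # "12.2"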
@@ -564,6 +569,8 @@ def backend_cmake_args(images, components, be, install_dir, library_paths):
        args = fastertransformer_cmake_args()
    elif be == "tensorrt":
        args = tensorrt_cmake_args()
    elif be == "tensorrtllm":
        args = tensorrtllm_cmake_args(images)
    else:
        args = []

@@ -859,6 +866,39 @@ def fastertransformer_cmake_args():
    return cargs


def tensorrtllm_cmake_args(images):
    cargs = [
        cmake_backend_arg(
            "tensorrtllm",
            "TRT_LIB_DIR",
            None,
            "${TRT_ROOT}/targets/${ARCH}-linux-gnu/lib",
        ),
        cmake_backend_arg(
            "tensorrtllm", "TRT_INCLUDE_DIR", None, "${TRT_ROOT}/include"
        ),
        cmake_backend_arg(
            "tensorrtllm",
            "TRTLLM_BUILD_CONTAINER",
            None,
            images["base"],
        ),
        cmake_backend_arg(
            "tensorrtllm",
            "TENSORRT_VERSION",
            None,
            TRITON_VERSION_MAP[FLAGS.version][7],
        ),
        cmake_backend_arg(
            "tensorrtllm",
            "CUDA_VERSION",
            None,
            TRITON_VERSION_MAP[FLAGS.version][8],
        ),
    ]
    return cargs


def install_dcgm_libraries(dcgm_version, target_machine):
    if dcgm_version == "":
        fail(
@@ -1237,10 +1277,6 @@ def dockerfile_prepare_container_linux(argmap, backends, enable_gpu, target_mach

if "fastertransformer" in backends:
be = "fastertransformer"
import importlib.util

import requests

url = "https://raw.githubusercontent.com/triton-inference-server/fastertransformer_backend/{}/docker/create_dockerfile_and_build.py".format(
backends[be]
)
@@ -1278,6 +1314,23 @@ def dockerfile_prepare_container_linux(argmap, backends, enable_gpu, target_mach
    pip3 install --upgrade numpy && \
    rm -rf /var/lib/apt/lists/*
"""
    # Add dependencies needed for tensorrtllm backend
    if "tensorrtllm" in backends:
        be = "tensorrtllm"
        # FIXME: Update the url
        url = "https://gitlab-master.nvidia.com/krish/tensorrtllm_backend/-/raw/{}/tools/gen_trtllm_dockerfile.py".format(
            backends[be]
        )

        response = requests.get(url)
        spec = importlib.util.spec_from_loader(
            "trtllm_buildscript", loader=None, origin=url
        )
        trtllm_buildscript = importlib.util.module_from_spec(spec)
        exec(response.content, trtllm_buildscript.__dict__)
        df += trtllm_buildscript.create_postbuild(
            argmap["TRT_LLM_TRT_VERSION"], argmap["TRT_LLM_CUDA_VERSION"]
        )

    df += """
WORKDIR /opt/tritonserver
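Both the fastertransformer block and the new tensorrtllm block above use the same fetch-and-exec pattern: the Dockerfile-generation helper is downloaded at build time and executed into a synthetic module object instead of being vendored into this repository. A minimal standalone sketch of the pattern, with a hypothetical URL and helper function:

import importlib.util

import requests

url = "https://example.com/gen_dockerfile.py"  # hypothetical helper script
response = requests.get(url)
response.raise_for_status()

# Build an empty module spec (no loader), then populate the module namespace
# by exec()-ing the downloaded source; functions defined in the script become
# attributes of the module object.
spec = importlib.util.spec_from_loader("remote_buildscript", loader=None, origin=url)
remote_buildscript = importlib.util.module_from_spec(spec)
exec(response.content, remote_buildscript.__dict__)

# The generated Dockerfile fragment can then be requested, e.g.:
# df += remote_buildscript.create_postbuild(trt_version, cuda_version)

The upside is that the generation logic stays in the backend repository pinned by backends[be]; the cost is a network dependency at image-build time.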
@@ -1441,6 +1494,8 @@ def create_build_dockerfiles(
        if FLAGS.version is None or FLAGS.version not in TRITON_VERSION_MAP
        else TRITON_VERSION_MAP[FLAGS.version][6],
    }
    dockerfileargmap["TRT_LLM_TRT_VERSION"] = TRITON_VERSION_MAP[FLAGS.version][7]
    dockerfileargmap["TRT_LLM_CUDA_VERSION"] = TRITON_VERSION_MAP[FLAGS.version][8]

    # For CPU-only image we need to copy some cuda libraries and dependencies
    # since we are using PyTorch and TensorFlow containers that
@@ -1726,6 +1781,12 @@ def core_build(
    cmake_script.blankln()


def tensorrtllm_prebuild(cmake_script):
    # Export the TRT_ROOT environment variable
    cmake_script.cmd("export TRT_ROOT=/usr/local/tensorrt")
    cmake_script.cmd("export ARCH=$(uname -m)")


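The "${TRT_ROOT}" and "${ARCH}" placeholders passed by tensorrtllm_cmake_args() are not expanded in Python; they only resolve because tensorrtllm_prebuild() above writes matching export lines into the generated build script before cmake runs. An illustrative sketch of the expansion semantics, using os.path.expandvars as a stand-in for the shell that actually executes that script:

import os
import platform

# Stand-ins for the exports emitted by tensorrtllm_prebuild(); in the real
# build these run in the generated shell script, not in Python.
os.environ["TRT_ROOT"] = "/usr/local/tensorrt"
os.environ["ARCH"] = platform.machine()  # e.g. "x86_64"

trt_lib_dir = os.path.expandvars("${TRT_ROOT}/targets/${ARCH}-linux-gnu/lib")
trt_include_dir = os.path.expandvars("${TRT_ROOT}/include")
print(trt_lib_dir)      # /usr/local/tensorrt/targets/x86_64-linux-gnu/lib
print(trt_include_dir)  # /usr/local/tensorrt/include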
def backend_build(
    be,
    cmake_script,
@@ -1746,7 +1807,16 @@
    cmake_script.comment()
    cmake_script.mkdir(build_dir)
    cmake_script.cwd(build_dir)
-    cmake_script.gitclone(backend_repo(be), tag, be, github_organization)
+    # FIXME: Use GitHub repo
+    if be == "tensorrtllm":
+        cmake_script.gitclone(
+            backend_repo(be), tag, be, "https://gitlab-master.nvidia.com/krish"
+        )
+    else:
+        cmake_script.gitclone(backend_repo(be), tag, be, github_organization)
+
+    if be == "tensorrtllm":
+        tensorrtllm_prebuild(cmake_script)

    cmake_script.mkdir(repo_build_dir)
    cmake_script.cwd(repo_build_dir)
@@ -1757,6 +1827,7 @@

    cmake_script.mkdir(os.path.join(install_dir, "backends"))
    cmake_script.rmdir(os.path.join(install_dir, "backends", be))

    cmake_script.cpdir(
        os.path.join(repo_install_dir, "backends", be),
        os.path.join(install_dir, "backends"),
