From a33f25740b82dd655c65611dd67c95556fb181d2 Mon Sep 17 00:00:00 2001
From: Kris Hung
Date: Fri, 6 Oct 2023 17:22:12 -0700
Subject: [PATCH] Add TRT-LLM backend build to Triton (#6365) (#6392)

* Add TRT-LLM backend build to Triton (#6365)
* Add trtllm backend to build
* Temporarily adding version map for 23.07
* Fix build issue
* Update comment
* Comment out python binding changes
* Add post build
* Update trtllm backend naming
* Update TRTLLM base image
* Fix cmake arch
* Revert temp changes for python binding PR
* Address comment

* Move import to the top (#6395)
* Move import to the top
* pre commit format
---
 build.py | 83 ++++++++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 77 insertions(+), 6 deletions(-)

diff --git a/build.py b/build.py
index 8985646e1d..7de2d95449 100755
--- a/build.py
+++ b/build.py
@@ -26,6 +26,7 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 import argparse
+import importlib.util
 import multiprocessing
 import os
 import os.path
@@ -36,6 +37,8 @@
 import sys
 from inspect import getsourcefile
 
+import requests
+
 #
 # Build Triton Inference Server.
 #
@@ -74,7 +77,9 @@
         "2023.0.0",  # ORT OpenVINO
         "2023.0.0",  # Standalone OpenVINO
         "2.4.7",  # DCGM version
-        "py310_23.1.0-1",  # Conda version.
+        "py310_23.1.0-1",  # Conda version
+        "9.1.0.1",  # TRT version for building TRT-LLM backend
+        "12.2",  # CUDA version for building TRT-LLM backend
     )
 }
 
@@ -564,6 +569,8 @@ def backend_cmake_args(images, components, be, install_dir, library_paths):
         args = fastertransformer_cmake_args()
     elif be == "tensorrt":
         args = tensorrt_cmake_args()
+    elif be == "tensorrtllm":
+        args = tensorrtllm_cmake_args(images)
     else:
         args = []
 
@@ -859,6 +866,39 @@ def fastertransformer_cmake_args():
     return cargs
 
 
+def tensorrtllm_cmake_args(images):
+    cargs = [
+        cmake_backend_arg(
+            "tensorrtllm",
+            "TRT_LIB_DIR",
+            None,
+            "${TRT_ROOT}/targets/${ARCH}-linux-gnu/lib",
+        ),
+        cmake_backend_arg(
+            "tensorrtllm", "TRT_INCLUDE_DIR", None, "${TRT_ROOT}/include"
+        ),
+        cmake_backend_arg(
+            "tensorrtllm",
+            "TRTLLM_BUILD_CONTAINER",
+            None,
+            images["base"],
+        ),
+        cmake_backend_arg(
+            "tensorrtllm",
+            "TENSORRT_VERSION",
+            None,
+            TRITON_VERSION_MAP[FLAGS.version][7],
+        ),
+        cmake_backend_arg(
+            "tensorrtllm",
+            "CUDA_VERSION",
+            None,
+            TRITON_VERSION_MAP[FLAGS.version][8],
+        ),
+    ]
+    return cargs
+
+
 def install_dcgm_libraries(dcgm_version, target_machine):
     if dcgm_version == "":
         fail(
@@ -1237,10 +1277,6 @@ def dockerfile_prepare_container_linux(argmap, backends, enable_gpu, target_mach
 
     if "fastertransformer" in backends:
         be = "fastertransformer"
-        import importlib.util
-
-        import requests
-
         url = "https://raw.githubusercontent.com/triton-inference-server/fastertransformer_backend/{}/docker/create_dockerfile_and_build.py".format(
             backends[be]
         )
@@ -1278,6 +1314,23 @@
     pip3 install --upgrade numpy && \
     rm -rf /var/lib/apt/lists/*
 """
+    # Add dependencies needed for tensorrtllm backend
+    if "tensorrtllm" in backends:
+        be = "tensorrtllm"
+        # FIXME: Update the url
+        url = "https://gitlab-master.nvidia.com/krish/tensorrtllm_backend/-/raw/{}/tools/gen_trtllm_dockerfile.py".format(
+            backends[be]
+        )
+
+        response = requests.get(url)
+        spec = importlib.util.spec_from_loader(
+            "trtllm_buildscript", loader=None, origin=url
+        )
+        trtllm_buildscript = importlib.util.module_from_spec(spec)
+        exec(response.content, trtllm_buildscript.__dict__)
+        df += trtllm_buildscript.create_postbuild(
+            argmap["TRT_LLM_TRT_VERSION"], argmap["TRT_LLM_CUDA_VERSION"]
+        )
 
     df += """
 WORKDIR /opt/tritonserver
@@ -1441,6 +1494,8 @@ def create_build_dockerfiles(
         if FLAGS.version is None or FLAGS.version not in TRITON_VERSION_MAP
         else TRITON_VERSION_MAP[FLAGS.version][6],
     }
+    dockerfileargmap["TRT_LLM_TRT_VERSION"] = TRITON_VERSION_MAP[FLAGS.version][7]
+    dockerfileargmap["TRT_LLM_CUDA_VERSION"] = TRITON_VERSION_MAP[FLAGS.version][8]
 
     # For CPU-only image we need to copy some cuda libraries and dependencies
     # since we are using PyTorch and TensorFlow containers that
@@ -1726,6 +1781,12 @@ def core_build(
     cmake_script.blankln()
 
 
+def tensorrtllm_prebuild(cmake_script):
+    # Export the TRT_ROOT environment variable
+    cmake_script.cmd("export TRT_ROOT=/usr/local/tensorrt")
+    cmake_script.cmd("export ARCH=$(uname -m)")
+
+
 def backend_build(
     be,
     cmake_script,
@@ -1746,7 +1807,16 @@
     cmake_script.comment()
     cmake_script.mkdir(build_dir)
     cmake_script.cwd(build_dir)
-    cmake_script.gitclone(backend_repo(be), tag, be, github_organization)
+    # FIXME: Use GitHub repo
+    if be == "tensorrtllm":
+        cmake_script.gitclone(
+            backend_repo(be), tag, be, "https://gitlab-master.nvidia.com/krish"
+        )
+    else:
+        cmake_script.gitclone(backend_repo(be), tag, be, github_organization)
+
+    if be == "tensorrtllm":
+        tensorrtllm_prebuild(cmake_script)
 
     cmake_script.mkdir(repo_build_dir)
     cmake_script.cwd(repo_build_dir)
@@ -1757,6 +1827,7 @@
 
     cmake_script.mkdir(os.path.join(install_dir, "backends"))
     cmake_script.rmdir(os.path.join(install_dir, "backends", be))
+
     cmake_script.cpdir(
         os.path.join(repo_install_dir, "backends", be),
         os.path.join(install_dir, "backends"),