diff --git a/build.py b/build.py
index 214c8d809c..5ee71e206b 100755
--- a/build.py
+++ b/build.py
@@ -1319,18 +1319,50 @@ def dockerfile_prepare_container_linux(argmap, backends, enable_gpu, target_mach
     # Add dependencies needed for tensorrtllm backend
     if "tensorrtllm" in backends:
         be = "tensorrtllm"
-        url = "https://raw.githubusercontent.com/triton-inference-server/tensorrtllm_backend/{}/tools/gen_trtllm_dockerfile.py".format(
-            backends[be]
-        )
-
-        response = requests.get(url)
-        spec = importlib.util.spec_from_loader(
-            "trtllm_buildscript", loader=None, origin=url
+        df += """
+WORKDIR /workspace
+RUN apt-get update && apt-get install -y --no-install-recommends python3-pip
+
+# Remove previous TRT installation
+RUN apt-get remove --purge -y tensorrt* libnvinfer*
+RUN pip uninstall -y tensorrt
+# Install new version of TRT using the script from TRT-LLM
+RUN apt-get update && apt-get install -y --no-install-recommends python-is-python3
+RUN git clone --single-branch --depth=1 -b {} https://{}:{}@gitlab-master.nvidia.com/ftp/tekit_backend.git tensorrtllm_backend
+RUN cd tensorrtllm_backend && git submodule update --init --recursive
+ENV TRT_VER=9.2.0.4
+ENV CUDA_VER=12.3
+ENV CUDNN_VER=8.9.6.50-1+cuda12.2
+ENV NCCL_VER=2.19.3-1+cuda12.3
+ENV CUBLAS_VER=12.3.2.9-1
+RUN cp tensorrtllm_backend/tensorrt_llm/docker/common/install_tensorrt.sh /tmp/
+RUN rm -fr tensorrtllm_backend
+    """.format(
+            backends[be],
+            os.environ["REMOVE_ME_TRTLLM_USERNAME"],
+            os.environ["REMOVE_ME_TRTLLM_TOKEN"],
         )
-        trtllm_buildscript = importlib.util.module_from_spec(spec)
-        exec(response.content, trtllm_buildscript.__dict__)
-        df += trtllm_buildscript.create_postbuild(backends[be])
+        df += """
+RUN bash /tmp/install_tensorrt.sh --CUDA_VER=$CUDA_VER --CUDNN_VER=$CUDNN_VER --NCCL_VER=$NCCL_VER --CUBLAS_VER=$CUBLAS_VER && rm /tmp/install_tensorrt.sh
+ENV TRT_ROOT=/usr/local/tensorrt
+# Remove TRT contents that are not needed in runtime
+RUN ARCH="$(uname -i)" && \
+    rm -fr ${TRT_ROOT}/bin ${TRT_ROOT}/targets/${ARCH}-linux-gnu/bin ${TRT_ROOT}/data && \
+    rm -fr ${TRT_ROOT}/doc ${TRT_ROOT}/onnx_graphsurgeon ${TRT_ROOT}/python && \
+    rm -fr ${TRT_ROOT}/samples ${TRT_ROOT}/targets/${ARCH}-linux-gnu/samples
+
+# Install required packages for TRT-LLM models
+RUN python3 -m pip install --upgrade pip && \
+    pip3 install transformers
+
+# Uninstall unused nvidia packages
+RUN if pip freeze | grep -q "nvidia.*"; then \
+        pip freeze | grep "nvidia.*" | xargs pip uninstall -y; \
+    fi
+RUN pip cache purge
+ENV LD_LIBRARY_PATH=/usr/local/tensorrt/lib/:/opt/tritonserver/backends/tensorrtllm:$LD_LIBRARY_PATH
+"""
 
     if "vllm" in backends:
         # [DLIS-5606] Build Conda environment for vLLM backend
         # Remove Pip install once vLLM backend moves to Conda environment.
@@ -1790,6 +1822,12 @@ def core_build(
 
 
 def tensorrtllm_prebuild(cmake_script):
     # Export the TRT_ROOT environment variable
+    cmake_script.cmd("export TRT_VER=9.2.0.4")
+    cmake_script.cmd("export CUDA_VER=12.3")
+    cmake_script.cmd("export CUDNN_VER=8.9.6.50-1+cuda12.2")
+    cmake_script.cmd("export NCCL_VER=2.19.3-1+cuda12.3")
+    cmake_script.cmd("export CUBLAS_VER=12.3.2.9-1")
+    cmake_script.cmd("export TRT_ROOT=/usr/local/tensorrt")
     cmake_script.cmd("export ARCH=$(uname -m)")
 
@@ -1820,10 +1858,18 @@ def backend_build(
         cmake_script.comment()
         cmake_script.mkdir(build_dir)
         cmake_script.cwd(build_dir)
-        cmake_script.gitclone(backend_repo(be), tag, be, github_organization)
 
         if be == "tensorrtllm":
+            cmake_script.cmd(
+                "git clone --single-branch --depth=1 -b {} https://{}:{}@gitlab-master.nvidia.com/ftp/tekit_backend.git tensorrtllm".format(
+                    tag,
+                    os.environ["REMOVE_ME_TRTLLM_USERNAME"],
+                    os.environ["REMOVE_ME_TRTLLM_TOKEN"],
+                )
+            )
             tensorrtllm_prebuild(cmake_script)
+        else:
+            cmake_script.gitclone(backend_repo(be), tag, be, github_organization)
 
         cmake_script.mkdir(repo_build_dir)
         cmake_script.cwd(repo_build_dir)