Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

TRT-LLM backend build changes #6406

Merged
merged 12 commits into from
Oct 13, 2023
107 changes: 76 additions & 31 deletions build.py
Original file line number Diff line number Diff line change
@@ -78,8 +78,6 @@
"2023.0.0", # Standalone OpenVINO
"2.4.7", # DCGM version
"py310_23.1.0-1", # Conda version
"9.1.0.1", # TRT version for building TRT-LLM backend
"12.2", # CUDA version for building TRT-LLM backend
"0.2.0", # vLLM version
)
}
@@ -884,19 +882,8 @@ def tensorrtllm_cmake_args(images):
None,
images["base"],
),
cmake_backend_arg(
"tensorrtllm",
"TENSORRT_VERSION",
None,
TRITON_VERSION_MAP[FLAGS.version][7],
),
cmake_backend_arg(
"tensorrtllm",
"CUDA_VERSION",
None,
TRITON_VERSION_MAP[FLAGS.version][8],
),
]
cargs.append(cmake_backend_enable("tensorrtllm", "TRITON_BUILD", True))
return cargs


@@ -1315,32 +1302,75 @@ def dockerfile_prepare_container_linux(argmap, backends, enable_gpu, target_mach
pip3 install --upgrade numpy && \
rm -rf /var/lib/apt/lists/*
"""
# FIXME: Use the postbuild script here
# Add dependencies needed for tensorrtllm backend
if "tensorrtllm" in backends:
be = "tensorrtllm"
# FIXME: Update the url
url = "https://gitlab-master.nvidia.com/krish/tensorrtllm_backend/-/raw/{}/tools/gen_trtllm_dockerfile.py".format(
backends[be]
)
# # FIXME: Update the url
# url = "https://gitlab-master.nvidia.com/ftp/tekit_backend/-/raw/{}/tools/gen_trtllm_dockerfile.py".format(
# backends[be]
# )

# response = requests.get(url)
# spec = importlib.util.spec_from_loader(
# "trtllm_buildscript", loader=None, origin=url
# )
# trtllm_buildscript = importlib.util.module_from_spec(spec)
# exec(response.content, trtllm_buildscript.__dict__)
# df += trtllm_buildscript.create_postbuild(
# backends[be] # repo tag
# )
df += """
WORKDIR /workspace

response = requests.get(url)
spec = importlib.util.spec_from_loader(
"trtllm_buildscript", loader=None, origin=url
)
trtllm_buildscript = importlib.util.module_from_spec(spec)
exec(response.content, trtllm_buildscript.__dict__)
df += trtllm_buildscript.create_postbuild(
argmap["TRT_LLM_TRT_VERSION"], argmap["TRT_LLM_CUDA_VERSION"]
# Remove previous TRT installation
RUN apt-get remove --purge -y tensorrt* libnvinfer*
RUN pip uninstall -y tensorrt

# Install new version of TRT using the script from TRT-LLM
RUN apt-get update && apt-get install -y --no-install-recommends python-is-python3
RUN git clone --single-branch --depth=1 -b {} https://{}:{}@gitlab-master.nvidia.com/ftp/tekit_backend.git tensorrtllm_backend
RUN cd tensorrtllm_backend && git submodule update --init --recursive
RUN cp tensorrtllm_backend/tensorrt_llm/docker/common/install_tensorrt.sh /tmp/
RUN rm -fr tensorrtllm_backend
""".format(
backends[be],
os.environ["REMOVE_ME_TRTLLM_USERNAME"],
os.environ["REMOVE_ME_TRTLLM_TOKEN"],
)

df += """
RUN bash /tmp/install_tensorrt.sh && rm /tmp/install_tensorrt.sh
ENV TRT_ROOT=/usr/local/tensorrt

# Remove TRT contents that are not needed in runtime
RUN ARCH="$(uname -i)" && \
rm -fr ${TRT_ROOT}/bin ${TRT_ROOT}/targets/${ARCH}-linux-gnu/bin ${TRT_ROOT}/data && \
rm -fr ${TRT_ROOT}/doc ${TRT_ROOT}/onnx_graphsurgeon ${TRT_ROOT}/python && \
rm -fr ${TRT_ROOT}/samples ${TRT_ROOT}/targets/${ARCH}-linux-gnu/samples

# Install required packages for TRT-LLM models
RUN python3 -m pip install --upgrade pip && \
pip3 install transformers && \
pip3 install torch

# Uninstall unused nvidia packages
RUN if pip freeze | grep -q "nvidia.*"; then \
pip freeze | grep "nvidia.*" | xargs pip uninstall -y; \
fi
RUN pip cache purge

ENV LD_LIBRARY_PATH=/usr/local/tensorrt/lib/:/opt/tritonserver/backends/tensorrtllm:$LD_LIBRARY_PATH
"""

if "vllm" in backends:
# [DLIS-5606] Build Conda environment for vLLM backend
# Remove Pip install once vLLM backend moves to Conda environment.
df += """
# vLLM needed for vLLM backend
RUN pip3 install vllm=={}
""".format(
TRITON_VERSION_MAP[FLAGS.version][9]
TRITON_VERSION_MAP[FLAGS.version][7]
)

df += """
@@ -1505,8 +1535,6 @@ def create_build_dockerfiles(
if FLAGS.version is None or FLAGS.version not in TRITON_VERSION_MAP
else TRITON_VERSION_MAP[FLAGS.version][6],
}
dockerfileargmap["TRT_LLM_TRT_VERSION"] = TRITON_VERSION_MAP[FLAGS.version][7]
dockerfileargmap["TRT_LLM_CUDA_VERSION"] = TRITON_VERSION_MAP[FLAGS.version][8]

# For CPU-only image we need to copy some cuda libraries and dependencies
# since we are using PyTorch and TensorFlow containers that
@@ -1797,6 +1825,16 @@ def tensorrtllm_prebuild(cmake_script):
cmake_script.cmd("export TRT_ROOT=/usr/local/tensorrt")
cmake_script.cmd("export ARCH=$(uname -m)")

# FIXME: Update the file structure to the one Triton expects. This is a temporary
# fix to get the build working for r23.10.
# Uncomment the patch below once the backend moves to the GitHub repo.
# cmake_script.cmd(
# "patch tensorrtllm/inflight_batcher_llm/CMakeLists.txt < tensorrtllm/inflight_batcher_llm/CMakeLists.txt.patch"
# )
cmake_script.cmd("mv tensorrtllm/inflight_batcher_llm/src tensorrtllm")
cmake_script.cmd("mv tensorrtllm/inflight_batcher_llm/cmake tensorrtllm")
cmake_script.cmd("mv tensorrtllm/inflight_batcher_llm/CMakeLists.txt tensorrtllm")


def backend_build(
be,
@@ -1820,8 +1858,15 @@ def backend_build(
cmake_script.cwd(build_dir)
# FIXME: Use GitHub repo
if be == "tensorrtllm":
cmake_script.gitclone(
backend_repo(be), tag, be, "https://gitlab-master.nvidia.com/krish"
# cmake_script.gitclone(
# backend_repo("tekit"), tag, be, "https://gitlab-master.nvidia.com/ftp"
# )
cmake_script.cmd(
"git clone --single-branch --depth=1 -b {} https://{}:{}@gitlab-master.nvidia.com/ftp/tekit_backend.git tensorrtllm".format(
tag,
os.environ["REMOVE_ME_TRTLLM_USERNAME"],
os.environ["REMOVE_ME_TRTLLM_TOKEN"],
)
)
else:
cmake_script.gitclone(backend_repo(be), tag, be, github_organization)