From e460b60695981102e6c0786ba55bf9875b7ff402 Mon Sep 17 00:00:00 2001 From: lvhan028 Date: Wed, 10 Jul 2024 19:59:47 +0800 Subject: [PATCH 01/14] remove cutlass to BUILD_TEST --- CMakeLists.txt | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 8a2026c6ee..173a689f47 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -43,20 +43,21 @@ option(BUILD_PY_FFI "Build python ffi" ON) option(BUILD_TEST "Build tests" OFF) include(FetchContent) +if (BUILD_TEST) + FetchContent_Declare( + repo-cutlass + GIT_REPOSITORY https://github.com/NVIDIA/cutlass.git + GIT_TAG 6f47420213f757831fae65c686aa471749fa8d60 + GIT_SHALLOW ON + ) -FetchContent_Declare( - repo-cutlass - GIT_REPOSITORY https://github.com/NVIDIA/cutlass.git - GIT_TAG 6f47420213f757831fae65c686aa471749fa8d60 - GIT_SHALLOW ON -) - -set(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library") + set(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library") -FetchContent_MakeAvailable(repo-cutlass) + FetchContent_MakeAvailable(repo-cutlass) -set(CUTLASS_HEADER_DIR ${PROJECT_SOURCE_DIR}/3rdparty/cutlass/include) -set(CUTLASS_EXTENSIONS_DIR ${PROJECT_SOURCE_DIR}/src/turbomind/cutlass_extensions/include) + set(CUTLASS_HEADER_DIR ${PROJECT_SOURCE_DIR}/3rdparty/cutlass/include) + set(CUTLASS_EXTENSIONS_DIR ${PROJECT_SOURCE_DIR}/src/turbomind/cutlass_extensions/include) +endif() option(SPARSITY_SUPPORT "Build project with Ampere sparsity feature support" OFF) From cb5b6d2c67b7a8d47e4ccb5c49661e97700c0333 Mon Sep 17 00:00:00 2001 From: lvhan028 Date: Wed, 10 Jul 2024 20:21:58 +0800 Subject: [PATCH 02/14] remove turbomind_backend --- src/turbomind/triton_backend/CMakeLists.txt | 247 -------------------- 1 file changed, 247 deletions(-) diff --git a/src/turbomind/triton_backend/CMakeLists.txt b/src/turbomind/triton_backend/CMakeLists.txt index 7aa96e6ca5..4311d9d9be 100644 --- a/src/turbomind/triton_backend/CMakeLists.txt +++ b/src/turbomind/triton_backend/CMakeLists.txt @@ -34,250 +34,3 @@ set_property(TARGET TransformerTritonBackend PROPERTY POSITION_INDEPENDENT_CODE install(TARGETS TransformerTritonBackend DESTINATION ${CMAKE_INSTALL_LIBDIR}) add_subdirectory(llama) - -# Needn't build triton backend on windows -if (MSVC) - return () -endif() - -# -# Options -# -option(TRITON_ENABLE_GPU "Enable GPU support in backend" ON) -option(TRITON_ENABLE_STATS "Include statistics collections in backend" ON) - -set(TRITON_PYTORCH_INCLUDE_PATHS "" CACHE PATH "Paths to Torch includes") -set(TRITON_PYTORCH_LIB_PATHS "" CACHE PATH "Paths to Torch libraries") - -set(TRITON_BACKEND_REPO_TAG "r22.12" CACHE STRING "Tag for triton-inference-server/backend repo") -set(TRITON_CORE_REPO_TAG "r22.12" CACHE STRING "Tag for triton-inference-server/core repo") -set(TRITON_COMMON_REPO_TAG "r22.12" CACHE STRING "Tag for triton-inference-server/common repo") - -if(NOT CMAKE_BUILD_TYPE) - set(CMAKE_BUILD_TYPE Release) -endif() - -set(USE_TRITONSERVER_DATATYPE "ON") -message("-- Enable USE_TRITONSERVER_DATATYPE") - -# -# Dependencies -# -# FetchContent's composability isn't very good. We must include the -# transitive closure of all repos so that we can override the tag. 
-# -include(FetchContent) - -FetchContent_Declare( - repo-common - GIT_REPOSITORY https://github.com/triton-inference-server/common.git - GIT_TAG ${TRITON_COMMON_REPO_TAG} - GIT_SHALLOW ON -) -FetchContent_Declare( - repo-core - GIT_REPOSITORY https://github.com/triton-inference-server/core.git - GIT_TAG ${TRITON_CORE_REPO_TAG} - GIT_SHALLOW ON -) -FetchContent_Declare( - repo-backend - GIT_REPOSITORY https://github.com/triton-inference-server/backend.git - GIT_TAG ${TRITON_BACKEND_REPO_TAG} - GIT_SHALLOW ON -) -FetchContent_MakeAvailable(repo-common repo-core repo-backend) - -# -# CUDA -# -if(${TRITON_ENABLE_GPU}) - find_package(CUDAToolkit REQUIRED) -endif() # TRITON_ENABLE_GPU - -# -# Shared library implementing the Triton Backend API -# -configure_file(libtriton_fastertransformer.ldscript libtriton_fastertransformer.ldscript COPYONLY) - -add_library( - triton-turbomind-backend SHARED - libfastertransformer.cc -) - -add_library( - TritonTurboMindBackend::triton-turbomind-backend ALIAS triton-turbomind-backend -) - -find_package(CUDAToolkit REQUIRED) -find_package(CUDA 10.1 REQUIRED) -if (${CUDA_VERSION} GREATER_EQUAL 11.0) - message(STATUS "Add DCUDA11_MODE") - add_definitions("-DCUDA11_MODE") -endif() - -set(CUDA_PATH ${CUDA_TOOLKIT_ROOT_DIR}) - -target_compile_definitions(triton-turbomind-backend PUBLIC - USE_TRITONSERVER_DATATYPE) - -if (BUILD_MULTI_GPU) - target_compile_definitions(triton-turbomind-backend PUBLIC - BUILD_MULTI_GPU) -endif () - -target_include_directories( - triton-turbomind-backend - PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/src - ${TRITON_PYTORCH_INCLUDE_PATHS} - ${Python3_INCLUDE_DIRS} - ${repo-ft_SOURCE_DIR} - ${repo-ft_SOURCE_DIR}/3rdparty/cutlass/include - ${repo-core_SOURCE_DIR}/include - ) - -target_link_directories( - triton-turbomind-backend - PRIVATE - ${CUDA_PATH}/lib64 - ) - -target_compile_features(triton-turbomind-backend PRIVATE cxx_std_14) - -target_compile_options( - triton-turbomind-backend PRIVATE - $<$,$,$>: - -Wall -Wextra -Wno-unused-parameter -Wno-type-limits >#-Werror> -) - -if(${TRITON_ENABLE_GPU}) - target_compile_definitions( - triton-turbomind-backend - PRIVATE TRITON_ENABLE_GPU=1 - ) -endif() # TRITON_ENABLE_GPU - -set_target_properties( - triton-turbomind-backend - PROPERTIES - POSITION_INDEPENDENT_CODE ON - OUTPUT_NAME triton_turbomind - SKIP_BUILD_RPATH TRUE - BUILD_WITH_INSTALL_RPATH TRUE - INSTALL_RPATH_USE_LINK_PATH FALSE - INSTALL_RPATH "$\{ORIGIN\}" - LINK_DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/libtriton_fastertransformer.ldscript - LINK_FLAGS "-Wl,--no-as-needed,--version-script ${CMAKE_CURRENT_BINARY_DIR}/libtriton_fastertransformer.ldscript" -) - -# Need to turn off unused-but-set-variable due to Torchvision -# Need to turn off unknown-pragmas due to ATen OpenMP -set_target_properties( - triton-turbomind-backend - PROPERTIES COMPILE_FLAGS - "-Wno-unknown-pragmas -Wno-unused-but-set-variable" -) - -set(TRITON_PYTORCH_LDFLAGS "") -FOREACH(p ${TRITON_PYTORCH_LIB_PATHS}) - set(TRITON_PYTORCH_LDFLAGS ${TRITON_PYTORCH_LDFLAGS} "-L${p}") -ENDFOREACH(p) - -target_link_libraries( - triton-turbomind-backend - PRIVATE - triton-core-serverapi # from repo-core - triton-core-backendapi # from repo-core - triton-core-serverstub # from repo-core - triton-backend-utils # from repo-backend - transformer-shared # from repo-ft - ${TRITON_PYTORCH_LDFLAGS} - -lcublas - -lcublasLt - -lcudart - -lcurand -) - -if (BUILD_MULTI_GPU) - target_compile_definitions( - triton-turbomind-backend - PUBLIC - BUILD_MULTI_GPU - ) - 
target_include_directories( - triton-turbomind-backend - PRIVATE - ${MPI_INCLUDE_PATH} - ) - target_link_directories( - triton-turbomind-backend - PRIVATE - ${MPI_Libraries} - /usr/local/mpi/lib - ) - target_link_libraries( - triton-turbomind-backend - PRIVATE - ${NCCL_LIBRARIES} - ${MPI_LIBRARIES} - ) -endif() - -if(${TRITON_ENABLE_GPU}) - target_link_libraries( - triton-turbomind-backend - PRIVATE - CUDA::cudart - ) -endif() # TRITON_ENABLE_GPU - -# -# Install -# -include(GNUInstallDirs) -set(INSTALL_CONFIGDIR ${CMAKE_INSTALL_LIBDIR}/cmake/TurboMindBackend) - -install( - TARGETS - triton-turbomind-backend - EXPORT - triton-turbomind-backend-targets - LIBRARY DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/turbomind - ARCHIVE DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/turbomind -) - -install( - EXPORT - triton-turbomind-backend-targets - FILE - TritonTurboMindBackendTargets.cmake - NAMESPACE - TritonTurboMindBackend:: - DESTINATION - ${INSTALL_CONFIGDIR} -) - -include(CMakePackageConfigHelpers) -configure_package_config_file( - ${CMAKE_SOURCE_DIR}/cmake/TritonTurboMindBackendConfig.cmake.in - ${CMAKE_CURRENT_BINARY_DIR}/TritonTurboMindBackendConfig.cmake - INSTALL_DESTINATION ${INSTALL_CONFIGDIR} -) - -install( - FILES - ${CMAKE_CURRENT_BINARY_DIR}/TritonTurboMindBackendConfig.cmake - DESTINATION ${INSTALL_CONFIGDIR} -) - -# -# Export from build tree -# -export( - EXPORT triton-turbomind-backend-targets - FILE ${CMAKE_CURRENT_BINARY_DIR}/TritonTurboMindBackendTargets.cmake - NAMESPACE TritonTurboMindBackend:: -) - -export(PACKAGE TritonTurboMindBackend) From 4fbd39e90f4e8d758185d67cc9eae00c5ea04329 Mon Sep 17 00:00:00 2001 From: lvhan028 Date: Wed, 10 Jul 2024 20:22:55 +0800 Subject: [PATCH 03/14] get_hf_config_content --- docs/en/benchmark/profile_triton_server.md | 58 ------------------ docs/en/index.rst | 1 - docs/zh_cn/benchmark/profile_triton_server.md | 61 ------------------- docs/zh_cn/index.rst | 1 - 4 files changed, 121 deletions(-) delete mode 100644 docs/en/benchmark/profile_triton_server.md delete mode 100644 docs/zh_cn/benchmark/profile_triton_server.md diff --git a/docs/en/benchmark/profile_triton_server.md b/docs/en/benchmark/profile_triton_server.md deleted file mode 100644 index 1af1f5a074..0000000000 --- a/docs/en/benchmark/profile_triton_server.md +++ /dev/null @@ -1,58 +0,0 @@ -# Profile Triton Inference Server - -Triton Inference Server (TIS) is another serving method supported by LMDeploy besides `api_server`. Its performance testing methods and metrics are similar to those of [api_server](./profile_api_server.md). - -The profiling script is `profile_serving.py`. Before running it, please install the lmdeploy precompiled package, download the profiling script and the test dataset: - -```shell -pip install 'lmdeploy[serve]' -git clone --depth=1 https://github.com/InternLM/lmdeploy -cd lmdeploy/benchmark -wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json -``` - -## Metrics - -LMDeploy records the performance metrics like first token latency, token throughput (tokens/s) and request throughput (RPM) - -`first_token_latency` is only reported in the case of streaming inference. - -The formula for calculating `token throughput` is: - -$$ -TokenThroughput=Number\\ of\\ generated\\ tokens/TotalTime -$$ - -And the formula for calculating `request throughput` is: - -$$ -RPM(request\\ per\\ minute)=Number\\ of\\ prompts/TotalTime * 60 -$$ - -Total time includes prefill time. 
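(Editor's note, not part of the original patch: the two formulas in the deleted benchmark doc above reduce to a few lines of arithmetic. The sketch below is a minimal illustration of how a profiling script in the style of the removed `profile_serving.py` could derive these numbers; the function and argument names are hypothetical and are not taken from the repository.)

```python
def throughput_metrics(num_generated_tokens: int, num_prompts: int, total_time_s: float):
    """Compute the metrics defined in the doc above.

    total_time_s must include prefill time, matching the doc's definition.
    """
    token_throughput = num_generated_tokens / total_time_s  # tokens/s
    rpm = num_prompts / total_time_s * 60                   # requests per minute
    return token_throughput, rpm

# e.g. 120_000 generated tokens for 1000 prompts in 300 s
# -> 400.0 tokens/s and 200.0 RPM
print(throughput_metrics(120_000, 1000, 300.0))
```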
- -## Profile - -In this section, we take [internlm/internlm-7b](https://huggingface.co/internlm/internlm-7b) as an example to show the benchmark procedure. - -### Launch triton inference server - -Before launching the server, the LLM model must be converted to the turbomind format in advance. - -```shell -lmdeploy convert internlm internlm/internlm-7b --dst-path ./internlm-7b --trust-remote-code -``` - -Then, the triton inference server can be launched by: - -```shell -bash ./internlm-7b/service_docker_up.sh -``` - -### Profile - -```shell -python3 profile_serving.py 0.0.0.0:33337 ./internlm-7b/triton_models/tokenizer ./ShareGPT_V3_unfiltered_cleaned_split.json -``` - -For detailed argument specification of `profile_serving.py`, such as request concurrency, sampling parameters an so on, please run the help command `python3 profile_serving.py -h`. diff --git a/docs/en/index.rst b/docs/en/index.rst index 62e535fddb..3403b07857 100644 --- a/docs/en/index.rst +++ b/docs/en/index.rst @@ -57,7 +57,6 @@ Documentation benchmark/profile_generation.md benchmark/profile_throughput.md benchmark/profile_api_server.md - benchmark/profile_triton_server.md benchmark/evaluate_with_opencompass.md .. _supported_models: diff --git a/docs/zh_cn/benchmark/profile_triton_server.md b/docs/zh_cn/benchmark/profile_triton_server.md deleted file mode 100644 index f17a2964bd..0000000000 --- a/docs/zh_cn/benchmark/profile_triton_server.md +++ /dev/null @@ -1,61 +0,0 @@ -# Triton Inference Server 性能测试 - -Triton Inference Server(TIS) 是 LMDeploy 支持的除了 api_server 之外的另一种 serving 方式。它的性能测试方式和测试指标和 [api_server](./profile_api_server.md) 的测试方式类似。 - -```{note} -LMDeploy 尚未实现 Triton Inference Server 的 ensemble 推理模式,所以推理性能要比 api_server 弱。对于追求性能的用户,我们推荐使用 api_server 部署服务。 -``` - -TIS 性能测试脚本是 `profile_serving.py`。测试之前,请安装 lmdeploy 预编译包,并下载评测脚本和测试数据集。 - -```shell -pip install 'lmdeploy[serve]' -git clone --depth=1 https://github.com/InternLM/lmdeploy -wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json -``` - -## 测量指标 - -LMDeploy 统计首token延时(first_token_latency)、token吞吐量(tokens/s)和请求吞吐量(RPM)。 - -`first_token_latency` 只有在流式推理的情况下才会输出。 - -token吞吐量的计算公式为: - -$$ -吞吐量 = 生成的token数量 / 总时间 -$$ - -请求吞吐量的计算公式为: - -$$ -吞吐量 = 请求数量 / 总时间 -$$ - -总时间包括 prefill 时间 - -## 测量方法 - -我们以 [internlm/internlm-7b](https://huggingface.co/internlm/internlm-7b) 为例,展示 triton inference server 的性能测试流程 - -### 启动服务 - -启动服务之前,必须先把模型转换为 turbomind 模型格式: - -```shell -lmdeploy convert internlm internlm/internlm-7b --dst-path ./internlm-7b --trust-remote-code -``` - -然后,执行如下命令,启动服务: - -```shell -bash ./internlm-7b/service_docker_up.sh -``` - -### 测速 - -```shell -python3 profile_serving.py 0.0.0.0:33337 ./internlm-7b/triton_models/tokenizer ./ShareGPT_V3_unfiltered_cleaned_split.json -``` - -关于 `profile_serving.py` 脚本中的参数,比如请求并发数、采样参数等等,可以通过运行命令 `python3 profile_serving.py -h` 查阅。 diff --git a/docs/zh_cn/index.rst b/docs/zh_cn/index.rst index 16e2e6179e..f64b0f196b 100644 --- a/docs/zh_cn/index.rst +++ b/docs/zh_cn/index.rst @@ -57,7 +57,6 @@ LMDeploy 工具箱提供以下核心功能: benchmark/profile_generation.md benchmark/profile_throughput.md benchmark/profile_api_server.md - benchmark/profile_triton_server.md benchmark/evaluate_with_opencompass.md .. 
_支持的模型: From 0d520fb4d56e7d7ffe6a40a9b026bf53e5805ff9 Mon Sep 17 00:00:00 2001 From: lvhan028 Date: Thu, 11 Jul 2024 14:02:22 +0800 Subject: [PATCH 04/14] remove profile_serving.py and libfastertransformer --- benchmark/profile_serving.py | 260 --- docs/en/inference/turbomind.md | 2 - docs/zh_cn/inference/turbomind.md | 2 - .../triton_backend/libfastertransformer.cc | 1914 ----------------- .../libtriton_fastertransformer.ldscript | 30 - 5 files changed, 2208 deletions(-) delete mode 100644 benchmark/profile_serving.py delete mode 100644 src/turbomind/triton_backend/libfastertransformer.cc delete mode 100644 src/turbomind/triton_backend/libtriton_fastertransformer.ldscript diff --git a/benchmark/profile_serving.py b/benchmark/profile_serving.py deleted file mode 100644 index 60058d290c..0000000000 --- a/benchmark/profile_serving.py +++ /dev/null @@ -1,260 +0,0 @@ -import csv -import json -import random -import time -from queue import Queue -from threading import Thread -from typing import List, Tuple - -import fire -import numpy as np -from tqdm import tqdm - -from lmdeploy.serve.turbomind.chatbot import Chatbot -from lmdeploy.tokenizer import Tokenizer - - -def sample_requests( - dataset_path: str, - num_requests: int, - tokenizer: Tokenizer, -) -> List[Tuple[str, int, int]]: - # Load the dataset. - with open(dataset_path) as f: - dataset = json.load(f) - # Filter out the conversations with less than 2 turns. - dataset = [data for data in dataset if len(data['conversations']) >= 2] - # Only keep the first two turns of each conversation. - dataset = [(data['conversations'][0]['value'], - data['conversations'][1]['value']) for data in dataset] - - # pre-sample to avoid go through all the dataset - dataset = random.sample(dataset, max(int(num_requests * 1.2), 1000)) - - # Tokenize the prompts and completions. - prompts = [prompt for prompt, _ in dataset] - prompt_token_ids = tokenizer(prompts).input_ids - completions = [completion for _, completion in dataset] - completion_token_ids = tokenizer(completions).input_ids - tokenized_dataset = [] - for i in range(len(dataset)): - output_len = len(completion_token_ids[i]) - tokenized_dataset.append((prompts[i], prompt_token_ids[i], output_len)) - - # Filter out too long sequences. - filtered_dataset: List[Tuple[str, int, int]] = [] - for prompt, prompt_token_ids, output_len in tokenized_dataset: - prompt_len = len(prompt_token_ids) - if prompt_len < 4 or output_len < 4: - # Prune too short sequences. - continue - if prompt_len > 1024 or prompt_len + output_len > 2048: - # Prune too long sequences. - continue - filtered_dataset.append((prompt, prompt_len, output_len)) - - # Sample the requests. 
- sampled_requests = random.sample(filtered_dataset, num_requests) - return sampled_requests - - -class Engine: - - def __init__(self, - server_addr: str, - tokenzier_path: str, - temperature: float = 0.8, - top_k: int = 1, - top_p: float = 1.0, - csv: str = '', - log_level: str = 'ERROR', - **kwargs): - self.server_addr = server_addr - self.tokenizer = Tokenizer(tokenzier_path) - self.temperature = temperature - self.top_k = top_k - self.top_p = top_p - self.csv = csv - self.log_level = log_level - self.pbar = None - - def _inference(self, req_queue: Queue, res_queue: Queue, session_id: int, - stream_output: bool): - - chatbot = Chatbot(self.server_addr, - ignore_eos=True, - top_k=self.top_k, - top_p=self.top_p, - temperature=self.temperature, - capability='completion', - log_level=self.log_level) - stats = [] - for prompt, input_seqlen, output_seqlen in iter( - req_queue.get, [None, None, None]): - timestamps = [] - tokens = [] - timestamps.append(time.perf_counter()) - for _, _, n_token in chatbot.stream_infer( - session_id, - prompt, - request_output_len=output_seqlen, - sequence_start=True, - sequence_end=True): - timestamps.append(time.perf_counter()) - tokens.append(n_token) - first_token_latency = np.round(timestamps[1] - timestamps[0], 3) - token_latency = np.round(timestamps[-1] - timestamps[0], 3) - completion_tokens = tokens[-1] - assert output_seqlen <= completion_tokens <= output_seqlen + 1, \ - f'Error. session_id({session_id}) request {output_seqlen} ' \ - f'tokens, but generate {completion_tokens} tokens.\n' \ - f'prompt: {prompt}' - total_tokens = tokens[-1] + input_seqlen - stats.append([ - first_token_latency, completion_tokens, output_seqlen, - total_tokens, token_latency - ]) - self.pbar.update(1) - res_queue.put((session_id, stats)) - - def process_request(self, - requests, - concurrency: int = 1, - stream_output: bool = True): - res_queue = Queue() - req_queue = Queue() - threads = [] - - self.pbar = tqdm(total=len(requests)) - - # feed request to q - for req in requests: - req_queue.put(req) - for i in range(concurrency): - req_queue.put([None, None, None]) - - start = time.time() - - # start threads - for i in range(concurrency): - t = Thread(target=self._inference, - args=(req_queue, res_queue, i, stream_output)) - t.start() - threads.append(t) - - # wait for finish - for t in threads: - t.join() - - elapsed_time = time.time() - start - - stats = [] - while not res_queue.empty(): - session_id, _stats = res_queue.get() - # print(f'\n{"-" * 50}\n' - # f'session {session_id} stats: \n{_stats}\n{"-" * 50}\n') - if len(_stats) != 0: - stats.append(np.array(_stats)) - - stats = np.concatenate(stats).reshape(-1, 5) - - first_token_latency_min = np.min(stats[:, 0], axis=0) - first_token_latency_max = np.max(stats[:, 0], axis=0) - first_token_latency_ave = np.mean(stats[:, 0], axis=0) - completion_tokens = np.sum(stats[:, 1], axis=0) - request_output_tokens = np.sum(stats[:, 2], axis=0) - total_tokens = np.sum(stats[:, 3], axis=0) - prompt_tokens = total_tokens - completion_tokens - completion_token_throughput = completion_tokens / elapsed_time - total_token_throughput = total_tokens / elapsed_time - rps = len(requests) / elapsed_time - rpm = rps * 60 - - if (np.abs(stats[:, 1] - stats[:, 2]) <= 1).min() is False: - print(f'Did not generate requested number of tokens. 
' - f'Request {request_output_tokens:.0f}, ' - f'but got {completion_tokens:.0f}') - - print(f'\n{"-" * 50}\nconcurrency: {concurrency}\n' - f'elapsed_time: {elapsed_time:.3f}s\n') - if stream_output: - print(f'first_token latency(min, max, ave): ' - f'{first_token_latency_min:.3f}s, ' - f'{first_token_latency_max:.3f}s, ' - f'{first_token_latency_ave:.3f}s\n') - print( - f'number of prompt tokens: {prompt_tokens:.0f}\n' - f'number of completion tokens: {completion_tokens:.0f}\n' - f'token throughput (completion token): {completion_token_throughput:.3f} token/s\n' # noqa - f'token throughput (prompt + completion token): {total_token_throughput:.3f} token/s\n' # noqa - f'RPS (request per second): {rps:.3f} req/s\n' - f'RPM (request per minute): {rpm:.3f} req/min\n' - f'{"-" * 50}\n') - - if self.csv: - with open(self.csv, 'w') as csvfile: - writer = csv.writer(csvfile) - writer.writerow([ - 'batch', 'num_prompts', 'RPS', 'RPM', 'FTL(ave)(s)', - 'FTL(min)(s)', 'FTL(max)(s)', 'throughput(out tok/s)', - 'throughput(total tok/s)' - ]) - writer.writerow([ - concurrency, - len(requests), f'{rps:.3f}', f'{rpm:.3f}', - f'{first_token_latency_ave:.3f}' if stream_output else '-', - f'{first_token_latency_min:.3f}' if stream_output else '-', - f'{first_token_latency_max:.3f}' if stream_output else '-', - f'{completion_token_throughput:.3f}', - f'{total_token_throughput:.3f}' - ]) - - -def main(server_addr: str, - tokenizer_path: str, - dataset: str, - concurrency: int = 32, - num_prompts: int = 1000, - top_k: int = 1, - top_p: float = 1.0, - temperature: float = 1.0, - stream_output: bool = True, - csv: str = './profile_tis.csv', - seed: int = 0): - """Benchmark the request througput of the triton inference server. - - Args: - server_addr (str): Address of the triton inference server with format 0.0.0.0:0 - tokenizer_path (str): Path to the tokenizer model in localhost - dataset (str): Path to the dataset - concurrency (int, optional): Number of working threads to process the sampled prompts. - Defaults to 32. - num_prompts (int, optional): Number of prompts to process. Defaults to 1000. - top_k (int, optional): The number of highest probability vocabulary tokens - to keep for top-k-filtering. Defaults to 1. - top_p (float, optional): the set of most probable tokens with - probabilities that add up to top_p or higher - are kept for generation. Defaults to 1.0. - temperature (float, optional): The value used to modulate the next token probabilities. - Defaults to 1.0. - stream_output (bool, optional): Indicator for streaming output. Defaults to True. - seed (int, optional): Seed used in sampling prompts from dataset. Defaults to 0. - """ # noqa - - random.seed(seed) - - engine = Engine(server_addr, - tokenizer_path, - top_k=top_k, - top_p=top_p, - temperature=temperature, - log_level='ERROR', - csv=csv) - - requests = sample_requests(dataset, num_prompts, engine.tokenizer) - - engine.process_request(requests, concurrency, stream_output) - - -if __name__ == '__main__': - fire.Fire(main) diff --git a/docs/en/inference/turbomind.md b/docs/en/inference/turbomind.md index fc197f85db..2b438723d2 100644 --- a/docs/en/inference/turbomind.md +++ b/docs/en/inference/turbomind.md @@ -57,8 +57,6 @@ Our implementation of the LLaMa family models is modified from Gpt-NeoX model in TurboMind supports a Python API that enables streaming output and tensor parallel mode. -The ability to use [tritonserver](https://github.com/triton-inference-server/server) for serving is also inherited from FasterTransformer. 
However, to support submitting concurrent requests into our persistent batch model, we no longer use sequence batching or dynamic batching as FasterTransformer does. The bookkeeping of request and sequence states are managed by TurboMind instead. - ## Difference between FasterTransformer and TurboMind Apart of the features described above, there are still many minor differences that we don't cover in this document. Notably, many capabilities of FT are dropped in TurboMind because of the difference in objectives (e.g. prefix prompt, beam search, context embedding, sparse GEMM, GPT/T5/other model families, etc) diff --git a/docs/zh_cn/inference/turbomind.md b/docs/zh_cn/inference/turbomind.md index c179eaa1ae..78dba19aba 100644 --- a/docs/zh_cn/inference/turbomind.md +++ b/docs/zh_cn/inference/turbomind.md @@ -57,8 +57,6 @@ TurboMind 的 [KV 缓存管理器](https://github.com/InternLM/lmdeploy/blob/mai TurboMind 的 Python API 支持流式结果返回和张量并行模式。 -同时 TurboMind 也继承了 FasterTransformer 能够注册为 [Triton Inference Server](https://github.com/triton-inference-server/server) 推理后端的能力。但是为了支持 persistent batch 中的并发请求,我们不再像 FasterTransformer 那样使用 sequence batching 或者 dynamic batching 。相反,TurboMind 负责记录和管理请求序列的状态。 - ## TurboMind 和 FasterTransformer 的区别 除了上文中提到的功能外,TurboMind 相较于 FasterTransformer 还有不少差别。譬如不少 FasterTransformer 的功能在 TurboMind 中都被去掉了,这其中包括前缀提示词、 beam search 、上下文 embedding、稀疏化 GEMM 操作和对应 GPT 或 T5 等结构的模型的支持等等。 diff --git a/src/turbomind/triton_backend/libfastertransformer.cc b/src/turbomind/triton_backend/libfastertransformer.cc deleted file mode 100644 index dcdb598ec8..0000000000 --- a/src/turbomind/triton_backend/libfastertransformer.cc +++ /dev/null @@ -1,1914 +0,0 @@ -// Copyright (c) OpenMMLab. All rights reserved. -// Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions -// are met: -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// * Neither the name of NVIDIA CORPORATION nor the names of its -// contributors may be used to endorse or promote products derived -// from this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -// Modified from -// https://github.com/triton-inference-server/fastertransformer_backend/blob/main/src/libfastertransformer.cc - -#include - -#include -#include -#include -#include - -#pragma GCC diagnostic push -// #pragma GCC diagnostic ignored "-Wsign-compare" -#pragma GCC diagnostic ignored "-Wcast-function-type" -#pragma warning(push, 0) -#pragma warning(pop) -#pragma GCC diagnostic pop - -// must include triton libraries first -#include "triton/backend/backend_common.h" -#include "triton/backend/backend_input_collector.h" -#include "triton/backend/backend_memory.h" -#include "triton/backend/backend_model.h" -#include "triton/backend/backend_model_instance.h" -#include "triton/backend/backend_output_responder.h" -#include "triton/core/tritonbackend.h" - -// FT's libraries have dependency with triton's lib -#include "src/turbomind/macro.h" -#include "src/turbomind/triton_backend/llama/LlamaTritonModel.h" -#include "src/turbomind/triton_backend/llama/LlamaTritonModelInstance.h" -#include "src/turbomind/triton_backend/transformer_triton_backend.hpp" -#include "src/turbomind/utils/Tensor.h" -#include "src/turbomind/utils/cuda_bf16_wrapper.h" -#include "src/turbomind/utils/instance_comm.h" -#include "src/turbomind/utils/mpi_utils.h" -#include "src/turbomind/utils/nccl_utils.h" - -std::exception_ptr ptr[8]; - -namespace ft = turbomind; - -namespace triton { -namespace backend { -namespace turbomind_backend { - -#define RESPOND_ALL_AND_RETURN_IF_ERROR(RESPONSES, RESPONSES_COUNT, X) \ - do { \ - TRITONSERVER_Error* raarie_err__ = (X); \ - if (raarie_err__ != nullptr) { \ - SendErrorForResponses(RESPONSES, RESPONSES_COUNT, raarie_err__); \ - return; \ - } \ - } while (false) - -// Cuda Error handling -TRITONSERVER_Error* -ConvertCUDAStatusToTritonError(cudaError_t cuda_error, TRITONSERVER_Error_Code code, const char* msg) -{ - if (cuda_error != cudaSuccess) { - return TRITONSERVER_ErrorNew(code, cudaGetErrorString(cuda_error)); - } - return nullptr; // success -} - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -// Ragged Baching - -struct RaggedBatchingParams { - bool is_input_ragged = false; - int32_t max_seq_length = 0; - int32_t max_elements_per_seq = 0; - const int32_t* batch_input_ptr = nullptr; - size_t batch_intput_size = 0; - size_t total_input_elements = 0; -}; - -using RaggedBatchingParam_Map = std::unordered_map; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -// -// ModelState -// -// State associated with a model that is using this backend. An object -// of this class is created and associated with each -// TRITONBACKEND_Model. 
-// -class ModelState: public BackendModel { -public: - static TRITONSERVER_Error* Create(TRITONBACKEND_Model* triton_model, ModelState** state); - virtual ~ModelState() = default; - - TRITONSERVER_Error* LoadModel(const std::string& artifact_name, - const int32_t node_id, - const int32_t device_id, - const int32_t device_id_start, - const int32_t stream_id, - std::pair, std::vector>& nccl_params, - std::shared_ptr custom_all_reduce_comms, - std::string* model_path, - std::unique_ptr* ft_model_instance); - - int GetGpuSize() - { - return gpu_size; - }; - int GetWorldSize() - { - return world_size; - }; - int GetParallelSize() - { - return tp_pp_size; - }; - int GetInstanceId() - { - return current_model_instance_id++; - }; - int GetInstanceGroupCount() - { - return instance_group_count; - }; - bool SequenceBatchingEnabled() - { - return sequence_batching_enabled; - }; - bool DynamicBatchingEnabled() - { - return dynamic_batching_enabled; - }; - std::shared_ptr GetFtModel() - { - return ft_model; - }; - -private: - ModelState(TRITONBACKEND_Model* triton_model); - TRITONSERVER_Error* AutoCompleteConfig(); - std::string GetParameter(const char* parameter); - int current_model_instance_id = 0; - bool sequence_batching_enabled = false; - bool dynamic_batching_enabled = false; - int instance_group_count = 1; - std::shared_ptr ft_model; - int node_id, gpu_size, world_size, tp_pp_size; - std::vector streams_; - - std::shared_ptr ModelFactory(common::TritonJson::Value& param, - const std::string& model_filename); -}; - -TRITONSERVER_Error* ModelState::Create(TRITONBACKEND_Model* triton_model, ModelState** state) -{ - try { - *state = new ModelState(triton_model); - } - catch (const BackendModelException& ex) { - RETURN_ERROR_IF_TRUE(ex.err_ == nullptr, - TRITONSERVER_ERROR_INTERNAL, - std::string("unexpected nullptr in BackendModelException")); - RETURN_IF_ERROR(ex.err_); - } - - // Auto-complete the configuration if requested, or T5-Encoder - bool auto_complete_config = false; - RETURN_IF_ERROR(TRITONBACKEND_ModelAutoCompleteConfig(triton_model, &auto_complete_config)); - auto_complete_config |= - (*state)->GetParameter("model_type") == "T5-Encoder" || (*state)->GetParameter("model_type") == "bert"; - if (auto_complete_config) { - RETURN_IF_ERROR((*state)->AutoCompleteConfig()); - - triton::common::TritonJson::WriteBuffer json_buffer; - (*state)->ModelConfig().Write(&json_buffer); - - TRITONSERVER_Message* message; - RETURN_IF_ERROR(TRITONSERVER_MessageNewFromSerializedJson(&message, json_buffer.Base(), json_buffer.Size())); - RETURN_IF_ERROR(TRITONBACKEND_ModelSetConfig(triton_model, 1 /* config_version */, message)); - } - - return nullptr; // success -} - -std::string param_get(common::TritonJson::Value& param, const char* field, const std::string& fallback = "") -{ - common::TritonJson::Value key; - std::string value = fallback; - param.MemberAsObject(field, &key); - key.MemberAsString("string_value", &value); - return value; -} - -int param_get_int(common::TritonJson::Value& param, const char* field, int fallback = 0) -{ - int ret = fallback; - try { - ret = std::stoi(param_get(param, field)); - } - catch (std::invalid_argument& ia) { - LOG_MESSAGE(TRITONSERVER_LOG_ERROR, - (std::string("Invalid configuration argument '") + field + "': " + ia.what()).c_str()); - } - return ret; -} - -float param_get_float(common::TritonJson::Value& param, const char* field, float fallback = 0.0) -{ - float ret = fallback; - try { - ret = std::stof(param_get(param, field)); - } - catch (std::invalid_argument& 
ia) { - LOG_MESSAGE(TRITONSERVER_LOG_ERROR, - (std::string("Invalid configuration argument '") + field + "': " + ia.what()).c_str()); - } - return ret; -} - -bool param_get_bool(common::TritonJson::Value& param, const char* field, bool fallback = false) -{ - return static_cast(param_get_int(param, field, static_cast(fallback))); -} - -std::shared_ptr ModelState::ModelFactory(common::TritonJson::Value& param, - const std::string& model_filename) -{ - std::shared_ptr ft_model; - - const std::string model_dir = param_get( - param, "model_checkpoint_path", JoinPath({RepositoryPath(), std::to_string(Version()), model_filename})); - const std::string model_type = param_get(param, "model_type", "GPT"); - const std::string data_type = param_get(param, "data_type"); - const int tp = param_get_int(param, "tensor_para_size"); - const int pp = param_get_int(param, "pipeline_para_size"); - const int custom_ar = param_get_int(param, "enable_custom_all_reduce"); - - const std::string dt_message = std::string("Invalid configuration argument 'data_type': ") + data_type; - - if (model_type == "Llama") { - if (data_type == "fp16") { - ft_model = std::make_shared>(tp, pp, custom_ar, model_dir); - } - else { -#if ENABLE_FP32 - ft_model = std::make_shared>(tp, pp, custom_ar, model_dir); -#else - THROW_IF_BACKEND_MODEL_ERROR( - TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_UNSUPPORTED, "turbomind is not built with FP32 support")); -#endif - } - } - else { - THROW_IF_BACKEND_MODEL_ERROR( - TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_UNSUPPORTED, ("Unknown model \"" + model_type + "\"").c_str())); - } - - return ft_model; -} - -ModelState::ModelState(TRITONBACKEND_Model* triton_model): BackendModel(triton_model, true) -{ - node_id = ft::mpi::getCommWorldRank(); - int num_nodes = ft::mpi::getCommWorldSize(); - - triton::common::TritonJson::WriteBuffer buffer; - ModelConfig().PrettyWrite(&buffer); - LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE, (std::string("model configuration:\n") + buffer.Contents()).c_str()); - - common::TritonJson::Value param; - model_config_.MemberAsObject("parameters", ¶m); - - // instance groups - triton::common::TritonJson::Value instance_group, instance_obj, instance_group_count_val, instance_group_kind; - if (!ModelConfig().Find("instance_group", &instance_group) || instance_group.ArraySize() > 1) { - THROW_IF_BACKEND_MODEL_ERROR( - TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_UNSUPPORTED, "Only supports one instance group !")); - } - instance_group.IndexAsObject(0, &instance_obj); - instance_obj.Find("count", &instance_group_count_val); - instance_obj.Find("kind", &instance_group_kind); - std::string instance_group_kind_str; - int64_t instance_group_count_int64 = 1; - instance_group_kind.AsString(&instance_group_kind_str); - instance_group_count_val.AsInt(&instance_group_count_int64); - instance_group_count = (int)instance_group_count_int64; - LOG_MESSAGE( - TRITONSERVER_LOG_INFO, - ("Instance group type: " + instance_group_kind_str + " count: " + std::to_string(instance_group_count_int64)) - .c_str()); - if (instance_group_kind_str != "KIND_CPU") { - THROW_IF_BACKEND_MODEL_ERROR( - TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_UNSUPPORTED, "Instance Group: only KIND_CPU supports!")); - } - - // instance group validation - bool multi_node_enabled = num_nodes > 1; - tp_pp_size = param_get_int(param, "tensor_para_size") * param_get_int(param, "pipeline_para_size"); - gpu_size = ft::getDeviceCount(); - world_size = gpu_size * num_nodes; - int model_instance_size = num_nodes > 1 ? 
gpu_size : tp_pp_size; - bool multi_model_instance_valid = (multi_node_enabled && tp_pp_size == world_size && instance_group_count == 1) - || (!multi_node_enabled && gpu_size % tp_pp_size == 0 - && model_instance_size * instance_group_count >= gpu_size); - - printf("num_nodes=%d\n", num_nodes); - printf("tp_pp_size=%d\n", tp_pp_size); - printf("gpu_size=%d\n", gpu_size); - printf("world_size=%d\n", world_size); - printf("model_instance_size=%d\n", model_instance_size); - if (!multi_model_instance_valid) { - THROW_IF_BACKEND_MODEL_ERROR( - TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_UNSUPPORTED, - "1. Number of visible GPUs must be evenly divisble by TP * PP \n" - "2. Number of visible GPUs must be <= instance count * TP * PP \n" - "3. Multi-Node Inference only support one model instance \n")); - } - - int64_t max_batch_size = 0; - model_config_.MemberAsInt("max_batch_size", &max_batch_size); - - // sequence batching - triton::common::TritonJson::Value sequence_batching; - sequence_batching_enabled = ModelConfig().Find("sequence_batching", &sequence_batching); - std::string sequence_batching_log = sequence_batching_enabled ? "enabled" : "disabled"; - LOG_MESSAGE(TRITONSERVER_LOG_INFO, (std::string("Sequence Batching: ") + sequence_batching_log).c_str()); - // if (sequence_batching_enabled && max_batch_size != 1) { - // THROW_IF_BACKEND_MODEL_ERROR(TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_UNSUPPORTED, - // "Sequence Batching for interactive text generation: only supports max - // batch size = 1 currently !")); - // } - - // dynamic batching - triton::common::TritonJson::Value dynamic_batching; - dynamic_batching_enabled = ModelConfig().Find("dynamic_batching", &dynamic_batching); - std::string dynamic_batching_log = dynamic_batching_enabled ? "enabled" : "disabled"; - LOG_MESSAGE(TRITONSERVER_LOG_INFO, (std::string("Dynamic Batching: ") + dynamic_batching_log).c_str()); - if (dynamic_batching_enabled && sequence_batching_enabled) { - THROW_IF_BACKEND_MODEL_ERROR(TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_UNSUPPORTED, - "Sequence Batching cannot work with dynamic " - "batching at the same time !")); - } - - std::string model_filename; - model_config_.MemberAsString("default_model_filename", &model_filename); - - if (model_filename == "") { - model_filename = std::to_string(param_get_int(param, "tensor_para_size")) + "-gpu"; - } - - ft_model = ModelFactory(param, model_filename); - - std::cout << ft_model->toString(); - - int total_weight_gpu_size = (instance_group_count * model_instance_size) >= gpu_size ? 
- gpu_size : - (instance_group_count * model_instance_size); - streams_.resize(instance_group_count * model_instance_size); - - /* create shared weights - assume 8 gpus, 8 model instances, Tensor Para Size 2 - then we will distribute model instances to [0, 1], [2, 3], [4, 5], [6, 7], - [0, 1], [2, 3], [4, 5], [6, 7] GPUs; - two instance instances on GPUs [0, 1] will share the same weights - */ - std::vector threads; - LOG_MESSAGE(TRITONSERVER_LOG_INFO, (std::string("Before Loading Weights:")).c_str()); - ft::print_mem_usage(); - for (int gid = 0; gid < total_weight_gpu_size; gid++) { - int rank = node_id * gpu_size + gid % tp_pp_size; - threads.push_back(std::thread(&AbstractTransformerModel::createSharedWeights, ft_model, gid, rank)); - } - for (auto& t : threads) { - t.join(); - } - LOG_MESSAGE(TRITONSERVER_LOG_INFO, (std::string("After Loading Weights:")).c_str()); - ft::print_mem_usage(); -} - -TRITONSERVER_Error* -ModelState::LoadModel(const std::string& artifact_name, - const int32_t node_id, - const int32_t device_id, - const int32_t device_id_start, - const int32_t stream_id, - std::pair, std::vector>& nccl_params_instance, - std::shared_ptr custom_all_reduce_comms, - std::string* model_path, - std::unique_ptr* ft_model_instance) -{ - LOG_IF_ERROR(ConvertCUDAStatusToTritonError( - cudaSetDevice(device_id), TRITONSERVER_ERROR_INTERNAL, "Failed to set cuda device"), - "Failed to set cuda device"); - - std::string cc_model_filename = artifact_name; - if (cc_model_filename.empty()) { - cc_model_filename = "gpt3-model"; - } - - if (!node_id && !device_id) { - LOG_MESSAGE(TRITONSERVER_LOG_INFO, (std::string("Before Loading Model:")).c_str()); - } - ft::print_mem_usage(); - - LOG_IF_ERROR(ConvertCUDAStatusToTritonError(cudaStreamCreate(&streams_[stream_id]), - TRITONSERVER_ERROR_INTERNAL, - "Failed to create the stream"), - "Failed to create the stream"); - - const int rank = node_id * GetGpuSize() + device_id - device_id_start; - - auto model_instance = ft_model->createModelInstance( - device_id, rank, streams_[stream_id], nccl_params_instance, custom_all_reduce_comms); - ft_model_instance->reset(model_instance.release()); - - if (!node_id && !device_id) { - LOG_MESSAGE(TRITONSERVER_LOG_INFO, (std::string("After Loading Model:")).c_str()); - } - ft::print_mem_usage(); - - return nullptr; // success -} - -TRITONSERVER_Error* ModelState::AutoCompleteConfig() -{ - if (GetParameter("model_type") == "T5-Encoder") { - const std::string data_type = GetParameter("data_type"); - auto& config = ModelConfig(); - common::TritonJson::Value outputs, output, dtype_object; - std::string name; - config.MemberAsArray("output", &outputs); - - std::unordered_map return_type_map{ - {"fp16", "TYPE_FP16"}, {"fp32", "TYPE_FP32"}, {"bf16", "TYPE_BF16"}}; - - std::set outputs_to_modify = {"output_hidden_state", "output_attentions"}; - for (size_t idx = 0; idx < outputs.ArraySize(); idx++) { - outputs.IndexAsObject(idx, &output); - output.MemberAsString("name", &name); - if (outputs_to_modify.find(name) == outputs_to_modify.end()) { - continue; - } - output.Find("data_type", &dtype_object); - dtype_object.SetString(return_type_map[data_type]); - LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE, - ("Automatically setting return data_type for \"" + name + "\" to \"" - + return_type_map[data_type] + "\"") - .c_str()); - } - } - else if (GetParameter("model_type") == "bert") { - const std::string data_type = GetParameter("data_type"); - auto& config = ModelConfig(); - common::TritonJson::Value inputs, input, dtype_object; - 
common::TritonJson::Value outputs, output; - std::string name; - config.MemberAsArray("input", &inputs); - config.MemberAsArray("output", &outputs); - - std::unordered_map return_type_map{ - {"fp16", "TYPE_FP16"}, {"fp32", "TYPE_FP32"}, {"bf16", "TYPE_BF16"}}; - - for (size_t idx = 0; idx < inputs.ArraySize(); idx++) { - inputs.IndexAsObject(idx, &input); - input.MemberAsString("name", &name); - if (name != "input_hidden_state") { - continue; - } - input.Find("data_type", &dtype_object); - dtype_object.SetString(return_type_map[data_type]); - LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE, - ("Automatically setting return data_type for " - "\"input_hidden_state\" to \"" - + return_type_map[data_type] + "\"") - .c_str()); - } - - for (size_t idx = 0; idx < outputs.ArraySize(); idx++) { - outputs.IndexAsObject(idx, &output); - output.MemberAsString("name", &name); - if (name != "output_hidden_state") { - continue; - } - output.Find("data_type", &dtype_object); - dtype_object.SetString(return_type_map[data_type]); - LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE, - ("Automatically setting return data_type for " - "\"output_hidden_state\" to \"" - + return_type_map[data_type] + "\"") - .c_str()); - } - } - else { - // Auto-complete configuration is not supported since turbomind does - // not store/capture sufficient model metadata so just log error instead. - LOG_MESSAGE(TRITONSERVER_LOG_WARN, - (std::string("skipping model configuration auto-complete for '") + Name() - + "': not supported for turbomind backend") - .c_str()); - } - - return nullptr; // success -} - -std::string ModelState::GetParameter(const char* parameter) -{ - auto& config = ModelConfig(); - common::TritonJson::Value parameters, model_type_obj; - std::string model_type; - config.MemberAsObject("parameters", ¶meters); - parameters.MemberAsObject(parameter, &model_type_obj); - model_type_obj.MemberAsString("string_value", &model_type); - return model_type; -} - -struct stream_callback_ctx_t { - size_t total_batch_size; - TRITONBACKEND_Request** requests; - uint32_t request_count; - std::vector* responses; - std::vector* factories; - BackendModelInstance* model; -}; - -void generate_response_placeholders(std::vector* responses, - std::vector* factories) -{ - TRITONSERVER_Error* err = nullptr; - for (auto factory : *factories) { - TRITONBACKEND_Response* response; - err = TRITONBACKEND_ResponseNewFromFactory(&response, factory); - if (err) { - LOG_MESSAGE(TRITONSERVER_LOG_ERROR, "Fail to create response from factory"); - TRITONSERVER_ErrorDelete(err); - } - responses->push_back(response); - } -} - -// -// ModelInstanceState -// -// State associated with a model instance. An object of this class is -// created and associated with each TRITONBACKEND_ModelInstance. -// -class ModelInstanceState: public BackendModelInstance { -public: - static TRITONSERVER_Error* - Create(ModelState* model_state, TRITONBACKEND_ModelInstance* triton_model_instance, ModelInstanceState** state); - virtual ~ModelInstanceState(); - - // Get the state of the model that corresponds to this instance. - ModelState* StateForModel() const - { - return model_state_; - } - - // Execute... 
- void ProcessRequests(TRITONBACKEND_Request** requests, const uint32_t request_count); - - std::shared_ptr> - Execute(std::vector* responses, - stream_callback_ctx_t* context, - const uint32_t response_count, - std::shared_ptr> input_tensors); - - void ReadOutputTensors(size_t total_batch_size, - std::shared_ptr> output_tensors, - TRITONBACKEND_Request** requests, - const uint32_t request_count, - std::vector* responses); - - int GetModelInstanceCount() - { - return model_instance_count_; - }; - int GetModelInstanceId() - { - return model_instance_id_; - }; - -private: - ModelInstanceState(ModelState* model_state, TRITONBACKEND_ModelInstance* triton_model_instance); - TRITONSERVER_Error* ValidateInputs(); - TRITONSERVER_Error* ValidateOutputs(); - - void SetInputTensors(size_t total_batch_size, - TRITONBACKEND_Request** requests, - const uint32_t request_count, - std::vector* responses, - BackendInputCollector* collector, - std::vector* input_names, - std::shared_ptr>* input_tensors, - std::vector* input_memories, - bool* cuda_copy); - - void BroadcastInputTensors(std::shared_ptr>* input_tensors); - - ModelState* model_state_; - - // model instance id - int model_instance_count_ = 1; - int model_instance_id_ = 0; - int model_instance_gpu_size_ = 1; - int model_instance_device_id_start_ = 0; - - // output tensor stream - cudaStream_t output_stream_; - - // tensor parallel + pipeline parallel - int gpu_size_ = 1; - int world_size_ = 1; - int tp_pp_size_ = 1; - - // Should we use the streaming API? - bool is_decoupled_ = false; - - // The full path to the FT model file. - std::string model_path_; - - std::vector> ft_model_instance_; - - std::unique_ptr instance_comm_; - - // inter-node broadcast buffer - std::vector bcast_buffers; - - // Map from configuration name for an input to the index of - // that input in the model. - std::unordered_map input_index_map_; - - // Map from configuration name for an output to the index of - // that output in the model. 
- std::unordered_map output_dtype_map_; - - std::pair, std::vector> nccl_params_; - - // custom all reduce comms - std::vector> custom_all_reduce_comms_; -}; - -TRITONSERVER_Error* ModelInstanceState::Create(ModelState* model_state, - TRITONBACKEND_ModelInstance* triton_model_instance, - ModelInstanceState** state) -{ - try { - *state = new ModelInstanceState(model_state, triton_model_instance); - } - catch (const BackendModelInstanceException& ex) { - RETURN_ERROR_IF_TRUE(ex.err_ == nullptr, - TRITONSERVER_ERROR_INTERNAL, - std::string("unexpected nullptr in BackendModelInstanceException")); - RETURN_IF_ERROR(ex.err_); - } - - return nullptr; // success -} - -int ThreadLoadModel(ModelState* model_state, - const std::string& artifact_name, - const int32_t node_id, - const int32_t device_id, - const int32_t device_id_start, - const int32_t stream_id, - std::pair, std::vector> nccl_params, - std::shared_ptr custom_all_reduce_comms, - std::string* model_path, - std::unique_ptr* ft_model_instance) -{ - THROW_IF_BACKEND_INSTANCE_ERROR(model_state->LoadModel(artifact_name, - node_id, - device_id, - device_id_start, - stream_id, - nccl_params, - custom_all_reduce_comms, - model_path, - ft_model_instance)); - return 0; -} - -ModelInstanceState::ModelInstanceState(ModelState* model_state, TRITONBACKEND_ModelInstance* triton_model_instance): - BackendModelInstance(model_state, triton_model_instance), model_state_(model_state) -{ - int node_id = ft::mpi::getCommWorldRank(); - int num_nodes = ft::mpi::getCommWorldSize(); - - LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE, (std::string("Model name ") + ArtifactFilename()).c_str()); - - triton::common::TritonJson::Value transaction_policy; - is_decoupled_ = false; - model_state_->ModelConfig().MemberAsObject("model_transaction_policy", &transaction_policy); - transaction_policy.MemberAsBool("decoupled", &is_decoupled_); - - LOG_MESSAGE( - TRITONSERVER_LOG_VERBOSE, - (std::string("Use ") + (is_decoupled_ ? "DECOUPLED (streaming)" : "COUPLED (classic)") + " API.").c_str()); - - THROW_IF_BACKEND_INSTANCE_ERROR(ValidateInputs()); - THROW_IF_BACKEND_INSTANCE_ERROR(ValidateOutputs()); - - // NOTE: model instance params - model_instance_id_ = model_state->GetInstanceId(); - model_instance_count_ = model_state->GetInstanceGroupCount(); - tp_pp_size_ = model_state->GetParallelSize(); - gpu_size_ = model_state->GetGpuSize(); - world_size_ = model_state->GetWorldSize(); - - model_instance_gpu_size_ = num_nodes > 1 ? 
gpu_size_ : tp_pp_size_; - ft_model_instance_.resize(model_instance_gpu_size_); - std::vector threads; - - std::shared_ptr shared_ft_model = model_state->GetFtModel(); - - // NOTE: CPU_KIND only, the backend fully controls how to distribute models to - // GPUs - model_instance_device_id_start_ = (model_instance_id_ * model_instance_gpu_size_) % gpu_size_; - // create output tensor stream - LOG_IF_ERROR(ConvertCUDAStatusToTritonError(cudaSetDevice(model_instance_device_id_start_), - TRITONSERVER_ERROR_INTERNAL, - "Failed to set cuda device"), - "Failed to set cuda device"); - LOG_IF_ERROR(ConvertCUDAStatusToTritonError( - cudaStreamCreate(&output_stream_), TRITONSERVER_ERROR_INTERNAL, "Failed to create the stream"), - "Failed to create the stream"); - - // create nccl params - nccl_params_ = shared_ft_model->createNcclParams(node_id, model_instance_device_id_start_, num_nodes > 1); - - shared_ft_model->createCustomComms(&custom_all_reduce_comms_, world_size_); - std::string model_instance_gpu_ids = "[ "; - for (int gid = model_instance_device_id_start_; gid < model_instance_device_id_start_ + model_instance_gpu_size_; - gid++) { - model_instance_gpu_ids += (std::to_string(gid) + " "); - threads.push_back(std::thread(ThreadLoadModel, - model_state, - ArtifactFilename(), - node_id, - gid, - model_instance_device_id_start_, - model_instance_id_ * model_instance_gpu_size_ + gid, - nccl_params_, - custom_all_reduce_comms_[gid - model_instance_device_id_start_], - &model_path_, - &ft_model_instance_[gid - model_instance_device_id_start_])); - } - model_instance_gpu_ids += "]"; - - for (auto& t : threads) { - t.join(); - } - - instance_comm_ = shared_ft_model->createInstanceComm(tp_pp_size_); - - LOG_MESSAGE(TRITONSERVER_LOG_INFO, - (std::string("Model instance is created on GPU ") + model_instance_gpu_ids).c_str()); -} - -ModelInstanceState::~ModelInstanceState() -{ -#ifdef TRITON_ENABLE_GPU -#endif // TRITON_ENABLE_GPU - for (auto bcast_buffer : bcast_buffers) { - free(bcast_buffer); - } -} - -TRITONSERVER_Error* ModelInstanceState::ValidateInputs() -{ - triton::common::TritonJson::Value ios, bios; - // input - std::string name, data_type; - triton::common::TritonJson::Value jshape; - // batch input - std::string kind, target_name, source_input; - triton::common::TritonJson::Value target_name_array, source_input_array; - model_state_->ModelConfig().MemberAsArray("input", &ios); - model_state_->ModelConfig().MemberAsArray("batch_input", &bios); - - std::vector valid_batch_input; - - // batch input - for (size_t size = 0; size < bios.ArraySize(); size++) { - triton::common::TritonJson::Value batch_input; - bios.IndexAsObject(size, &batch_input); - batch_input.MemberAsString("kind", &kind); - batch_input.MemberAsArray("target_name", &target_name_array); - batch_input.MemberAsString("data_type", &data_type); - batch_input.MemberAsArray("source_input", &source_input_array); - target_name_array.IndexAsString(0, &target_name); - source_input_array.IndexAsString(0, &source_input); - - LOG_MESSAGE(TRITONSERVER_LOG_INFO, - (std::string("Get batch input kind: " + kind + ", target_name: " + target_name - + ", data_type: " + data_type + ", source_input: " + source_input) - .c_str())); - - if (kind == "BATCH_ITEM_SHAPE" && data_type == "TYPE_INT32" && source_input + "_item_shape" == target_name) { - valid_batch_input.emplace_back(std::move(source_input)); - } - } - - // input - for (size_t size = 0; size < ios.ArraySize(); size++) { - triton::common::TritonJson::Value input; - ios.IndexAsObject(size, &input); 
- input.MemberAsString("name", &name); - input.MemberAsString("data_type", &data_type); - input.MemberAsArray("dims", &jshape); - - triton::common::TritonJson::Value allow_ragged_batch_json; - bool allow_ragged_batch = false; - if (input.Find("allow_ragged_batch", &allow_ragged_batch_json)) { - RETURN_IF_ERROR(allow_ragged_batch_json.AsBool(&allow_ragged_batch)); - } - - if (allow_ragged_batch - && std::find(valid_batch_input.begin(), valid_batch_input.end(), name) == valid_batch_input.end()) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INVALID_ARG, - std::string("Ragged Batch [ " + name + " ] needs the corresponding batch_input item shape !").c_str()); - } - - std::vector shape; - for (size_t size = 0; size < jshape.ArraySize(); size++) { - int64_t value = 0; - jshape.IndexAsInt(size, &value); - shape.push_back(value); - } - - std::string str_shape = "["; - for (uint i = 0; i < shape.size(); i++) { - str_shape = str_shape + std::to_string(shape[i]); - if (i != shape.size() - 1) { - str_shape = str_shape + ", "; - } - else { - str_shape = str_shape + "]"; - } - } - - std::string allow_ragged_batch_str = allow_ragged_batch ? "true" : "false"; - - LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE, - (std::string("Get input name: " + name + ", type: " + data_type + ", shape: " + str_shape - + ", allow_ragged_batch: " + allow_ragged_batch_str) - .c_str())); - } - return nullptr; // success -} - -TRITONSERVER_Error* ModelInstanceState::ValidateOutputs() -{ - triton::common::TritonJson::Value ios; - RETURN_IF_ERROR(model_state_->ModelConfig().MemberAsArray("output", &ios)); - - std::string name, data_type; - triton::common::TritonJson::Value jshape; - model_state_->ModelConfig().MemberAsArray("output", &ios); - for (size_t size = 0; size < ios.ArraySize(); size++) { - triton::common::TritonJson::Value input; - ios.IndexAsObject(size, &input); - input.MemberAsString("name", &name); - input.MemberAsString("data_type", &data_type); - input.MemberAsArray("dims", &jshape); - - std::vector shape; - for (size_t size = 0; size < jshape.ArraySize(); size++) { - int64_t value = 0; - jshape.IndexAsInt(size, &value); - shape.push_back(value); - } - - std::string str_shape = "["; - for (uint i = 0; i < shape.size(); i++) { - str_shape = str_shape + std::to_string(shape[i]); - if (i != shape.size() - 1) { - str_shape = str_shape + ", "; - } - else { - str_shape = str_shape + "]"; - } - } - - LOG_MESSAGE( - TRITONSERVER_LOG_VERBOSE, - (std::string("Get output name: " + name + ", type: " + data_type + ", shape: " + str_shape).c_str())); - } - - return nullptr; // success -} - -void ModelInstanceState::ProcessRequests(TRITONBACKEND_Request** requests, const uint32_t request_count) -{ - LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE, - (std::string("TRITONBACKEND_ModelExecute: Running ") + Name() + " with " + std::to_string(request_count) - + " requests") - .c_str()); - uint64_t exec_start_ns = 0; - SET_TIMESTAMP(exec_start_ns); - - const int max_batch_size = model_state_->MaxBatchSize(); - - // For each request collect the total batch size for this inference - // execution. The batch-size, number of inputs, and size of each - // input has already been checked so don't need to do that here. 
- size_t total_batch_size = 0; - - // bool sequence_batching_enabled = model_state_->SequenceBatchingEnabled(); - // size_t real_batch_dim = (int) sequence_batching_enabled; - constexpr size_t real_batch_dim = 0; - - // only one batch slot per model instance when sequence_batching enabled - for (size_t i = 0; i < request_count; i++) { - // If we get a nullptr request then something is badly wrong. Fail - // and release all requests. - if (requests[i] == nullptr) { - RequestsRespondWithError( - requests, - request_count, - TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - std::string("null request given to TurboMind backend for '" + Name() + "'").c_str())); - return; - } - - if (max_batch_size > 0) { - // Retrieve the batch size from one of the inputs, if the model - // supports batching, the first dimension size is batch size - int index = 0; - while (true) { - TRITONBACKEND_Input* input; - TRITONSERVER_Error* err_0 = TRITONBACKEND_RequestInputByIndex(requests[i], index, &input); - if (err_0 == nullptr) { - const char* input_name; - const int64_t* shape; - TRITONSERVER_Error* err_1 = - TRITONBACKEND_InputProperties(input, &input_name, nullptr, &shape, nullptr, nullptr, nullptr); - std::string input_name_str = std::string(input_name); - if (err_1 == nullptr) { - if (input_name_str != "START" && input_name_str != "END" && input_name_str != "READY") { - total_batch_size += shape[real_batch_dim]; - break; - } - index++; - } - else { - RequestsRespondWithError(requests, request_count, err_1); - return; - } - } - else { - RequestsRespondWithError(requests, request_count, err_0); - return; - } - } - } - else { - total_batch_size += 1; - } - } - - // If there are no valid payloads then no need to run the inference. - if (total_batch_size == 0) { - return; - } - - LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE, - (std::string("get total batch_size = ") + std::to_string(total_batch_size)).c_str()); - - // Make sure the maximum batch size is not exceeded. The - // total_batch_size must be 1 for models that don't support batching - // (i.e. max_batch_size == 0). If max_batch_size is exceeded then - // scheduler has done something badly wrong so fail and release all - // requests. - if ((total_batch_size != 1) && (total_batch_size > (size_t)max_batch_size)) { - RequestsRespondWithError( - requests, - request_count, - TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_INTERNAL, - std::string("batch size " + std::to_string(total_batch_size) + " for '" + Name() - + "', max allowed is " + std::to_string(max_batch_size)) - .c_str())); - return; - } - - // At this point we are committed to running inference with all - // 'requests'. Create a response for each request. During input - // processing if there is an error with any request that error will - // be sent immediately with the corresponding response (and the - // response unique_ptr will then be nullptr). The request object - // itself will not be released until after all inferencing is done - // (below) as we may need to access the request object when - // determine how to process outputs (for example, even if we don't - // need the outputs for a request that has an error, we do need to - // know the size of those outputs associated with the request so we - // can skip them in the output tensors). - // - // When operating in the decoupled mode, responses should be created - // from factories. Here, we instantiate a factory for each request and - // generate the first response. 
At each new result from the model the - // generated response is filled, sent, and another response is created - // from the factory. The last response is send just like in the - // non-decoupled mode. - std::vector responses; - responses.reserve(request_count); - std::vector factories; - - for (size_t i = 0; i < request_count; i++) { - if (is_decoupled_) { - TRITONBACKEND_ResponseFactory* factory; - auto err = TRITONBACKEND_ResponseFactoryNew(&factory, requests[i]); - if (err == nullptr) { - factories.emplace_back(factory); - } - else { - factories.emplace_back(nullptr); - LOG_MESSAGE(TRITONSERVER_LOG_ERROR, "Fail to create response factory"); - TRITONSERVER_ErrorDelete(err); - } - } - else { - TRITONBACKEND_Response* response; - auto err = TRITONBACKEND_ResponseNew(&response, requests[i]); - if (err == nullptr) { - responses.emplace_back(response); - } - else { - responses.emplace_back(nullptr); - LOG_MESSAGE(TRITONSERVER_LOG_ERROR, "Fail to create response"); - TRITONSERVER_ErrorDelete(err); - } - } - } - - std::vector input_names; - std::shared_ptr> input_tensors = - std::make_shared>(); - std::vector input_memories; - bool cuda_copy = false; - if (is_decoupled_) { - generate_response_placeholders(&responses, &factories); - } - BackendInputCollector collector(requests, - request_count, - &responses, - model_state_->TritonMemoryManager(), - model_state_->EnablePinnedInput(), - CudaStream()); - SetInputTensors(total_batch_size, - requests, - request_count, - &responses, - &collector, - &input_names, - &input_tensors, - &input_memories, - &cuda_copy); - - // Wait for any in-flight input tensor copies to complete. -#ifdef TRITON_ENABLE_GPU - if (cuda_copy) { - cudaStreamSynchronize(CudaStream()); - } -#endif - - uint64_t compute_start_ns = 0; - SET_TIMESTAMP(compute_start_ns); - - stream_callback_ctx_t context = {total_batch_size, requests, request_count, &responses, &factories, this}; - - auto output_tensors = Execute(&responses, &context, request_count, input_tensors); - - uint64_t compute_end_ns = 0; - SET_TIMESTAMP(compute_end_ns); - - // Free BackendMemory used for inputs - for (BackendMemory* mem : input_memories) { - delete mem; - } - input_memories.clear(); - - ReadOutputTensors(total_batch_size, output_tensors, requests, request_count, &responses); - - uint64_t exec_end_ns = 0; - SET_TIMESTAMP(exec_end_ns); - - LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE, - (std::string("get response size = ") + std::to_string(responses.size())).c_str()); - - // Send all the responses that haven't already been sent because of - // an earlier error. Note that the responses are not set to nullptr - // here as we need that indication below to determine if the request - // we successful or not. - for (auto& response : responses) { - if (response != nullptr) { - LOG_IF_ERROR(TRITONBACKEND_ResponseSend(response, TRITONSERVER_RESPONSE_COMPLETE_FINAL, nullptr), - "failed to send TurboMind backend response"); - LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE, (std::string("response is sent")).c_str()); - } - else { - LOG_MESSAGE(TRITONSERVER_LOG_WARN, (std::string("response is nullptr")).c_str()); - } - } - - // Report statistics for each request. 
- for (uint32_t r = 0; r < request_count; ++r) { - auto& request = requests[r]; - LOG_IF_ERROR(TRITONBACKEND_ModelInstanceReportStatistics(TritonModelInstance(), - request, - (responses[r] != nullptr) /* success */, - exec_start_ns, - compute_start_ns, - compute_end_ns, - exec_end_ns), - "failed reporting request statistics"); - - LOG_IF_ERROR(TRITONBACKEND_RequestRelease(request, TRITONSERVER_REQUEST_RELEASE_ALL), - "failed releasing request"); - } - - // Report the entire batch statistics. - LOG_IF_ERROR( - TRITONBACKEND_ModelInstanceReportBatchStatistics( - TritonModelInstance(), total_batch_size, exec_start_ns, compute_start_ns, compute_end_ns, exec_end_ns), - "failed reporting batch request statistics"); -} - -void streaming_callback(std::shared_ptr> output_tensors, void* ctx) -{ - stream_callback_ctx_t* context = reinterpret_cast(ctx); - ModelInstanceState* model = reinterpret_cast(context->model); - - std::vector* responses = context->responses; - - model->ReadOutputTensors( - context->total_batch_size, output_tensors, context->requests, context->request_count, responses); - - for (auto& response : *responses) { - if (response != nullptr) { - LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE, (std::string("start to send streaming response")).c_str()); - LOG_IF_ERROR(TRITONBACKEND_ResponseSend(response, 0, nullptr), "failed to send TurboMind backend response"); - LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE, (std::string("streaming response is sent")).c_str()); - } - else { - LOG_MESSAGE(TRITONSERVER_LOG_WARN, (std::string("streaming response is nullptr")).c_str()); - } - } - responses->clear(); - generate_response_placeholders(responses, context->factories); -} - -int ThreadForward(std::unique_ptr* ft_model_instance, - std::shared_ptr>* input_tensors, - std::shared_ptr>* output_tensors, - ft::AbstractInstanceComm* instance_comm, - std::exception_ptr* exception_ptr, - const int device_id, - const int use_stream_cb, - stream_callback_ctx_t* context) -{ - LOG_IF_ERROR(ConvertCUDAStatusToTritonError( - cudaSetDevice(device_id), TRITONSERVER_ERROR_INTERNAL, "Failed to set cuda device"), - "Failed to set cuda device"); - - LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE, (std::string("Start to forward")).c_str()); - if (use_stream_cb) { - (*ft_model_instance)->registerCallback(streaming_callback, (void*)context); - } - *output_tensors = (*ft_model_instance)->forward(*input_tensors, instance_comm); - if (use_stream_cb) { - (*ft_model_instance)->unRegisterCallback(); - } - LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE, (std::string("Stop to forward")).c_str()); - - if ((*output_tensors)->count("error_message")) { - *exception_ptr = *((std::exception_ptr*)((*output_tensors)->at("error_message").data)); - } - return 0; -} - -void triton_check_inputs(std::shared_ptr> output_tensors, const char* filename) -{ - auto& output = output_tensors->at("output_ids"); - auto shape = output.shape; - assert(shape.size() == 3); - assert(output.type == TYPE_UINT32); - auto batch_size = shape[0]; - auto length = shape[2]; - std::string fName = filename; - auto file = std::ofstream(fName, std::ios::out); - if (!file.is_open()) {} - else { - for (size_t i = 0; i < batch_size; i++) { - for (size_t j = 0; j < length; j++) { - file << ((uint32_t*)output.data)[i * length + j] << " "; - } - file << std::endl; - } - } -} - -void ModelInstanceState::BroadcastInputTensors(std::shared_ptr>* input_tensors) -{ - int node_id = ft::mpi::getCommWorldRank(); - - uint32_t input_count = node_id ? 
0 : (*input_tensors)->size(); - ft::mpi::bcast(&input_count, 1, ft::mpi::MPI_TYPE_UINT32_T, 0, ft::mpi::COMM_WORLD); - if (input_count > bcast_buffers.size()) { - bcast_buffers.resize(input_count); - } - - if (node_id) { - for (uint input_index = 0; input_index < input_count; input_index++) { - std::vector batchn_shape; - int64_t shape_size = 0; - int64_t buffer_size = 1; - ft::mpi::bcast(&shape_size, 1, ft::mpi::MPI_TYPE_INT64_T, 0, ft::mpi::COMM_WORLD); - for (int s_id = 0; s_id < shape_size; s_id++) { - int64_t val; - ft::mpi::bcast(&val, 1, ft::mpi::MPI_TYPE_INT64_T, 0, ft::mpi::COMM_WORLD); - batchn_shape.push_back(val); - buffer_size *= val; - } - int64_t data_type_size = 1; - ft::mpi::bcast(&data_type_size, 1, ft::mpi::MPI_TYPE_INT64_T, 0, ft::mpi::COMM_WORLD); - buffer_size *= data_type_size; - bcast_buffers[input_index] = (char*)realloc(bcast_buffers[input_index], buffer_size); - char* input_buffer = bcast_buffers[input_index]; - ft::mpi::bcast(input_buffer, buffer_size, ft::mpi::MPI_TYPE_BYTE, 0, ft::mpi::COMM_WORLD); - - int64_t name_size = 0; - ft::mpi::bcast(&name_size, 1, ft::mpi::MPI_TYPE_INT64_T, 0, ft::mpi::COMM_WORLD); - char char_name[1024] = {0}; - ft::mpi::bcast(char_name, name_size, ft::mpi::MPI_TYPE_CHAR, 0, ft::mpi::COMM_WORLD); - uint32_t data_type_num = 0; - ft::mpi::bcast(&data_type_num, 1, ft::mpi::MPI_TYPE_UINT32_T, 0, ft::mpi::COMM_WORLD); - TRITONSERVER_DataType triton_data_type = TRITONSERVER_DataType(data_type_num); - - (*input_tensors) - ->insert({std::string(char_name), - Tensor{TRITONSERVER_MEMORY_CPU, triton_data_type, batchn_shape, input_buffer}}); - } - } - else { - int input_index = 0; - for (auto it = (*input_tensors)->begin(); it != (*input_tensors)->end(); ++it) { - std::vector batchn_shape = it->second.shape; - int64_t shape_size = batchn_shape.size(); - int64_t buffer_size = 1; - ft::mpi::bcast(&shape_size, 1, ft::mpi::MPI_TYPE_INT64_T, 0, ft::mpi::COMM_WORLD); - for (int s_id = 0; s_id < shape_size; s_id++) { - int64_t val = batchn_shape[s_id]; - ft::mpi::bcast(&val, 1, ft::mpi::MPI_TYPE_INT64_T, 0, ft::mpi::COMM_WORLD); - buffer_size *= val; - } - - ft::Tensor tmp{ - ft::MEMORY_CPU, ft::TYPE_BYTES, {1}, nullptr}; // TODO change the getDataTypeByteNum function to static - int64_t data_type_size = tmp.getTypeSize(triton::Tensor::convertTritonTypeToFt(it->second.type)); - ft::mpi::bcast(&data_type_size, 1, ft::mpi::MPI_TYPE_INT64_T, 0, ft::mpi::COMM_WORLD); - buffer_size *= data_type_size; - - ft::mpi::bcast( - const_cast(it->second.data), buffer_size, ft::mpi::MPI_TYPE_BYTE, 0, ft::mpi::COMM_WORLD); - - std::string name = it->first; - int64_t name_size = name.size(); - ft::mpi::bcast(&name_size, 1, ft::mpi::MPI_TYPE_INT64_T, 0, ft::mpi::COMM_WORLD); - bcast_buffers[input_index] = (char*)realloc(bcast_buffers[input_index], name_size); - char* char_name = bcast_buffers[input_index]; - int64_t length = (int64_t)name.copy(char_name, name_size); - ft::FT_CHECK(length == name_size); - ft::mpi::bcast(char_name, name_size, ft::mpi::MPI_TYPE_CHAR, 0, ft::mpi::COMM_WORLD); - - uint32_t data_type_num = (uint32_t)(it->second.type); - ft::mpi::bcast(&data_type_num, 1, ft::mpi::MPI_TYPE_UINT32_T, 0, ft::mpi::COMM_WORLD); - input_index++; - } - } -} - -std::shared_ptr> -ModelInstanceState::Execute(std::vector* responses, - stream_callback_ctx_t* context, - const uint32_t response_count, - std::shared_ptr> input_tensors) -{ - int node_id = ft::mpi::getCommWorldRank(); - - if (node_id == 0) { - // Debug: input array - // triton_check_inputs(input_tensors, 
"triton_in"); - } - if (node_id) { - input_tensors = std::make_shared>(); - } - - ft::mpi::barrier(); - - BroadcastInputTensors(&input_tensors); - std::vector threads; - std::shared_ptr> output_tensors_list[model_instance_gpu_size_]; - std::exception_ptr exception_ptr[model_instance_gpu_size_]; - for (int gid = model_instance_device_id_start_; gid < model_instance_device_id_start_ + model_instance_gpu_size_; - gid++) { - int instance_local_id = gid - model_instance_device_id_start_; - LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE, (std::string("before ThreadForward " + std::to_string(gid))).c_str()); - threads.push_back(std::thread(ThreadForward, - &ft_model_instance_[instance_local_id], - &input_tensors, - &output_tensors_list[instance_local_id], - instance_comm_.get(), - &exception_ptr[instance_local_id], - gid, - is_decoupled_ && gid == model_instance_device_id_start_, - context)); - LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE, (std::string("after ThreadForward " + std::to_string(gid))).c_str()); - } - - for (auto& t : threads) { - t.join(); - } - - try { - for (int gid = model_instance_device_id_start_; - gid < model_instance_device_id_start_ + model_instance_gpu_size_; - gid++) { - int instance_local_id = gid - model_instance_device_id_start_; - if (exception_ptr[instance_local_id]) { - std::rethrow_exception(exception_ptr[instance_local_id]); - } - } - } - catch (std::exception& ex) { - SendErrorForResponses(responses, - response_count, - TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_INTERNAL, - ("TurboMind execute failure: " + std::string(ex.what())).c_str())); - } - auto output_tensors = output_tensors_list[0]; - return output_tensors; -} - -void ModelInstanceState::SetInputTensors( - size_t total_batch_size, - TRITONBACKEND_Request** requests, - const uint32_t request_count, - std::vector* responses, - BackendInputCollector* collector, - std::vector* input_names, - std::shared_ptr>* input_tensors, - std::vector* input_memories, - bool* cuda_copy) -{ - const int max_batch_size = model_state_->MaxBatchSize(); - // bool sequence_batching_enabled = model_state_->SequenceBatchingEnabled(); - bool dynamic_batching_enabled = model_state_->DynamicBatchingEnabled() || model_state_->SequenceBatchingEnabled(); - - // All requests must have equally-sized input tensors so use any - // request as the representative for the input tensors. 
- uint32_t input_count; - RESPOND_ALL_AND_RETURN_IF_ERROR( - responses, request_count, TRITONBACKEND_RequestInputCount(requests[0], &input_count)); - - LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE, (std::string("get input count = ") + std::to_string(input_count)).c_str()); - - // Process batch input if any - RaggedBatchingParam_Map batch_input_param_map; - - if (dynamic_batching_enabled) { - // Handle batch inputs for ragged batching - for (const auto& batch_input : model_state_->BatchInputs()) { - std::vector shape; - collector->BatchInputShape(batch_input, &shape); - - auto batch_input_kind = batch_input.BatchInputKind(); - auto batch_input_name = batch_input.TargetNames()[0]; - - // we only take care of the ragged input_ids - // Assume the first dimension (length) are different and others are the - // same BATCH_ITEM_SHAPE [num_requests (batches), num_dims (excluding - // batch dimension)] - if (batch_input_kind == BatchInput::Kind::BATCH_ITEM_SHAPE - && (batch_input_name == "input_ids_item_shape" - || batch_input_name == "request_prompt_embedding_item_shape")) { - RaggedBatchingParams param{}; - - size_t num_feature_dimensions = (size_t)shape[1]; - const char* dst_buffer = nullptr; - size_t dst_buffer_byte_size; - TRITONSERVER_MemoryType dst_memory_type; - int64_t dst_memory_type_id; - - // Batch inputs are always created on CPU - RESPOND_ALL_AND_SET_NULL_IF_ERROR((*responses), - responses->size(), - collector->ProcessBatchInput(batch_input, - nullptr, - 0, - {{TRITONSERVER_MEMORY_CPU, 0}}, - &dst_buffer, - &dst_buffer_byte_size, - &dst_memory_type, - &dst_memory_type_id)); - - param.batch_input_ptr = reinterpret_cast(dst_buffer); - - // concat all feature dimensions - param.batch_intput_size = (dst_buffer_byte_size / sizeof(int32_t)) / num_feature_dimensions; - if (num_feature_dimensions > 1) { - BackendMemory* batch_item_shape_memory; - RESPOND_ALL_AND_RETURN_IF_ERROR(responses, - request_count, - BackendMemory::Create(model_state_->TritonMemoryManager(), - {BackendMemory::AllocationType::CPU}, - 0, - dst_buffer_byte_size / num_feature_dimensions, - &batch_item_shape_memory)); - int32_t* batch_item_shape_memory_ptr = - reinterpret_cast(batch_item_shape_memory->MemoryPtr()); - for (size_t idx = 0; idx < param.batch_intput_size; idx++) { - int32_t concat_dimensions = 1; - for (size_t dim_idx = 0; dim_idx < num_feature_dimensions; dim_idx++) { - concat_dimensions *= param.batch_input_ptr[idx * num_feature_dimensions + dim_idx]; - // dim0 is seq length dimension - if (dim_idx == 0) { - param.max_seq_length = - std::max(param.max_seq_length, param.batch_input_ptr[idx * num_feature_dimensions]); - } - } - batch_item_shape_memory_ptr[idx] = concat_dimensions; - } - param.batch_input_ptr = reinterpret_cast(batch_item_shape_memory_ptr); - } - else { - param.max_seq_length = - *std::max_element(param.batch_input_ptr, param.batch_input_ptr + param.batch_intput_size); - } - - // check if padding is needed - param.is_input_ragged = std::any_of(param.batch_input_ptr, - param.batch_input_ptr + param.batch_intput_size, - [&](int x) { return x != param.batch_input_ptr[0]; }); - - // calculate statistics of elements - if (param.is_input_ragged) { - param.max_elements_per_seq = - *std::max_element(param.batch_input_ptr, param.batch_input_ptr + param.batch_intput_size); - param.total_input_elements = - std::accumulate(param.batch_input_ptr, param.batch_input_ptr + param.batch_intput_size, 0); - batch_input_param_map.insert({batch_input_name, param}); - // verbose logging for debugging - if 
(TRITONSERVER_LogIsEnabled(TRITONSERVER_LOG_VERBOSE)) { - std::string value_str = "[ "; - for (size_t i = 0; i < param.batch_intput_size; i++) { - value_str += std::to_string(param.batch_input_ptr[i]) + " "; - } - value_str += "]"; - - LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE, - (std::string("collect batch input name: ") + batch_input_name + "\n size: " - + std::to_string(dst_buffer_byte_size) + " bytes\n value: " + value_str - + "\n max sequence length: " + std::to_string(param.max_seq_length) - + "\n max elements per sequence: " + std::to_string(param.max_elements_per_seq)) - .c_str()); - } - } - } - } - } - - // Process user-defined inputs - for (uint32_t input_idx = 0; input_idx < input_count; input_idx++) { - TRITONBACKEND_Input* input; - RESPOND_ALL_AND_RETURN_IF_ERROR( - responses, request_count, TRITONBACKEND_RequestInputByIndex(requests[0], input_idx, &input)); - - const char* input_name; - TRITONSERVER_DataType input_datatype; - const int64_t* input_shape; - uint32_t input_dims_count; - RESPOND_ALL_AND_RETURN_IF_ERROR( - responses, - request_count, - TRITONBACKEND_InputProperties( - input, &input_name, &input_datatype, &input_shape, &input_dims_count, nullptr, nullptr)); - - input_names->emplace_back(input_name); - - std::string input_name_str = std::string(input_name); - - // Pad input ids from different requests - RaggedBatchingParams param = batch_input_param_map[input_name_str + "_item_shape"]; - if (batch_input_param_map.find(input_name_str + "_item_shape") != batch_input_param_map.end() - && batch_input_param_map[input_name_str + "_item_shape"].is_input_ragged) { - RaggedBatchingParams param = batch_input_param_map[input_name_str + "_item_shape"]; - - const int64_t total_batch_size_int64 = (int64_t)total_batch_size; - const int64_t max_elements_per_seq_int64 = (int64_t)param.max_elements_per_seq; - const size_t padded_input_ids_buffer_size = - GetByteSize(input_datatype, std::vector{total_batch_size_int64, max_elements_per_seq_int64}); - // Always host memory - BackendMemory* padded_input_memory; - BackendMemory* request_input_memory; - RESPOND_ALL_AND_RETURN_IF_ERROR(responses, - request_count, - BackendMemory::Create(model_state_->TritonMemoryManager(), - {BackendMemory::AllocationType::CPU}, - 0, - padded_input_ids_buffer_size, - &padded_input_memory)); - RESPOND_ALL_AND_RETURN_IF_ERROR(responses, - request_count, - BackendMemory::Create(model_state_->TritonMemoryManager(), - {BackendMemory::AllocationType::CPU}, - 0, - padded_input_ids_buffer_size, - &request_input_memory)); - - memset(padded_input_memory->MemoryPtr(), 0, padded_input_ids_buffer_size); - - collector->ProcessTensor( - input_name, - request_input_memory->MemoryPtr(), - GetByteSize(input_datatype, std::vector{(int64_t)param.total_input_elements}), - request_input_memory->MemoryType(), - request_input_memory->MemoryTypeId()); - - int64_t accumulated_elements_offset = 0; - - char* padded_input_ids_ptr = padded_input_memory->MemoryPtr(); - char* base_input_ids = request_input_memory->MemoryPtr(); - - // copy each request buffer to padded buffer - for (int64_t single_batch_idx = 0; single_batch_idx < total_batch_size_int64; single_batch_idx++) { - int32_t sequence_elements = param.batch_input_ptr[single_batch_idx]; - std::memcpy(padded_input_ids_ptr - + GetByteSize(input_datatype, - std::vector{single_batch_idx, max_elements_per_seq_int64}), - base_input_ids - + GetByteSize(input_datatype, std::vector{accumulated_elements_offset}), - GetByteSize(input_datatype, std::vector{sequence_elements})); - - 
accumulated_elements_offset += sequence_elements; - } - - // modify batch dimension shape, and sequence length dimension shape after - // padding - std::vector batchn_shape(input_shape, input_shape + input_dims_count); - if (max_batch_size != 0) { - batchn_shape[0] = total_batch_size; - batchn_shape[1] = (size_t)param.max_seq_length; - // assume all non-seq-length dimensions have the same shape - if (input_dims_count > 2) { - batchn_shape[2] = (size_t)(param.max_elements_per_seq / param.max_seq_length); - } - } - (*input_tensors) - ->insert({std::string(input_name), - triton::Tensor{TRITONSERVER_MEMORY_CPU, input_datatype, batchn_shape, padded_input_ids_ptr}}); - - continue; - } - - // bool start_end_ready_flag = (input_name_str == "START" || input_name_str - // == "END" - // || input_name_str == "READY"); - - // int shape_dims_start = (int) (sequence_batching_enabled && - // !start_end_ready_flag); - - // The shape for the entire input patch, [total_batch_size, ...] - std::vector batchn_shape(input_shape, input_shape + input_dims_count); - if (max_batch_size != 0) { - batchn_shape[0] = total_batch_size; - } - - std::vector batchn_shape_2(input_shape, input_shape + input_dims_count); - if (max_batch_size != 0) { - batchn_shape_2[0] = total_batch_size; - } - - // std::vector batchn_shape( - // input_shape + shape_dims_start, input_shape + input_dims_count); - // if (max_batch_size != 0 && !start_end_ready_flag) { - // batchn_shape[0] = total_batch_size; - // } - - // std::vector batchn_shape_2( - // input_shape + shape_dims_start, input_shape + input_dims_count); - // if (max_batch_size != 0 && !start_end_ready_flag) { - // batchn_shape_2[0] = total_batch_size; - // } - - // The input must be in contiguous CPU/GPU memory. - const int64_t batchn_byte_size = GetByteSize(input_datatype, batchn_shape); - - // Always host memory - BackendMemory* input_memory; - RESPOND_ALL_AND_RETURN_IF_ERROR(responses, - request_count, - BackendMemory::Create(model_state_->TritonMemoryManager(), - {BackendMemory::AllocationType::CPU}, - 0, - batchn_byte_size, - &input_memory)); - input_memories->push_back(input_memory); - - TRITONSERVER_MemoryType memory_type = input_memory->MemoryType(); - int64_t memory_type_id = input_memory->MemoryTypeId(); - char* input_buffer = input_memory->MemoryPtr(); - - collector->ProcessTensor(input_name, input_buffer, batchn_byte_size, memory_type, memory_type_id); - - LOG_MESSAGE( - TRITONSERVER_LOG_VERBOSE, - (std::string("collect name: ") + input_name + " size: " + std::to_string(batchn_byte_size) + " bytes") - .c_str()); - (*input_tensors) - ->insert({std::string(input_name), - triton::Tensor{TRITONSERVER_MEMORY_CPU, input_datatype, batchn_shape_2, input_buffer}}); - } - - LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE, - (std::string("the data is in ") + (*cuda_copy ? std::string("GPU") : std::string("CPU"))).c_str()); - // Finalize... - *cuda_copy |= collector->Finalize(); - LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE, - (std::string("the data is in ") + (*cuda_copy ? 
std::string("GPU") : std::string("CPU"))).c_str()); -} - -void ModelInstanceState::ReadOutputTensors(size_t total_batch_size, - std::shared_ptr> output_tensors, - TRITONBACKEND_Request** requests, - const uint32_t request_count, - std::vector* responses) -{ - BackendOutputResponder responder(requests, - request_count, - responses, - model_state_->MaxBatchSize(), - model_state_->TritonMemoryManager(), - model_state_->EnablePinnedInput(), - output_stream_); - - bool cuda_copy = false; - // bool sequence_batching_enabled = model_state_->SequenceBatchingEnabled(); - std::vector> string_buffers; - - int idx = 0; - for (auto it = output_tensors->begin(); it != output_tensors->end(); ++it) { - LOG_MESSAGE( - TRITONSERVER_LOG_VERBOSE, - (std::string("Get output_tensors ") + std::to_string(idx) + std::string(": ") + std::string(it->first)) - .c_str()); - idx++; - auto& output = it->second; - - // Verify output datatype matches datatype from model config - TRITONSERVER_DataType output_dtype = output.type; - LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE, - (std::string(" output_type: ") + TRITONSERVER_DataTypeString(output_dtype)).c_str()); - - const char* output_buffer = static_cast(output.data); - - // Set output shape - // std::vector batchn_shape = sequence_batching_enabled ? - // std::vector{1} : - // std::vector{}; - std::vector batchn_shape; - if (TRITONSERVER_LogIsEnabled(TRITONSERVER_LOG_VERBOSE)) { - // std::string batch_shape_str = sequence_batching_enabled ? " output - // shape: [1, " : - // " output shape: ["; - std::string batch_shape_str = " output shape: ["; - for (uint i = 0; i < output.shape.size(); i++) { - batchn_shape.push_back(output.shape[i]); - batch_shape_str = batch_shape_str + std::to_string(output.shape[i]); - if (i != output.shape.size() - 1) { - batch_shape_str = batch_shape_str + ", "; - } - else { - batch_shape_str = batch_shape_str + "]"; - } - } - LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE, batch_shape_str.c_str()); - } - else { - batchn_shape.insert(batchn_shape.end(), output.shape.begin(), output.shape.end()); - } - - responder.ProcessTensor(it->first, - output_dtype, - batchn_shape, - output_buffer, - TRITONSERVER_MEMORY_CPU, - model_instance_device_id_start_); - } - - // Finalize and wait for any pending buffer copies. - cuda_copy |= responder.Finalize(); - -#ifdef TRITON_ENABLE_GPU - if (cuda_copy) { - cudaStreamSynchronize(output_stream_); - } -#endif // TRITON_ENABLE_GPU - - LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE, - (std::string("PERFORMED GPU copy: ") + (cuda_copy ? std::string("YES") : std::string("NO"))).c_str()); -} - -///////////// - -extern "C" { - -TRITONSERVER_Error* TRITONBACKEND_Initialize(TRITONBACKEND_Backend* backend) -{ - int provided; - ft::mpi::initThread(nullptr, nullptr, ft::mpi::THREAD_MULTIPLE, &provided); - const char* cname; - RETURN_IF_ERROR(TRITONBACKEND_BackendName(backend, &cname)); - std::string name(cname); - - LOG_MESSAGE(TRITONSERVER_LOG_INFO, (std::string("TRITONBACKEND_Initialize: ") + name).c_str()); - - // Check the backend API version that Triton supports vs. what this - // backend was compiled against. - uint32_t api_version_major, api_version_minor; - RETURN_IF_ERROR(TRITONBACKEND_ApiVersion(&api_version_major, &api_version_minor)); - - LOG_MESSAGE(TRITONSERVER_LOG_INFO, - (std::string("Triton TRITONBACKEND API version: ") + std::to_string(api_version_major) + "." 
- + std::to_string(api_version_minor)) - .c_str()); - LOG_MESSAGE(TRITONSERVER_LOG_INFO, - (std::string("'") + name - + "' TRITONBACKEND API version: " + std::to_string(TRITONBACKEND_API_VERSION_MAJOR) + "." - + std::to_string(TRITONBACKEND_API_VERSION_MINOR)) - .c_str()); - - if ((api_version_major != TRITONBACKEND_API_VERSION_MAJOR) - || (api_version_minor < TRITONBACKEND_API_VERSION_MINOR)) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_UNSUPPORTED, - (std::string("Triton TRITONBACKEND API version: ") + std::to_string(api_version_major) + "." - + std::to_string(api_version_minor) + " does not support '" + name + "' TRITONBACKEND API version: " - + std::to_string(TRITONBACKEND_API_VERSION_MAJOR) + "." + std::to_string(TRITONBACKEND_API_VERSION_MINOR)) - .c_str()); - } - return nullptr; // success -} - -TRITONSERVER_Error* TRITONBACKEND_ModelInitialize(TRITONBACKEND_Model* model) -{ - const char* cname; - RETURN_IF_ERROR(TRITONBACKEND_ModelName(model, &cname)); - std::string name(cname); - - uint64_t version; - RETURN_IF_ERROR(TRITONBACKEND_ModelVersion(model, &version)); - - LOG_MESSAGE( - TRITONSERVER_LOG_INFO, - (std::string("TRITONBACKEND_ModelInitialize: ") + name + " (version " + std::to_string(version) + ")").c_str()); - - // Create a ModelState object and associate it with the - // TRITONBACKEND_Model. - ModelState* model_state; - RETURN_IF_ERROR(ModelState::Create(model, &model_state)); - RETURN_IF_ERROR(TRITONBACKEND_ModelSetState(model, reinterpret_cast(model_state))); - - return nullptr; // success -} - -TRITONSERVER_Error* TRITONBACKEND_ModelFinalize(TRITONBACKEND_Model* model) -{ - void* vstate; - RETURN_IF_ERROR(TRITONBACKEND_ModelState(model, &vstate)); - ModelState* model_state = reinterpret_cast(vstate); - - LOG_MESSAGE(TRITONSERVER_LOG_INFO, "TRITONBACKEND_ModelFinalize: delete model state"); - - delete model_state; - - LOG_MESSAGE(TRITONSERVER_LOG_INFO, "TRITONBACKEND_ModelFinalize: MPI Finalize"); - - ft::mpi::finalize(); - - return nullptr; // success -} - -TRITONSERVER_Error* TRITONBACKEND_ModelInstanceInitialize(TRITONBACKEND_ModelInstance* instance) -{ - int node_id = ft::mpi::getCommWorldRank(); - - const char* cname; - RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceName(instance, &cname)); - std::string name(cname); - - // Get the model state associated with this instance's model. - TRITONBACKEND_Model* model; - RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceModel(instance, &model)); - - void* vmodelstate; - RETURN_IF_ERROR(TRITONBACKEND_ModelState(model, &vmodelstate)); - ModelState* model_state = reinterpret_cast(vmodelstate); - - // Create a ModelInstanceState object and associate it with the - // TRITONBACKEND_ModelInstance. 
- ModelInstanceState* instance_state; - RETURN_IF_ERROR(ModelInstanceState::Create(model_state, instance, &instance_state)); - RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceSetState(instance, reinterpret_cast(instance_state))); - - int model_instance_id = instance_state->GetModelInstanceId(); - int model_instance_count = instance_state->GetModelInstanceCount(); - - LOG_MESSAGE(TRITONSERVER_LOG_INFO, - (std::string("TRITONBACKEND_ModelInstanceInitialize: ") + name + " (count " - + std::to_string(model_instance_count) + ")" + " (instance_id " + std::to_string(model_instance_id) - + ")") - .c_str()); - - if (node_id) { - while (true) { - instance_state->Execute( - nullptr, nullptr, 0, std::shared_ptr>(nullptr)); - } - } - - return nullptr; // success -} - -TRITONSERVER_Error* TRITONBACKEND_ModelInstanceFinalize(TRITONBACKEND_ModelInstance* instance) -{ - void* vstate; - RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceState(instance, &vstate)); - ModelInstanceState* instance_state = reinterpret_cast(vstate); - - LOG_MESSAGE(TRITONSERVER_LOG_INFO, "TRITONBACKEND_ModelInstanceFinalize: delete instance state"); - - delete instance_state; - - return nullptr; // success -} - -TRITONSERVER_Error* TRITONBACKEND_ModelInstanceExecute(TRITONBACKEND_ModelInstance* instance, - TRITONBACKEND_Request** requests, - const uint32_t request_count) -{ - // Triton will not call this function simultaneously for the same - // 'instance'. But since this backend could be used by multiple - // instances from multiple models the implementation needs to handle - // multiple calls to this function at the same time (with different - // 'instance' objects). Suggested practice for this is to use only - // function-local and model-instance-specific state (obtained from - // 'instance'), which is what we do here. - ModelInstanceState* instance_state; - RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceState(instance, reinterpret_cast(&instance_state))); - ModelState* model_state = instance_state->StateForModel(); - - // This backend specifies BLOCKING execution policy. That means that - // we should not return from this function until execution is - // complete. Triton will automatically release 'instance' on return - // from this function so that it is again available to be used for - // another call to TRITONBACKEND_ModelInstanceExecute. - - LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE, - (std::string("model ") + model_state->Name() + ", instance " + instance_state->Name() + ", executing " - + std::to_string(request_count) + " requests") - .c_str()); - - // At this point we accept ownership of 'requests', which means that - // even if something goes wrong we must still return success from - // this function. If something does go wrong in processing a - // particular request then we send an error response just for the - // specific request. - instance_state->ProcessRequests(requests, request_count); - - return nullptr; // success -} - -} // extern "C" - -} // namespace turbomind_backend -} // namespace backend -} // namespace triton diff --git a/src/turbomind/triton_backend/libtriton_fastertransformer.ldscript b/src/turbomind/triton_backend/libtriton_fastertransformer.ldscript deleted file mode 100644 index 26d2fbb33f..0000000000 --- a/src/turbomind/triton_backend/libtriton_fastertransformer.ldscript +++ /dev/null @@ -1,30 +0,0 @@ -# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved. 
-# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -{ - global: - TRITONBACKEND_*; - local: *; -}; From a7eac26b42eec557588be1364a14be4ebde3d9ee Mon Sep 17 00:00:00 2001 From: lvhan028 Date: Thu, 11 Jul 2024 14:20:09 +0800 Subject: [PATCH 05/14] remove lmdeploy/serve/turbomind/triton_models --- lmdeploy/serve/turbomind/__init__.py | 1 - lmdeploy/serve/turbomind/service_docker_up.sh | 87 ------ .../triton_models/interactive/1/placeholder | 0 .../triton_models/interactive/config.pbtxt | 281 ------------------ .../triton_models/postprocessing/1/model.py | 136 --------- .../triton_models/postprocessing/config.pbtxt | 41 --- .../triton_models/preprocessing/1/model.py | 151 ---------- .../triton_models/preprocessing/config.pbtxt | 37 --- .../triton_models/tokenizer/placeholder | 0 .../triton_models/weights/config.ini | 0 lmdeploy/serve/turbomind/utils.py | 103 ------- lmdeploy/turbomind/deploy/converter.py | 70 +---- 12 files changed, 2 insertions(+), 905 deletions(-) delete mode 100644 lmdeploy/serve/turbomind/__init__.py delete mode 100644 lmdeploy/serve/turbomind/service_docker_up.sh delete mode 100644 lmdeploy/serve/turbomind/triton_models/interactive/1/placeholder delete mode 100644 lmdeploy/serve/turbomind/triton_models/interactive/config.pbtxt delete mode 100644 lmdeploy/serve/turbomind/triton_models/postprocessing/1/model.py delete mode 100644 lmdeploy/serve/turbomind/triton_models/postprocessing/config.pbtxt delete mode 100644 lmdeploy/serve/turbomind/triton_models/preprocessing/1/model.py delete mode 100644 lmdeploy/serve/turbomind/triton_models/preprocessing/config.pbtxt delete mode 100644 lmdeploy/serve/turbomind/triton_models/tokenizer/placeholder delete mode 100644 lmdeploy/serve/turbomind/triton_models/weights/config.ini delete mode 100644 lmdeploy/serve/turbomind/utils.py diff --git a/lmdeploy/serve/turbomind/__init__.py b/lmdeploy/serve/turbomind/__init__.py deleted file mode 100644 index ef101fec61..0000000000 --- a/lmdeploy/serve/turbomind/__init__.py +++ /dev/null @@ -1 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
diff --git a/lmdeploy/serve/turbomind/service_docker_up.sh b/lmdeploy/serve/turbomind/service_docker_up.sh deleted file mode 100644 index d45345e616..0000000000 --- a/lmdeploy/serve/turbomind/service_docker_up.sh +++ /dev/null @@ -1,87 +0,0 @@ -#!/bin/sh - -show_help() { - echo "Usage: $0 [-h] [--help] [-l] [--lib-dir]" - echo - echo "Options:" - echo " -h, --help Show this help message and exit" - echo " --lib-dir Specify the directory of turbomind libraries" -} - -# check if '-h' or '--help' in the arguments -for arg in "$@" -do - if [ "$arg" == "-h" ] || [ "$arg" == "--help" ]; then - show_help - exit 0 - fi -done - - -TP=1 -DEVICES="0" -for ((i = 1; i < ${TP}; ++i)); do - DEVICES="${DEVICES},$i" -done -DEVICES="\"device=${DEVICES}\"" - - -SCRIPT_DIR="$(dirname "$0")" -SCRIPT_ABS_DIR="$(realpath "$SCRIPT_DIR")" - - -if [ -z "$1" ]; then - docker run \ - --gpus $DEVICES \ - --rm \ - -v "${SCRIPT_ABS_DIR}":/workspace/models \ - --shm-size 16g \ - -p 33336:22 \ - -p 33337-33400:33337-33400 \ - --cap-add=SYS_PTRACE \ - --cap-add=SYS_ADMIN \ - --security-opt seccomp=unconfined \ - --name lmdeploy \ - -it --env NCCL_LAUNCH_MODE=GROUP openmmlab/lmdeploy:latest \ - tritonserver \ - --model-repository=/workspace/models/model_repository \ - --allow-http=0 \ - --allow-grpc=1 \ - --grpc-port=33337 \ - --log-verbose=0 \ - --allow-metrics=1 -fi - -for ((i = 1; i <= $#; i++)); do - arg=${!i} - case "$arg" in - --lib-dir) - if [ "$i" -eq "$#" ]; then - show_help - exit -1 - fi - LIB_PATH=${@:i+1:1} - docker run \ - --gpus $DEVICES \ - --rm \ - -v "${LIB_PATH}":/opt/tritonserver/backends/turbomind \ - -v ""${SCRIPT_ABS_DIR}"":/workspace/models \ - --shm-size 16g \ - -p 33336:22 \ - -p 33337-33400:33337-33400 \ - --cap-add=SYS_PTRACE \ - --cap-add=SYS_ADMIN \ - --security-opt seccomp=unconfined \ - --name lmdeploy \ - -it --env NCCL_LAUNCH_MODE=GROUP openmmlab/lmdeploy:latest \ - tritonserver \ - --model-repository=/workspace/models/model_repository \ - --allow-http=0 \ - --allow-grpc=1 \ - --grpc-port=33337 \ - --log-verbose=0 \ - --allow-metrics=1 - break - ;; - esac -done diff --git a/lmdeploy/serve/turbomind/triton_models/interactive/1/placeholder b/lmdeploy/serve/turbomind/triton_models/interactive/1/placeholder deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/lmdeploy/serve/turbomind/triton_models/interactive/config.pbtxt b/lmdeploy/serve/turbomind/triton_models/interactive/config.pbtxt deleted file mode 100644 index 0b1e431ea4..0000000000 --- a/lmdeploy/serve/turbomind/triton_models/interactive/config.pbtxt +++ /dev/null @@ -1,281 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. 
-# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -name: "turbomind" -backend: "turbomind" -default_model_filename: "weights" -max_batch_size: 1 - -model_transaction_policy { - decoupled: True -} - -instance_group [ - { - # max concurrent instances - count: 48 - kind: KIND_CPU - } -] - -input [ - { - name: "input_ids" - data_type: TYPE_UINT32 - dims: [ -1 ] - # allow_ragged_batch: true - }, - { - name: "input_lengths" - data_type: TYPE_UINT32 - dims: [ 1 ] - reshape: { shape: [ ] } - }, - { - name: "request_output_len" - data_type: TYPE_UINT32 - dims: [ -1 ] - }, - { - name: "input_embeddings" - data_type: TYPE_INT8 - dims: [ -1 ] - optional: true - }, - { - name: "input_embedding_ranges" - data_type: TYPE_UINT32 - dims: [ -1, 2 ] - optional: true - }, - { - name: "step" - data_type: TYPE_INT32 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "session_len" - data_type: TYPE_UINT32 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "runtime_top_k" - data_type: TYPE_UINT32 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "runtime_top_p" - data_type: TYPE_FP32 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "beam_search_diversity_rate" - data_type: TYPE_FP32 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "temperature" - data_type: TYPE_FP32 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "len_penalty" - data_type: TYPE_FP32 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "repetition_penalty" - data_type: TYPE_FP32 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "random_seed" - data_type: TYPE_UINT64 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "is_return_log_probs" - data_type: TYPE_BOOL - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "beam_width" - data_type: TYPE_UINT32 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "start_id" - data_type: TYPE_UINT32 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "end_id" - data_type: TYPE_UINT32 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "bad_words_list" - data_type: TYPE_INT32 - dims: [ 2, -1 ] - optional: true - }, - { - name: "stop_words_list" - data_type: TYPE_INT32 - dims: [ 2, -1 ] - optional: true - }, - { - name: "prompt_learning_task_name_ids" - data_type: TYPE_UINT32 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "top_p_decay" - data_type: TYPE_FP32 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "top_p_min" - data_type: TYPE_FP32 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "top_p_reset_ids" - data_type: 
TYPE_UINT32 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "START" - data_type: TYPE_INT32 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "END" - data_type: TYPE_INT32 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "STOP" - data_type: TYPE_INT32 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "CORRID" - data_type: TYPE_UINT64 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - } -] -output [ - { - name: "output_ids" - data_type: TYPE_UINT32 - dims: [ -1, -1 ] - }, - { - name: "sequence_length" - data_type: TYPE_UINT32 - dims: [ -1 ] - }, - { - name: "cum_log_probs" - data_type: TYPE_FP32 - dims: [ -1 ] - }, - { - name: "output_log_probs" - data_type: TYPE_FP32 - dims: [ -1, -1 ] - } -] - -parameters { - key: "pipeline_para_size" - value: { - string_value: "1" - } -} -parameters { - key: "data_type" - value: { - string_value: "fp16" - } -} -parameters { - key: "model_type" - value: { - string_value: "Llama" - } -} - -parameters { - key: "enable_custom_all_reduce" - value: { - string_value: "0" - } -} diff --git a/lmdeploy/serve/turbomind/triton_models/postprocessing/1/model.py b/lmdeploy/serve/turbomind/triton_models/postprocessing/1/model.py deleted file mode 100644 index 0aa6805a39..0000000000 --- a/lmdeploy/serve/turbomind/triton_models/postprocessing/1/model.py +++ /dev/null @@ -1,136 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import json -import os.path as osp -from pathlib import Path - -import numpy as np -import triton_python_backend_utils as pb_utils - -# This tokenizer is `lmdeploy/turbomind/tokenizer.py`. When an LLM is served -# by triton inference server, it has to be converted first by running -# `python lmdeploy/serve/turbomind/deploy.py`. Then -# `lmdeploy/turbomind/tokenizer.py` will be copied to `tokenizer/tokenizer.py` -from .tokenizer.tokenizer import Tokenizer - - -class TritonPythonModel: - """Your Python model must use the same class name. - - Every Python model that is created must have "TritonPythonModel" as the - class name. - """ - - def initialize(self, args): - """`initialize` is called only once when the model is being loaded. - Implementing `initialize` function is optional. This function allows - the model to initialize any state associated with this model. - Parameters - ---------- - args : dict - Both keys and values are strings. The dictionary keys and values are: - * model_config: A JSON string containing the model configuration - * model_instance_kind: A string containing model instance kind - * model_instance_device_id: A string containing model instance device - ID - * model_repository: Model repository path - * model_version: Model version - * model_name: Model name - """ - # Parse model configs - self.model_config = model_config = json.loads(args['model_config']) - - # Parse model output configs - output_config = pb_utils.get_output_config_by_name( - model_config, 'OUTPUT') - - # Convert Triton types to numpy types - self.output_dtype = pb_utils.triton_string_to_numpy( - output_config['data_type']) - - cur_folder = Path(__file__).parent - - self.tokenizer = Tokenizer( - osp.join( - cur_folder, self.model_config['parameters']['tokenizer_path'] - ['string_value'])) - - def execute(self, requests): - """`execute` must be implemented in every Python model. `execute` - function receives a list of pb_utils.InferenceRequest as the only - argument. This function is called when an inference is requested - for this model. 
Depending on the batching configuration (e.g. Dynamic - Batching) used, `requests` may contain multiple requests. Every - Python model, must create one pb_utils.InferenceResponse for every - pb_utils.InferenceRequest in `requests`. If there is an error, you can - set the error argument when creating a pb_utils.InferenceResponse. - Parameters - ---------- - requests : list - A list of pb_utils.InferenceRequest - Returns - ------- - list - A list of pb_utils.InferenceResponse. The length of this list must - be the same as `requests` - """ - - responses = [] - - # Every Python backend must iterate over everyone of the requests - # and create a pb_utils.InferenceResponse for each of them. - for idx, request in enumerate(requests): - # Get input tensors - tokens_batch = pb_utils.get_input_tensor_by_name( - request, 'TOKENS_BATCH').as_numpy() - sequence_length = pb_utils.get_input_tensor_by_name( - request, 'sequence_length').as_numpy() - skip_special_tokens = pb_utils.get_input_tensor_by_name( - request, 'skip_special_tokens').as_numpy() - - # Postprocessing output data. - outputs = self._postprocessing(tokens_batch.tolist(), - sequence_length, - skip_special_tokens) - - # Create output tensors. You need pb_utils.Tensor - # objects to create pb_utils.InferenceResponse. - output_tensor = pb_utils.Tensor( - 'OUTPUT', - np.array(outputs).astype(self.output_dtype)) - - # Create InferenceResponse. You can set an error here in case - # there was a problem with handling this inference request. - # Below is an example of how you can set errors in inference - # response: - # - # pb_utils.InferenceResponse( - # output_tensors=..., TritonError("An error occurred")) - inference_response = pb_utils.InferenceResponse( - output_tensors=[output_tensor]) - responses.append(inference_response) - - # You should return a list of pb_utils.InferenceResponse. Length - # of this list must match the length of `requests` list. - return responses - - def finalize(self): - """`finalize` is called only once when the model is being unloaded. - - Implementing `finalize` function is optional. This function allows the - model to perform any necessary clean ups before exit. 
- """ - print('Cleaning up...') - - def _postprocessing(self, tokens_batch, sequence_length, - skip_special_tokens): - """decode token ids into texts.""" - outputs = [] - for beam_tokens, beam_len, beam_skip_special in zip( - tokens_batch, sequence_length, skip_special_tokens): - for tokens, _len, skip_special in zip(beam_tokens, beam_len, - beam_skip_special): - output = self.tokenizer.decode( - tokens, _len, skip_special_tokens=bool(skip_special)) - output = output.encode('utf8') - outputs.append(output) - return outputs diff --git a/lmdeploy/serve/turbomind/triton_models/postprocessing/config.pbtxt b/lmdeploy/serve/turbomind/triton_models/postprocessing/config.pbtxt deleted file mode 100644 index 7954a65f82..0000000000 --- a/lmdeploy/serve/turbomind/triton_models/postprocessing/config.pbtxt +++ /dev/null @@ -1,41 +0,0 @@ -name: "postprocessing" -backend: "python" -max_batch_size: 1 -input [ - { - name: "TOKENS_BATCH" - data_type: TYPE_UINT32 - dims: [ -1, -1 ] - }, - { - name: "sequence_length" - data_type: TYPE_UINT32 - dims: [ -1 ] - }, - { - name: "skip_special_tokens" - data_type: TYPE_INT32 - dims: [ -1 ] - } -] -output [ - { - name: "OUTPUT" - data_type: TYPE_STRING - dims: [ -1, -1 ] - } -] - -instance_group [ - { - count: 16 - kind: KIND_CPU - } -] - -parameters { - key: "tokenizer_path" - value: { - string_value: "tokenizer/tokenizer.model" - } -} diff --git a/lmdeploy/serve/turbomind/triton_models/preprocessing/1/model.py b/lmdeploy/serve/turbomind/triton_models/preprocessing/1/model.py deleted file mode 100644 index 7e659fbae0..0000000000 --- a/lmdeploy/serve/turbomind/triton_models/preprocessing/1/model.py +++ /dev/null @@ -1,151 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import json -import os.path as osp -from pathlib import Path - -import numpy as np -import torch -import triton_python_backend_utils as pb_utils -from torch.nn.utils.rnn import pad_sequence - -# This tokenizer is `lmdeploy/turbomind/tokenizer.py`. When an LLM is served -# by triton inference server, it has to be converted first by running -# `python lmdeploy/serve/turbomind/deploy.py`. Then -# `lmdeploy/turbomind/tokenizer.py` will be copied to `tokenizer/tokenizer.py` -from .tokenizer.tokenizer import Tokenizer - - -class TritonPythonModel: - """Your Python model must use the same class name. - - Every Python model that is created must have "TritonPythonModel" as the - class name. - """ - - def initialize(self, args): - """`initialize` is called only once when the model is being loaded. - Implementing `initialize` function is optional. This function allows - the model to initialize any state associated with this model. - Parameters - ---------- - args : dict - Both keys and values are strings. 
The dictionary keys and values are: - * model_config: A JSON string containing the model configuration - * model_instance_kind: A string containing model instance kind - * model_instance_device_id: A string containing model instance device - ID - * model_repository: Model repository path - * model_version: Model version - * model_name: Model name - """ - # Parse model configs - self.model_config = model_config = json.loads(args['model_config']) - - # Parse model output configs and convert Triton types to numpy types - input_names = ['INPUT_ID', 'REQUEST_INPUT_LEN'] - for input_name in input_names: - setattr( - self, - input_name.lower() + '_dtype', - pb_utils.triton_string_to_numpy( - pb_utils.get_output_config_by_name( - model_config, input_name)['data_type'])) - - cur_folder = Path(__file__).parent - self.tokenizer = Tokenizer( - osp.join( - cur_folder, self.model_config['parameters']['tokenizer_path'] - ['string_value'])) - self.start_id = self.tokenizer.bos_token_id - self.end_id = self.tokenizer.eos_token_id - - def execute(self, requests): - """`execute` must be implemented in every Python model. `execute` - function receives a list of pb_utils.InferenceRequest as the only - argument. This function is called when an inference is requested - for this model. Depending on the batching configuration (e.g. Dynamic - Batching) used, `requests` may contain multiple requests. Every - Python model, must create one pb_utils.InferenceResponse for every - pb_utils.InferenceRequest in `requests`. If there is an error, you can - set the error argument when creating a pb_utils.InferenceResponse. - Parameters - ---------- - requests : list - A list of pb_utils.InferenceRequest - Returns - ------- - list - A list of pb_utils.InferenceResponse. The length of this list must - be the same as `requests` - """ - - responses = [] - - # Every Python backend must iterate over everyone of the requests - # and create a pb_utils.InferenceResponse for each of them. - for idx, request in enumerate(requests): - # Get input tensors - query = pb_utils.get_input_tensor_by_name(request, - 'QUERY').as_numpy() - - # Preprocessing input data. - input_id, request_input_len = self._create_request(query) - - # Create output tensors. You need pb_utils.Tensor - # objects to create pb_utils.InferenceResponse. - input_id_tensor = pb_utils.Tensor( - 'INPUT_ID', - np.array(input_id).astype(self.input_id_dtype)) - request_input_len_tensor = pb_utils.Tensor( - 'REQUEST_INPUT_LEN', - np.array(request_input_len).astype( - self.request_input_len_dtype)) - - # Create InferenceResponse. You can set an error here in case - # there was a problem with handling this inference request. - # Below is an example of how you can set errors in inference - # response: - # - # pb_utils.InferenceResponse( - # output_tensors=..., TritonError("An error occurred")) - inference_response = pb_utils.InferenceResponse( - output_tensors=[input_id_tensor, request_input_len_tensor]) - responses.append(inference_response) - - # You should return a list of pb_utils.InferenceResponse. Length - # of this list must match the length of `requests` list. - return responses - - def finalize(self): - """`finalize` is called only once when the model is being unloaded. - - Implementing `finalize` function is optional. This function allows the - model to perform any necessary clean ups before exit. - """ - print('Cleaning up...') - - def _create_request(self, query): - """Tokenize prompts and return the token ids and their length. 
- - Args: - query (List[str]): a list of prompt - Returns: - tuple: token ids and their length - """ - start_ids = [] - for s in query: - _s = s[0].decode() - if _s == '': - start_id = [self.start_id - ] if self.start_id is not None else [-1] - elif _s == '': - start_id = [self.end_id] if self.end_id is not None else [-1] - else: - start_id = self.tokenizer.encode(_s) - start_ids.append(torch.IntTensor(start_id)) - - start_lengths = torch.IntTensor([[len(ids)] for ids in start_ids]) - start_ids = pad_sequence(start_ids, - batch_first=True, - padding_value=self.end_id) - return start_ids, start_lengths diff --git a/lmdeploy/serve/turbomind/triton_models/preprocessing/config.pbtxt b/lmdeploy/serve/turbomind/triton_models/preprocessing/config.pbtxt deleted file mode 100644 index 997ba399ba..0000000000 --- a/lmdeploy/serve/turbomind/triton_models/preprocessing/config.pbtxt +++ /dev/null @@ -1,37 +0,0 @@ -name: "preprocessing" -backend: "python" -max_batch_size: 1 - -input [ - { - name: "QUERY" - data_type: TYPE_STRING - dims: [ -1 ] - } -] -output [ - { - name: "INPUT_ID" - data_type: TYPE_UINT32 - dims: [ -1 ] - }, - { - name: "REQUEST_INPUT_LEN" - data_type: TYPE_UINT32 - dims: [ 1 ] - } -] - -instance_group [ - { - count: 4 - kind: KIND_CPU - } -] - -parameters { - key: "tokenizer_path" - value: { - string_value: "tokenizer/tokenizer.model" - } -} diff --git a/lmdeploy/serve/turbomind/triton_models/tokenizer/placeholder b/lmdeploy/serve/turbomind/triton_models/tokenizer/placeholder deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/lmdeploy/serve/turbomind/triton_models/weights/config.ini b/lmdeploy/serve/turbomind/triton_models/weights/config.ini deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/lmdeploy/serve/turbomind/utils.py b/lmdeploy/serve/turbomind/utils.py deleted file mode 100644 index 0a3d8bcf00..0000000000 --- a/lmdeploy/serve/turbomind/utils.py +++ /dev/null @@ -1,103 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from typing import List, Union - -import numpy as np -import tritonclient.grpc as grpcclient -from tritonclient.utils import np_to_triton_dtype - - -def prepare_tensor(name, input_tensor): - """Create grpcclient's InferInput instance according to a given tensor.""" - t = grpcclient.InferInput(name, list(input_tensor.shape), - np_to_triton_dtype(input_tensor.dtype)) - t.set_data_from_numpy(input_tensor) - return t - - -class Preprocessor: - """Tokenize prompts. - - Args: - tritonserver_addr (str): the communication address of the inference - server - """ - - def __init__(self, tritonserver_addr: str): - self.tritonserver_addr = tritonserver_addr - self.model_name = 'preprocessing' - - def __call__(self, *args, **kwargs): - return self.infer(*args, **kwargs) - - def infer(self, prompts: Union[str, List[str]]) -> tuple: - """Tokenize the input prompts. 
- - Args: - prompts(str | List[str]): user's prompt, or a batch prompts - - Returns: - Tuple(numpy.ndarray, numpy.ndarray, numpy.ndarray): prompt's token - ids, ids' length and requested output length - """ - if isinstance(prompts, str): - input0 = [[prompts]] - elif isinstance(prompts, List): - input0 = [[prompt] for prompt in prompts] - else: - assert 0, f'str or List[str] prompts are expected but got ' \ - f'{type(prompts)}' - - input0_data = np.array(input0).astype(object) - inputs = [prepare_tensor('QUERY', input0_data)] - - with grpcclient.InferenceServerClient(self.tritonserver_addr) as \ - client: - result = client.infer(self.model_name, inputs) - output0 = result.as_numpy('INPUT_ID') - output1 = result.as_numpy('REQUEST_INPUT_LEN') - return output0, output1 - - -class Postprocessor: - """De-tokenize prompts. - - Args: - tritonserver_addr (str): the communication address of the inference - server - """ - - def __init__(self, tritonserver_addr: str): - self.tritonserver_addr = tritonserver_addr - - def __call__(self, *args, **kwargs): - return self.infer(*args, **kwargs) - - def infer(self, - output_ids: np.ndarray, - seqlen: np.ndarray, - skip_special_tokens: bool = True): - """De-tokenize tokens for text. - - Args: - output_ids(np.ndarray): tokens' id - seqlen(np.ndarray): sequence length - skip_special_tokens (bool): Whether or not to remove special tokens - in the decoding. Default to be True. - - Returns: - str: decoded tokens - """ - inputs = [ - prepare_tensor('TOKENS_BATCH', output_ids), - prepare_tensor('sequence_length', seqlen), - prepare_tensor('skip_special_tokens', skip_special_tokens) - ] - inputs[0].set_data_from_numpy(output_ids) - inputs[1].set_data_from_numpy(seqlen) - inputs[2].set_data_from_numpy(skip_special_tokens) - model_name = 'postprocessing' - with grpcclient.InferenceServerClient(self.tritonserver_addr) \ - as client: - result = client.infer(model_name, inputs) - output0 = result.as_numpy('OUTPUT') - return output0 diff --git a/lmdeploy/turbomind/deploy/converter.py b/lmdeploy/turbomind/deploy/converter.py index 2371d008f8..4b9c758f1c 100644 --- a/lmdeploy/turbomind/deploy/converter.py +++ b/lmdeploy/turbomind/deploy/converter.py @@ -1,7 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. import os import os.path as osp -import re import shutil from pathlib import Path @@ -78,44 +77,6 @@ def copy_triton_model_templates(_path: str): return dst_path -def copy_tokenizer(model_path: str, tokenizer_path: str, - triton_models_path: str, trust_remote_code: bool): - """Copy tokenizer.""" - if tokenizer_path is not None: - assert osp.exists(tokenizer_path), f'{tokenizer_path} does not exists.' - - shutil.copy( - tokenizer_path, - osp.join(triton_models_path, - osp.join('tokenizer', osp.basename(tokenizer_path)))) - else: - from transformers import AutoTokenizer - try: - _ = AutoTokenizer.from_pretrained( - model_path, trust_remote_code=trust_remote_code) - except Exception: - assert 0, ( - f'Failed to load tokenizer model from path {model_path}.' 
- 'please specify tokenizer path by --tokenizer-path') - - # move tokenizer model to the target path - candidate = ['tokenizer.model', 'qwen.tiktoken', 'merges.txt'] - for name in candidate: - tmp_path = osp.join(model_path, name) - if osp.exists(tmp_path): - shutil.copy(tmp_path, - osp.join(triton_models_path, 'tokenizer', name)) - # move py/json files that are related to tokenizer to the target path - for _file in os.listdir(model_path): - if _file.endswith('.json') or _file.endswith('.py'): - json_path = osp.join(model_path, _file) - shutil.copy(json_path, - osp.join(triton_models_path, 'tokenizer', _file)) - with get_package_root_path() as root_path: - shutil.copy(osp.join(root_path, 'tokenizer.py'), - osp.join(triton_models_path, 'tokenizer')) - - def get_output_model_registered_name_and_config(model_path: str, model_format: str, group_size: int): @@ -205,7 +166,6 @@ def main(model_name: str, tp: int = 1, quant_path: str = None, group_size: int = 0, - trust_remote_code: bool = False, revision: str = None, download_dir: str = None, **kwargs): @@ -226,8 +186,6 @@ def main(model_name: str, quant_path (str): Path of the quantized model, which can be None. group_size (int): a parameter used in AWQ to quantize fp16 weights to 4 bits - trust_remote_code (bool): Whether or not to allow for custom models - defined on the Hub in their own modeling files. Defaults to False revision (str): The specific model version to use. It can be a branch name, a tag name, or a commit id. If unspecified, will use the default version. @@ -281,12 +239,9 @@ def main(model_name: str, create_workspace(dst_path) - triton_models_path = copy_triton_model_templates(dst_path) + weight_path = osp.join(dst_path, 'triton_models', 'weights') + os.makedirs(weight_path) - copy_tokenizer(model_path, tokenizer_path, triton_models_path, - trust_remote_code) - - weight_path = osp.join(triton_models_path, 'weights') input_model = INPUT_MODELS.get(input_model_name)( model_path=model_path, tokenizer_path=tokenizer_path, @@ -297,27 +252,6 @@ def main(model_name: str, output_model.export() - # update `tensor_para_size` in `triton_models/interactive/config.pbtxt` - with open(osp.join(triton_models_path, 'interactive', 'config.pbtxt'), - 'a') as f: - param = \ - 'parameters {\n key: "tensor_para_size"\n value: {\n ' \ - 'string_value: ' + f'"{tp}"\n' + ' }\n}\n' + \ - 'parameters {\n key: "model_name"\n value: {\n ' \ - 'string_value: ' + f'"{model_name}"\n' + ' }\n}\n' - f.write(param) - - # pack model repository for triton inference server - pack_model_repository(dst_path) - - # update the value of $TP in `service_docker_up.sh` - file_path = osp.join(dst_path, 'service_docker_up.sh') - with open(file_path, 'r') as f: - content = f.read() - content = re.sub('TP=1', f'TP={tp}', content) - with open(file_path, 'w') as f: - f.write(content) - if __name__ == '__main__': fire.Fire(main) From eae2264257400b21148e2e7ebd29cc9dafb9f730 Mon Sep 17 00:00:00 2001 From: lvhan028 Date: Thu, 11 Jul 2024 14:21:43 +0800 Subject: [PATCH 06/14] remove chatbot.py --- lmdeploy/serve/turbomind/chatbot.py | 687 ---------------------------- 1 file changed, 687 deletions(-) delete mode 100644 lmdeploy/serve/turbomind/chatbot.py diff --git a/lmdeploy/serve/turbomind/chatbot.py b/lmdeploy/serve/turbomind/chatbot.py deleted file mode 100644 index 68326a1ea2..0000000000 --- a/lmdeploy/serve/turbomind/chatbot.py +++ /dev/null @@ -1,687 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
-import logging -import queue -import random -import threading -from dataclasses import dataclass -from enum import Enum -from functools import partial -from typing import List, Union - -import mmengine -import numpy as np -import tritonclient.grpc as grpcclient - -from lmdeploy.model import MODELS -from lmdeploy.serve.turbomind.utils import (Postprocessor, Preprocessor, - prepare_tensor) -from lmdeploy.utils import filter_suffix, get_logger - - -@dataclass -class Session: - session_id: Union[int, str] - request_id: str = '' - histories: str = '' # history conversations of the session - sequence_length: int = 0 # the total generated token number in the session - prompt: str = '' - response: str = '' - status: int = None # status of the session - - -class StatusCode(Enum): - TRITON_STREAM_END = 0 # end of streaming - TRITON_STREAM_ING = 1 # response is in streaming - TRITON_SESSION_READY = 2 # session is ready for inference - TRITON_SERVER_ERR = -1 # triton server's error - TRITON_SESSION_CLOSED = -2 # session has been closed - TRITON_SESSION_OUT_OF_LIMIT = -3 # request length out of limit - TRITON_SESSION_INVALID_ARG = -4 # invalid argument - - -def stream_callback(que, result, error): - """callback function invoked by triton client.""" - que.put((result, error)) - - -class Chatbot: - """Chatbot for LLaMA series models with turbomind as inference engine. - - Args: - tritonserver_addr (str): communicating address ':' of - triton inference server - model_name (str): name of the to-be-deployed mode - log_level (int): the level of the log - display (bool): display the generated text on consolo or not - """ - - def __init__(self, - tritonserver_addr: str, - model_name: str = '', - ignore_eos: bool = False, - log_level: int = logging.INFO, - display: bool = False, - top_p: float = 1.0, - top_k: int = 1, - temperature: float = 0.8, - repetition_penalty: float = 1.0, - **model_kwargs): - self.tritonserver_addr = tritonserver_addr - self.model_name = model_name - if self.model_name == '': - self.model_name = self._get_model_name() - assert self.model_name in MODELS.module_dict.keys(), \ - f"'{self.model_name}' is not supported. " \ - f'The supported models are: {MODELS.module_dict.keys()}' - self.model = MODELS.get(self.model_name)(**model_kwargs) - self._session = None - self.preprocess = Preprocessor(tritonserver_addr) - self.postprocess = Postprocessor(tritonserver_addr) - self.bos_id = self._get_bos() - self.eos_id = self._get_eos() - stop_words = self._stop_words(self.model.stop_words) - bad_words = None - if ignore_eos: - stop_words = None - bad_words = np.array([[[self.eos_id], [1]]], dtype=np.int32) - self.eos_id = -1 - self.cfg = mmengine.Config( - dict(session_len=self.model.session_len, - top_p=top_p, - top_k=top_k, - temperature=temperature, - repetition_penalty=repetition_penalty, - stop_words=stop_words, - bad_words=bad_words)) - self.log_level = log_level - self.display = display - - def stream_infer(self, - session_id: int, - prompt: str, - request_id: str = '', - request_output_len: int = None, - sequence_start: bool = False, - sequence_end: bool = False, - skip_special_tokens: bool = True, - *args, - **kwargs): - """Start a new round conversion of a session. 
- - Args: - session_id (int): the identical id of a session - prompt (str): user's prompt in this round conversation - request_id (str): the identical id of this round conversation - request_output_len (int): the expected generated token numbers - sequence_start (bool): start flag of a session - sequence_end (bool): end flag of a session - skip_special_tokens (bool): Whether or not to remove special tokens - in the decoding. Default to be True. - Returns: - iterator: The generated content by chatbot - """ - assert isinstance(session_id, int), \ - f'INT session id is required, but got {type(session_id)}' - - logger = get_logger('service.ft', log_level=self.log_level) - logger.info(f'session {session_id}, request_id {request_id}, ' - f'request_output_len {request_output_len}') - - if self._session is None: - sequence_start = True - self._session = Session(session_id=session_id) - elif self._session.status == 0: - logger.error(f'session {session_id} has been ended. Please set ' - f'`sequence_start` be True if you want to restart it') - yield StatusCode.TRITON_SESSION_CLOSED, '', 0 - return - - self._session.status = 1 - self._session.request_id = request_id - self._session.response = '' - self.cfg.update(**kwargs) - - self._session.prompt = self._get_prompt(prompt, sequence_start) - for status, res, tokens in self._stream_infer( - self._session, - self._session.prompt, - request_output_len, - sequence_start, - sequence_end, - skip_special_tokens=skip_special_tokens): - if status == StatusCode.TRITON_STREAM_END: # remove stop_words - res = filter_suffix(res, self.model.stop_words) - if status.value < 0: - break - else: - yield status, res, tokens - if status.value == 0: - self._session.histories = \ - self._session.histories + self._session.prompt + \ - self._session.response - else: - yield status, res, tokens - - def end(self, session_id: int, *args, **kwargs): - """end a session. Triton inference server will release the session's - occupied resource when it is ended. - - Args: - session_id (int): the identical id of a session - - Returns: - int: 0: success, -1: session not found - """ - assert isinstance(session_id, int), \ - f'INT session id is required, but got {type(session_id)}' - - logger = get_logger('service.ft', log_level=self.log_level) - logger.info(f'end session: {session_id}') - - if self._session is None: - logger.error( - f"session {session_id} doesn't exist. It cannot be ended") - return StatusCode.TRITON_SESSION_INVALID_ARG - if self._session.session_id != session_id: - logger.error(f'you cannot end session {session_id}, because this ' - f'session is {self._session.session_id}') - return StatusCode.TRITON_SESSION_INVALID_ARG - if self._session.status == 0: - logger.warning(f'session {session_id} has already been ended') - return StatusCode.TRITON_SESSION_CLOSED - - self._session.status = 0 - for status, _, _ in self._stream_infer(self._session, - prompt='', - request_output_len=0, - sequence_start=False, - sequence_end=True): - if status.value < 0: - break - - self.reset_session() - return status - - def cancel(self, session_id: int, *args, **kwargs): - """Cancel the session during generating tokens. 
- - Args: - session_id (int): the identical id of a session - - Returns: - int: 0: success, -1: session not found - """ - assert isinstance(session_id, int), \ - f'INT session id is required, but got {type(session_id)}' - logger = get_logger('service.ft', log_level=self.log_level) - logger.info(f'cancel session: {session_id}') - - if self._session is None: - logger.error( - f"session {session_id} doesn't exist. It cannot be cancelled") - return StatusCode.TRITON_SESSION_INVALID_ARG - if self._session.session_id != session_id: - logger.error( - f'you cannot cancel session {session_id}, because this ' - f'session is {self._session.session_id}') - return StatusCode.TRITON_SESSION_INVALID_ARG - if self._session.status == 0: - logger.error(f'session {session_id} has already been ended. ' - f'It cannot be cancelled') - return StatusCode.TRITON_SESSION_CLOSED - - prev_session = self._session - status, res = None, None - for status, res, _ in self._stream_infer(self._session, - prompt='', - request_output_len=0, - sequence_start=False, - sequence_end=False, - cancel=True): - if status.value < 0: - break - if status == StatusCode.TRITON_STREAM_END: - logger.info(f'cancel session {session_id} successfully') - if prev_session.histories: - logger.warning(f'TODO: start to recover session {session_id}') - else: - logger.info(f'cancel session {session_id} failed: {res}') - return status - - def resume(self, session_id: int, *args, **kwargs): - """Resume a session by sending the history conversations to triton - inference server. After resuming, users can continue chatting with - chatbot. - - Args: - session_id (int): the identical id of a session - - Returns: - int: 0: success, -1: session not found - """ - assert isinstance(session_id, int), \ - f'INT session id is required, but got {type(session_id)}' - - logger = get_logger('service.ft', log_level=self.log_level) - logger.info(f'resume session: {session_id}') - - if self._session is None: - logger.error( - f"session {session_id} doesn't exist. It cannot be recovered") - return StatusCode.TRITON_SESSION_INVALID_ARG - if self._session.session_id != session_id: - logger.error( - f'you cannot resume session {session_id}, because this ' - f'session is {self._session.session_id}') - return StatusCode.TRITON_SESSION_INVALID_ARG - - self._session.status = 1 - self._session.sequence_length = 0 - histories = self._session.histories - for status, _, _ in self._stream_infer(self._session, - prompt=histories, - request_output_len=0, - sequence_start=True, - sequence_end=False): - if status.value < 0: - break - - self._session.histories = histories - return status - - def infer(self, - session_id: int, - prompt: str, - request_id: str = '', - request_output_len: int = None, - sequence_start: bool = False, - sequence_end: bool = False, - skip_special_tokens: bool = True, - *args, - **kwargs): - """Start a new round conversion of a session. Return the chat - completions in non-stream mode. - - Args: - session_id (int): the identical id of a session - prompt (str): user's prompt in this round conversation - request_id (str): the identical id of this round conversation - request_output_len (int): the expected generated token numbers - sequence_start (bool): start flag of a session - sequence_end (bool): end flag of a session - skip_special_tokens (bool): Whether or not to remove special tokens - in the decoding. Default to be True. 
- Returns: - tuple(Status, str, int): status, text/chat completion, - generated token number - """ - assert isinstance(session_id, int), \ - f'INT session id is required, but got {type(session_id)}' - - logger = get_logger('service.ft', log_level=self.log_level) - logger.info(f'session {session_id}, request_id {request_id}, ' - f'request_output_len {request_output_len}') - - if self._session is None: - sequence_start = True - self._session = Session(session_id=session_id) - elif self._session.status == 0: - logger.error(f'session {session_id} has been ended. Please set ' - f'`sequence_start` be True if you want to restart it') - return StatusCode.TRITON_SESSION_CLOSED, '', 0 - - self._session.status = 1 - self._session.request_id = request_id - self._session.response = '' - - self._session.prompt = self._get_prompt(prompt, sequence_start) - status, res, tokens = None, '', 0 - for status, res, tokens in self._stream_infer( - self._session, - self._session.prompt, - request_output_len, - sequence_start, - sequence_end, - skip_special_tokens=skip_special_tokens): - if status.value < 0: - break - if status == StatusCode.TRITON_STREAM_END: # remove stop_words - res = filter_suffix(res, self.model.stop_words) - if status.value == 0: - self._session.histories = \ - self._session.histories + self._session.prompt + \ - self._session.response - return status, res, tokens - else: - return status, res, tokens - - def reset_session(self): - """reset session.""" - self._session = None - - @property - def session(self): - """get session.""" - return self._session - - @session.setter - def session(self, value): - """set session.""" - self._session = value - - def _get_model_name(self): - with grpcclient.InferenceServerClient( - self.tritonserver_addr) as client: - model_config = client.get_model_config(model_name='turbomind', - as_json=True) - return model_config['config']['parameters']['model_name'][ - 'string_value'] - - def _get_bos(self): - """return bos token id.""" - token_ids, _ = self.preprocess('') - return token_ids[0][0] - - def _get_eos(self): - """return eos token id.""" - token_ids, _ = self.preprocess('') - return token_ids[0][0] - - def _stop_words(self, stop_words: List[str]): - """return stop-words' token ids.""" - if stop_words is None: - return None - assert isinstance(stop_words, List) and \ - all(isinstance(elem, str) for elem in stop_words), \ - f'stop_words must be a list but got {type(stop_words)}' - # each id in stop_words represents a stop word - # refer to https://github.com/fauxpilot/fauxpilot/discussions/165 for - # detailed explanation about turbomind's stop_words - stop_words = [ - int(self.preprocess(stop_word)[0][0][-1]) - for stop_word in stop_words - ] - assert isinstance(stop_words, List) and \ - all(isinstance(elem, int) for elem in stop_words), \ - 'invalid stop_words' - stop_word_offsets = range(1, len(stop_words) + 1) - stop_words = np.array([[stop_words, - stop_word_offsets]]).astype(np.int32) - return stop_words - - def _get_prompt(self, prompt: str, sequence_start: bool): - """return the concatenated prompt according to the model's chat - template.""" - return self.model.get_prompt(prompt, sequence_start) - - def _stream_infer(self, - session: Session, - prompt: str, - request_output_len: int = 512, - sequence_start: bool = True, - sequence_end: bool = False, - skip_special_tokens: bool = True, - cancel: bool = False): - """communicate with inference server to chat, or cancel a session, or - end a session. 
- - Args: - session (Session): an instance of a session - prompt (str): the concatenated prompt - request_output_len (int): the max number of tokens to be generated - sequence_start (bool): indicator for starting a sequence - sequence_end (bool): indicator for ending a sequence - cancel (bool): indicator for cancelling the session - skip_special_tokens (bool): Whether or not to remove special tokens - in the decoding. Default to be True. - Yields: - tuple: status, text, generated token number - """ - logger = get_logger('service.ft', log_level=self.log_level) - logger.info(f'session {session.session_id}, ' - f'request id {session.request_id}, ' - f'request_output_len {request_output_len}, ' - f'start {sequence_start}, ' - f'end {sequence_end}, cancel {cancel}') - - assert request_output_len is None or \ - isinstance(request_output_len, int), \ - f'request_output_len is supposed to be None or int, ' \ - f'but got {type(request_output_len)}' - - if sequence_start: - logger.info(f'session {session.session_id}, clear history since ' - f'sequence_start is True') - session.histories = '' - session.sequence_length = 0 - - input_ids, input_lengths = self.preprocess(prompt) - # got input_ids with default add_bos == True - if not sequence_start and input_ids[0][0] == self.bos_id: - input_ids = input_ids[:, 1:] - input_lengths = input_lengths - 1 - # will crash if last_token_id == eos_id and send empty input_ids - if sequence_end and request_output_len == 0: - input_ids = np.array([[1]], dtype=np.uint32) - input_lengths = np.array([[1]], dtype=np.uint32) - input_tokens = input_lengths.squeeze() - - if request_output_len is None: - request_output_len = max( - 128, - self.cfg.session_len - session.sequence_length - input_tokens) - - if input_tokens + request_output_len + \ - session.sequence_length > self.cfg.session_len: - errmsg = f'session {session.session_id}, ' \ - f'out of max sequence length {self.cfg.session_len}, ' \ - f'#input tokens {input_tokens}, ' \ - f'history tokens {session.sequence_length}, ' \ - f'request length {request_output_len}' - logger.warning(errmsg) - yield StatusCode.TRITON_SESSION_OUT_OF_LIMIT, errmsg, 0 - return - - logger.info(f'session {session.session_id}, ' - f'max length: {self.cfg.session_len}, ' - f'input tokens: {input_tokens}, ' - f'request tokens: {request_output_len}, ' - f'history tokens: {session.sequence_length}') - - preseq_length = session.sequence_length - session.response = '' - session.status = StatusCode.TRITON_SESSION_READY - - que = queue.Queue() - producer = threading.Thread(target=self._stream_producer, - args=(self.tritonserver_addr, session, que, - self.cfg, input_ids, input_lengths, - request_output_len, sequence_start, - sequence_end, preseq_length, cancel)) - producer.start() - for status, res, n_token in self.stream_consumer( - self.postprocess, que, session, input_tokens, preseq_length, - cancel, logger, self.display, self.eos_id, - skip_special_tokens): - yield status, res, n_token - - producer.join() - self._session = que.get() - curseq_length = self._session.sequence_length - logger.info(f'session {session.session_id}, pre seq_len ' - f'{preseq_length}, cur seq_len {curseq_length}, ' - f'diff {curseq_length - preseq_length}') - - @staticmethod - def _stream_producer(tritonserver_addr, session, que, cfg, input_ids, - input_lengths, request_output_len, sequence_start, - sequence_end, preseq_length, cancel): - """Send a request to the triton inference server. 
- - Args: - tritonserver_addr (str): the communication address of the inference - server - session (Session): an instance of a session - que (multiprocessing.Queue): response queue - cfg (dict): parameters for sampling - input_ids (numpy.ndarray): token ids of input prompt - input_lengths (numpy.ndarray): length of input_ids - request_output_len (int): the max number of tokens to be generated - sequence_start (bool): indicator for starting a sequence - sequence_end (bool): indicator for ending a sequence - preseq_length (int): the history sequence length - cancel (bool): indicator for cancelling the session - """ - request_output_len = np.full(input_lengths.shape, - request_output_len).astype(np.uint32) - - callback = partial(stream_callback, que) - with grpcclient.InferenceServerClient(tritonserver_addr) as client: - inputs = [ - prepare_tensor('input_ids', input_ids), - prepare_tensor('input_lengths', input_lengths), - prepare_tensor('request_output_len', request_output_len), - prepare_tensor('runtime_top_p', - cfg.top_p * np.ones((1, 1), dtype=np.float32)), - prepare_tensor( - 'temperature', - cfg.temperature * np.ones((1, 1), dtype=np.float32)), - prepare_tensor( - 'repetition_penalty', - cfg.repetition_penalty * np.ones( - (1, 1), dtype=np.float32)), - prepare_tensor('step', - preseq_length * np.ones((1, 1), dtype=np.int32)) - ] - if cfg.top_k is not None: - inputs += prepare_tensor( - 'runtime_top_k', - cfg.top_k * np.ones((1, 1), dtype=np.uint32)), - if cfg.stop_words is not None: - inputs += [prepare_tensor('stop_words_list', cfg.stop_words)] - if cfg.bad_words is not None: - inputs += [prepare_tensor('bad_words_list', cfg.bad_words)] - - inputs += [ - prepare_tensor( - 'session_len', - cfg.session_len * - np.ones([input_ids.shape[0], 1], dtype=np.uint32)), - prepare_tensor('START', (1 if sequence_start else 0) * np.ones( - (1, 1), dtype=np.int32)), - prepare_tensor('END', (1 if sequence_end else 0) * np.ones( - (1, 1), dtype=np.int32)), - prepare_tensor( - 'CORRID', - session.session_id * np.ones((1, 1), dtype=np.uint64)), - prepare_tensor('STOP', (1 if cancel else 0) * np.ones( - (1, 1), dtype=np.int32)) - ] - if sequence_start: - random_seed = random.getrandbits(64) - inputs += [ - prepare_tensor( - 'random_seed', - random_seed * np.ones((1, 1), dtype=np.uint64)) - ] - client.start_stream(callback) - client.async_stream_infer('turbomind', - inputs, - sequence_id=session.session_id, - request_id=session.request_id, - sequence_start=sequence_start, - sequence_end=sequence_end) - que.put(None) - - @staticmethod - def stream_consumer(postprocess, res_queue, session, n_input_token, - preseq_length, cancel, logger, display, eos_id, - skip_special_tokens): - """Consume the response from the triton inference server. - - Args: - postprocess (callable): postprocess function for - the generated tokens - res_queue (multiprocessing.Queue): response queue - session (Session): an instance of a session - n_input_token (int): token number of input prompt - preseq_length (int): the history sequence length - cancel (bool): indicator for cancelling the session - logger (util.Logger): - display (bool): display the text in the consolo interface or not - eos_id (int): eos token id - skip_special_tokens (bool): Whether or not to remove special tokens - in the decoding. Default to be True. 
- - Yields: - tuple: status, text, generated token number - """ - status, res, n_token = None, '', 0 - output_ids = np.zeros((1, 1, 0), dtype=np.uint32) - text = '' - while True: - result_pack = res_queue.get() - if result_pack is None: - status = StatusCode.TRITON_STREAM_END - res = session.response - session.status = StatusCode.TRITON_STREAM_END - break - result, error = result_pack - if error is not None: - logger.error(f'got error from turbomind, code ' - f'{StatusCode.TRITON_SERVER_ERR}, {error}, ' - f'token {session.sequence_length}') - session.sequence_length = preseq_length - session.response = '' - status = StatusCode.TRITON_SERVER_ERR - res = f'{status}, {error}' - n_token = 0 - break - if cancel: - continue - try: - sequence_length = result.as_numpy('sequence_length') - output_ids = result.as_numpy('output_ids') - - session.sequence_length = sequence_length.squeeze() - output_ids = output_ids.reshape((1, 1, output_ids.shape[-1])) - output_ids = output_ids[:, :, n_input_token + - preseq_length:sequence_length.squeeze( - )] - last_token_id = None if output_ids.shape[ - -1] == 0 else output_ids[-1, -1, -1] - if last_token_id == eos_id: - session.sequence_length = session.sequence_length - 1 - output_ids = output_ids[:, :, :-1] - - output_str = postprocess( - output_ids, np.array([[n_token]], dtype=np.uint32), - np.array([[int(skip_special_tokens)]], dtype=np.int32)) - text = output_str[0].decode() - # utf-8 char at the end means it's a potential unfinished - # byte sequence, continue to concate it with the next - # sequence and decode them together - if text.endswith('�'): - continue - n_token = output_ids.shape[-1] - if display: - print(text, end='', flush=True) - session.response += text - yield (StatusCode.TRITON_STREAM_ING, session.response, - output_ids.shape[-1]) - except Exception as e: - logger.error(f'catch exception: {e}') - logger.error( - f'session {session.session_id}: prompt: {session.prompt}') - # `n_token` might be not updated since `if text.endswith('�')` - if n_token != output_ids.shape[-1]: - n_token = output_ids.shape[-1] - session.response += text - # put session back to queue so that `_stream_infer` can update it in - # `self.sessions` - while not res_queue.empty(): - res_queue.get() - res_queue.put(session) - if display: - print('\n') - yield status, res, n_token From afa0f085a55beda12ed08cc784e61aeb5916610b Mon Sep 17 00:00:00 2001 From: lvhan028 Date: Tue, 16 Jul 2024 23:51:38 +0800 Subject: [PATCH 07/14] remove triton_server_backend --- lmdeploy/serve/client.py | 69 -------- lmdeploy/serve/gradio/app.py | 18 +- .../serve/gradio/triton_server_backend.py | 161 ------------------ 3 files changed, 6 insertions(+), 242 deletions(-) delete mode 100644 lmdeploy/serve/client.py delete mode 100644 lmdeploy/serve/gradio/triton_server_backend.py diff --git a/lmdeploy/serve/client.py b/lmdeploy/serve/client.py deleted file mode 100644 index 424e83143f..0000000000 --- a/lmdeploy/serve/client.py +++ /dev/null @@ -1,69 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import os - -from lmdeploy.serve.turbomind.chatbot import Chatbot - - -def input_prompt(model_name): - """Input a prompt in the consolo interface.""" - if model_name == 'codellama': - print('\nenter !! to end the input >>>\n', end='') - sentinel = '!!' 
- else: - print('\ndouble enter to end input >>> ', end='') - sentinel = '' # ends when this string is seen - return '\n'.join(iter(input, sentinel)) - - -def main(tritonserver_addr: str, - session_id: int = 1, - cap: str = 'chat', - stream_output: bool = True, - **kwargs): - """An example to communicate with inference server through the command line - interface. - - Args: - tritonserver_addr (str): the address in format "ip:port" of - triton inference server - session_id (int): the identical id of a session - cap (str): the capability of a model. For example, codellama has - the ability among ['completion', 'infill', 'instruct', 'python'] - stream_output (bool): indicator for streaming output or not - **kwargs (dict): other arguments for initializing model's chat template - """ - log_level = os.environ.get('SERVICE_LOG_LEVEL', 'WARNING') - kwargs.update(capability=cap) - chatbot = Chatbot(tritonserver_addr, - log_level=log_level, - display=stream_output, - **kwargs) - nth_round = 1 - while True: - prompt = input_prompt(chatbot.model_name) - if prompt == 'exit': - exit(0) - elif prompt == 'end': - chatbot.end(session_id) - else: - request_id = f'{session_id}-{nth_round}' - if stream_output: - for status, res, n_token in chatbot.stream_infer( - session_id, - prompt, - request_id=request_id, - request_output_len=512): - continue - else: - status, res, n_token = chatbot.infer(session_id, - prompt, - request_id=request_id, - request_output_len=512) - print(res) - nth_round += 1 - - -if __name__ == '__main__': - import fire - - fire.Fire(main) diff --git a/lmdeploy/serve/gradio/app.py b/lmdeploy/serve/gradio/app.py index 768edb05d2..ad9b281a1b 100644 --- a/lmdeploy/serve/gradio/app.py +++ b/lmdeploy/serve/gradio/app.py @@ -20,10 +20,9 @@ def run(model_path_or_server: str, """chat with AI assistant through web ui. Args: - model_path_or_server (str): the path of the deployed model or the - tritonserver URL or restful api URL. For example: - - ./workspace - - 0.0.0.0:23333 + model_path_or_server (str): the path of the deployed model or + restful api URL. For example: + - huggingface hub repo_id - http://0.0.0.0:23333 server_name (str): the ip address of gradio server server_port (int): the port of gradio server @@ -37,14 +36,9 @@ def run(model_path_or_server: str, tp (int): tensor parallel for Turbomind """ if ':' in model_path_or_server: - if 'http:' in model_path_or_server: - from lmdeploy.serve.gradio.api_server_backend import run_api_server - run_api_server(model_path_or_server, server_name, server_port, - batch_size) - else: - from lmdeploy.serve.gradio.triton_server_backend import \ - run_triton_server - run_triton_server(model_path_or_server, server_name, server_port) + from lmdeploy.serve.gradio.api_server_backend import run_api_server + run_api_server(model_path_or_server, server_name, server_port, + batch_size) else: pipeline_type, _ = get_task(model_path_or_server) if pipeline_type == 'vlm': diff --git a/lmdeploy/serve/gradio/triton_server_backend.py b/lmdeploy/serve/gradio/triton_server_backend.py deleted file mode 100644 index ef7ed8c3f1..0000000000 --- a/lmdeploy/serve/gradio/triton_server_backend.py +++ /dev/null @@ -1,161 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
-import os -from functools import partial -from threading import Lock -from typing import Sequence - -import gradio as gr - -from lmdeploy.serve.gradio.constants import CSS, THEME, disable_btn, enable_btn -from lmdeploy.serve.turbomind.chatbot import Chatbot - - -class InterFace: - global_session_id: int = 0 - lock = Lock() - - -def chat_stream(state_chatbot: Sequence, llama_chatbot: Chatbot, - cancel_btn: gr.Button, reset_btn: gr.Button, session_id: int, - top_p: float, temperature: float, request_output_len: int): - """Chat with AI assistant. - - Args: - instruction (str): user's prompt - state_chatbot (Sequence): the chatting history - llama_chatbot (Chatbot): the instance of a chatbot - cancel_btn (bool): enable the cancel button or not - reset_btn (bool): enable the reset button or not - session_id (int): the session id - """ - instruction = state_chatbot[-1][0] - - bot_response = llama_chatbot.stream_infer( - session_id, - instruction, - f'{session_id}-{len(state_chatbot)}', - request_output_len=request_output_len, - top_p=top_p, - temperature=temperature) - - for status, tokens, _ in bot_response: - state_chatbot[-1] = (state_chatbot[-1][0], tokens) - yield (state_chatbot, state_chatbot, enable_btn, disable_btn) - - yield (state_chatbot, state_chatbot, disable_btn, enable_btn) - - -def reset_all_func(instruction_txtbox: gr.Textbox, state_chatbot: gr.State, - llama_chatbot: gr.State, triton_server_addr: str, - model_name: str): - """reset the session.""" - state_chatbot = [] - log_level = os.environ.get('SERVICE_LOG_LEVEL', 'INFO') - llama_chatbot = Chatbot(triton_server_addr, - model_name, - log_level=log_level, - display=True) - - return ( - llama_chatbot, - state_chatbot, - state_chatbot, - gr.Textbox.update(value=''), - ) - - -def cancel_func( - state_chatbot: gr.State, - llama_chatbot: gr.State, - cancel_btn: gr.Button, - reset_btn: gr.Button, -): - """cancel the session.""" - yield (llama_chatbot, state_chatbot, disable_btn, disable_btn) - session_id = llama_chatbot._session.session_id - llama_chatbot.cancel(session_id) - - yield (llama_chatbot, state_chatbot, disable_btn, enable_btn) - - -def add_instruction(instruction, state_chatbot): - state_chatbot = state_chatbot + [(instruction, None)] - return ('', state_chatbot) - - -def run_triton_server(triton_server_addr: str, - server_name: str = 'localhost', - server_port: int = 6006): - """chat with AI assistant through web ui. 
- - Args: - triton_server_addr (str): the communication address of inference server - server_name (str): the ip address of gradio server - server_port (int): the port of gradio server - """ - with gr.Blocks(css=CSS, theme=THEME) as demo: - log_level = os.environ.get('SERVICE_LOG_LEVEL', 'INFO') - llama_chatbot = gr.State( - Chatbot(triton_server_addr, log_level=log_level, display=True)) - state_chatbot = gr.State([]) - state_session_id = gr.State(0) - model_name = llama_chatbot.value.model_name - reset_all = partial(reset_all_func, - model_name=model_name, - triton_server_addr=triton_server_addr) - - with gr.Column(elem_id='container'): - gr.Markdown('## LMDeploy Playground') - - chatbot = gr.Chatbot(elem_id='chatbot', label=model_name) - instruction_txtbox = gr.Textbox( - placeholder='Please input the instruction', - label='Instruction') - with gr.Row(): - cancel_btn = gr.Button(value='Cancel', interactive=False) - reset_btn = gr.Button(value='Reset') - with gr.Row(): - request_output_len = gr.Slider(1, - 2048, - value=512, - step=1, - label='Maximum new tokens') - top_p = gr.Slider(0.01, 1, value=0.8, step=0.01, label='Top_p') - temperature = gr.Slider(0.01, - 1.5, - value=0.7, - step=0.01, - label='Temperature') - - send_event = instruction_txtbox.submit( - add_instruction, [instruction_txtbox, state_chatbot], - [instruction_txtbox, state_chatbot]).then(chat_stream, [ - state_chatbot, llama_chatbot, cancel_btn, reset_btn, - state_session_id, top_p, temperature, request_output_len - ], [state_chatbot, chatbot, cancel_btn, reset_btn]) - - cancel_btn.click(cancel_func, - [state_chatbot, llama_chatbot, cancel_btn, reset_btn], - [llama_chatbot, chatbot, cancel_btn, reset_btn], - cancels=[send_event]) - - reset_btn.click( - reset_all, [instruction_txtbox, state_chatbot, llama_chatbot], - [llama_chatbot, state_chatbot, chatbot, instruction_txtbox], - cancels=[send_event]) - - def init(): - with InterFace.lock: - InterFace.global_session_id += 1 - new_session_id = InterFace.global_session_id - return new_session_id - - demo.load(init, inputs=None, outputs=[state_session_id]) - - print(f'server is gonna mount on: http://{server_name}:{server_port}') - demo.queue(concurrency_count=4, max_size=100, api_open=True).launch( - max_threads=10, - share=True, - server_port=server_port, - server_name=server_name, - ) From 20d7c47208d70bf1160ee01a17589177a8dbbe10 Mon Sep 17 00:00:00 2001 From: lvhan028 Date: Tue, 16 Jul 2024 23:54:48 +0800 Subject: [PATCH 08/14] remove triton_client CLI --- lmdeploy/cli/serve.py | 25 ------------------------- 1 file changed, 25 deletions(-) diff --git a/lmdeploy/cli/serve.py b/lmdeploy/cli/serve.py index 031d09e302..38a760d849 100644 --- a/lmdeploy/cli/serve.py +++ b/lmdeploy/cli/serve.py @@ -196,23 +196,6 @@ def add_parser_api_client(): 'api key will be used') ArgumentHelper.session_id(parser) - @staticmethod - def add_parser_triton_client(): - """Add parser for triton_client command.""" - parser = SubCliServe.subparsers.add_parser( - 'triton_client', - formatter_class=DefaultsAndTypesHelpFormatter, - description=SubCliServe.triton_client.__doc__, - help=SubCliServe.triton_client.__doc__) - parser.set_defaults(run=SubCliServe.triton_client) - parser.add_argument( - 'tritonserver_addr', - type=str, - help='The address in format "ip:port" of triton inference server') - ArgumentHelper.session_id(parser) - ArgumentHelper.cap(parser) - ArgumentHelper.stream_output(parser) - @staticmethod def gradio(args): """Serve LLMs with web UI using gradio.""" @@ -331,16 +314,8 @@ def 
api_client(args): kwargs = convert_args(args) run_api_client(**kwargs) - @staticmethod - def triton_client(args): - """Interact with Triton Server using gRPC protocol.""" - from lmdeploy.serve.client import main as run_triton_client - kwargs = convert_args(args) - run_triton_client(**kwargs) - @staticmethod def add_parsers(): SubCliServe.add_parser_gradio() SubCliServe.add_parser_api_server() SubCliServe.add_parser_api_client() - SubCliServe.add_parser_triton_client() From b1ae7a50fc7a2598c994d39d40e8a7359596f85b Mon Sep 17 00:00:00 2001 From: lvhan028 Date: Wed, 17 Jul 2024 11:47:19 +0800 Subject: [PATCH 09/14] fix --- lmdeploy/serve/turbomind/__init__.py | 1 + lmdeploy/turbomind/deploy/converter.py | 75 ++++++++++++++------------ 2 files changed, 41 insertions(+), 35 deletions(-) create mode 100644 lmdeploy/serve/turbomind/__init__.py diff --git a/lmdeploy/serve/turbomind/__init__.py b/lmdeploy/serve/turbomind/__init__.py new file mode 100644 index 0000000000..ef101fec61 --- /dev/null +++ b/lmdeploy/serve/turbomind/__init__.py @@ -0,0 +1 @@ +# Copyright (c) OpenMMLab. All rights reserved. diff --git a/lmdeploy/turbomind/deploy/converter.py b/lmdeploy/turbomind/deploy/converter.py index 4b9c758f1c..2bda0d4aba 100644 --- a/lmdeploy/turbomind/deploy/converter.py +++ b/lmdeploy/turbomind/deploy/converter.py @@ -2,7 +2,6 @@ import os import os.path as osp import shutil -from pathlib import Path import fire import torch @@ -19,12 +18,6 @@ SUPPORTED_FORMATS = ['meta_llama', 'hf', 'awq', None] -def get_package_root_path(): - """Get lmdeploy root path.""" - import lmdeploy - return Path(lmdeploy.__file__).parent - - def get_input_model_registered_name(model_path: str, model_format: str): """Get the registered name of a model. The name will be used to access the INPUT_MODELS registry. @@ -51,30 +44,41 @@ def create_workspace(_path: str): print(f'remove workspace in directory {_path}') shutil.rmtree(_path) print(f'create workspace in directory {_path}') - os.makedirs(_path) - - -def copy_triton_model_templates(_path: str): - """copy triton model templates to the specified path. - - Args: - _path (str): the target path - Returns: - str: the path of the triton models - """ - - root = get_package_root_path() - dir_path = osp.join(root, 'serve', 'turbomind') - triton_models_path = osp.join(dir_path, 'triton_models') - dst_path = osp.join(_path, 'triton_models') - print(f'copy triton model templates from "{triton_models_path}" to ' - f'"{dst_path}"') - shutil.copytree(triton_models_path, dst_path, symlinks=True) - service_docker_up_file = osp.join(dir_path, 'service_docker_up.sh') - print(f'copy service_docker_up.sh from "{service_docker_up_file}" to ' - f'"{_path}"') - shutil.copy(osp.join(dir_path, 'service_docker_up.sh'), _path) - return dst_path + weight_path = osp.join(_path, 'triton_models', 'weights') + tokenizer_path = osp.join(_path, 'triton_models', 'tokenizer') + os.makedirs(weight_path) + os.makedirs(tokenizer_path) + return weight_path, tokenizer_path + + +def copy_tokenizer(model_path: str, tokenizer_path: str, + tm_tokenizer_path: str, trust_remote_code: bool): + """Copy tokenizer.""" + + if tokenizer_path is not None: + assert osp.exists(tokenizer_path), f'{tokenizer_path} does not exists.' 
+ + shutil.copy(tokenizer_path, + osp.join(tm_tokenizer_path, osp.basename(tokenizer_path))) + else: + from transformers import AutoTokenizer + try: + _ = AutoTokenizer.from_pretrained( + model_path, trust_remote_code=trust_remote_code) + except Exception as e: + assert 0, f'{e}' + + # move tokenizer model to the target path + candidate = ['tokenizer.model', 'qwen.tiktoken', 'merges.txt'] + for name in candidate: + tmp_path = osp.join(model_path, name) + if osp.exists(tmp_path): + shutil.copy(tmp_path, osp.join(tm_tokenizer_path, name)) + # copy py/json files that are related to tokenizer to the target path + for _file in os.listdir(model_path): + if _file.endswith('.json') or _file.endswith('.py'): + json_path = osp.join(model_path, _file) + shutil.copy(json_path, osp.join(tm_tokenizer_path, _file)) def get_output_model_registered_name_and_config(model_path: str, @@ -166,6 +170,7 @@ def main(model_name: str, tp: int = 1, quant_path: str = None, group_size: int = 0, + trust_remote_code: bool = False, revision: str = None, download_dir: str = None, **kwargs): @@ -237,17 +242,17 @@ def main(model_name: str, cfg.model_name = model_name cfg.tensor_para_size = tp - create_workspace(dst_path) + tm_weight_path, tm_tokenizer_path = create_workspace(dst_path) - weight_path = osp.join(dst_path, 'triton_models', 'weights') - os.makedirs(weight_path) + copy_tokenizer(model_path, tokenizer_path, tm_tokenizer_path, + trust_remote_code) input_model = INPUT_MODELS.get(input_model_name)( model_path=model_path, tokenizer_path=tokenizer_path, ckpt_path=quant_path) output_model = OUTPUT_MODELS.get(output_model_name)( - input_model=input_model, cfg=cfg, to_file=True, out_dir=weight_path) + input_model=input_model, cfg=cfg, to_file=True, out_dir=tm_weight_path) print(f'turbomind model config: {output_model.cfg}') output_model.export() From e1ba6dec0cd9edc271814e931029eb0cfbfafecf Mon Sep 17 00:00:00 2001 From: lvhan028 Date: Wed, 17 Jul 2024 12:04:19 +0800 Subject: [PATCH 10/14] remove triton_utils.hpp --- src/turbomind/triton_backend/triton_utils.hpp | 57 ------------------- 1 file changed, 57 deletions(-) delete mode 100644 src/turbomind/triton_backend/triton_utils.hpp diff --git a/src/turbomind/triton_backend/triton_utils.hpp b/src/turbomind/triton_backend/triton_utils.hpp deleted file mode 100644 index a87dd7d6f4..0000000000 --- a/src/turbomind/triton_backend/triton_utils.hpp +++ /dev/null @@ -1,57 +0,0 @@ -/* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include "src/turbomind/triton_backend/transformer_triton_backend.hpp" -#include "src/turbomind/utils/Tensor.h" - -namespace ft = turbomind; - -template -void move_tensor_H2D(const triton::Tensor& tensor, - T*& d_ptr, - const std::unique_ptr>* allocator) -{ - if (tensor.where == triton::MEMORY_GPU) { - return; - } - - size_t tensor_size = 1; - for (auto t : tensor.shape) { - tensor_size *= t; - } - - cudaStream_t stream = (*allocator)->returnStream(); - - d_ptr = (T*)((*allocator)->reMalloc(d_ptr, sizeof(T) * tensor_size, false)); - ft::check_cuda_error(cudaMemcpyAsync(d_ptr, (T*)tensor.data, sizeof(T) * tensor_size, cudaMemcpyDefault, stream)); -} - -template -ft::Tensor as_GPU_tensor(const triton::Tensor& tensor, T* d_ptr) -{ - return ft::Tensor{ft::MEMORY_GPU, - triton::Tensor::convertTritonTypeToFt(tensor.type), - tensor.shape, - tensor.where == triton::MEMORY_CPU ? d_ptr : tensor.data}; -} - -inline ft::Tensor as_CPU_tensor(const triton::Tensor& tensor) -{ - ft::FT_CHECK(tensor.where == triton::MEMORY_CPU); - return ft::Tensor{ft::MEMORY_CPU, triton::Tensor::convertTritonTypeToFt(tensor.type), tensor.shape, tensor.data}; -} From e6c99acc967f985c6bd4f9de0672ea0f8f058cd3 Mon Sep 17 00:00:00 2001 From: lvhan028 Date: Wed, 17 Jul 2024 12:28:53 +0800 Subject: [PATCH 11/14] fix --- src/turbomind/triton_backend/llama/LlamaTritonModelInstance.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/src/turbomind/triton_backend/llama/LlamaTritonModelInstance.cc b/src/turbomind/triton_backend/llama/LlamaTritonModelInstance.cc index f9381a03fb..e3ce79826d 100644 --- a/src/turbomind/triton_backend/llama/LlamaTritonModelInstance.cc +++ b/src/turbomind/triton_backend/llama/LlamaTritonModelInstance.cc @@ -21,7 +21,6 @@ #include "src/turbomind/triton_backend/llama/LlamaTritonModelInstance.h" #include "src/turbomind/macro.h" #include "src/turbomind/triton_backend/transformer_triton_backend.hpp" -#include "src/turbomind/triton_backend/triton_utils.hpp" #include "src/turbomind/utils/Tensor.h" #include "src/turbomind/utils/constant.h" #include "src/turbomind/utils/cuda_utils.h" From 2cb9a853704c1a4c88a54d1087b0f6a40474455b Mon Sep 17 00:00:00 2001 From: lvhan028 Date: Wed, 17 Jul 2024 18:09:00 +0800 Subject: [PATCH 12/14] fix typo --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index a9d8be3022..faaa1cba33 100644 --- a/README.md +++ b/README.md @@ -26,7 +26,7 @@ ______________________________________________________________________
2024 -- \[2024/07\] Support [InternVL2](https://huggingface.co/collections/OpenGVLab/internvl-20-667d3961ab5eb12c7ed1463e) full-serie models, [InternLM-XComposer2.5](docs/en/multi_modal/xcomposer2d5.md) and [function call](docs/en/serving/api_server_tools.md) of InternLM2.5 +- \[2024/07\] Support [InternVL2](https://huggingface.co/collections/OpenGVLab/internvl-20-667d3961ab5eb12c7ed1463e) full-series models, [InternLM-XComposer2.5](docs/en/multi_modal/xcomposer2d5.md) and [function call](docs/en/serving/api_server_tools.md) of InternLM2.5 - \[2024/06\] PyTorch engine support DeepSeek-V2 and several VLMs, such as CogVLM2, Mini-InternVL, LlaVA-Next - \[2024/05\] Balance vision model when deploying VLMs with multiple GPUs - \[2024/05\] Support 4-bits weight-only quantization and inference on VLMs, such as InternVL v1.5, LLaVa, InternLMXComposer2 From 9a66b2c7a09e1f0f8a3f99a3fe910feb8bdc2751 Mon Sep 17 00:00:00 2001 From: lvhan028 Date: Wed, 17 Jul 2024 18:54:03 +0800 Subject: [PATCH 13/14] update manifest.in --- MANIFEST.in | 2 -- 1 file changed, 2 deletions(-) diff --git a/MANIFEST.in b/MANIFEST.in index ab9b0b57bf..32e432f498 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -4,5 +4,3 @@ include lmdeploy/lib/*.so* include lmdeploy/lib/*.dll include lmdeploy/lib/*.pyd include lmdeploy/bin/* -include lmdeploy/serve/turbomind/service_docker_up.sh -recursive-include lmdeploy/serve/turbomind/triton_models * From 8d319e2ac077973b9f55ffc8b5416f1b6d6a608f Mon Sep 17 00:00:00 2001 From: lvhan028 Date: Wed, 17 Jul 2024 19:03:23 +0800 Subject: [PATCH 14/14] fix as suggested --- lmdeploy/turbomind/deploy/converter.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/lmdeploy/turbomind/deploy/converter.py b/lmdeploy/turbomind/deploy/converter.py index 2bda0d4aba..6c93d39c42 100644 --- a/lmdeploy/turbomind/deploy/converter.py +++ b/lmdeploy/turbomind/deploy/converter.py @@ -52,7 +52,7 @@ def create_workspace(_path: str): def copy_tokenizer(model_path: str, tokenizer_path: str, - tm_tokenizer_path: str, trust_remote_code: bool): + tm_tokenizer_path: str): """Copy tokenizer.""" if tokenizer_path is not None: @@ -63,8 +63,8 @@ def copy_tokenizer(model_path: str, tokenizer_path: str, else: from transformers import AutoTokenizer try: - _ = AutoTokenizer.from_pretrained( - model_path, trust_remote_code=trust_remote_code) + _ = AutoTokenizer.from_pretrained(model_path, + trust_remote_code=True) except Exception as e: assert 0, f'{e}' @@ -170,7 +170,6 @@ def main(model_name: str, tp: int = 1, quant_path: str = None, group_size: int = 0, - trust_remote_code: bool = False, revision: str = None, download_dir: str = None, **kwargs): @@ -244,8 +243,7 @@ def main(model_name: str, tm_weight_path, tm_tokenizer_path = create_workspace(dst_path) - copy_tokenizer(model_path, tokenizer_path, tm_tokenizer_path, - trust_remote_code) + copy_tokenizer(model_path, tokenizer_path, tm_tokenizer_path) input_model = INPUT_MODELS.get(input_model_name)( model_path=model_path,