Remove the triton inference server backend "turbomind_backend" #1986

Merged 15 commits on Jul 17, 2024
23 changes: 12 additions & 11 deletions CMakeLists.txt
@@ -43,20 +43,21 @@ option(BUILD_PY_FFI "Build python ffi" ON)
 option(BUILD_TEST "Build tests" OFF)

 include(FetchContent)
+if (BUILD_TEST)
-FetchContent_Declare(
-    repo-cutlass
-    GIT_REPOSITORY https://github.com/NVIDIA/cutlass.git
-    GIT_TAG 6f47420213f757831fae65c686aa471749fa8d60
-    GIT_SHALLOW ON
-)
+    FetchContent_Declare(
+        repo-cutlass
+        GIT_REPOSITORY https://github.com/NVIDIA/cutlass.git
+        GIT_TAG 6f47420213f757831fae65c686aa471749fa8d60
+        GIT_SHALLOW ON
+    )

-set(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")
+    set(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")

-FetchContent_MakeAvailable(repo-cutlass)
+    FetchContent_MakeAvailable(repo-cutlass)

-set(CUTLASS_HEADER_DIR ${PROJECT_SOURCE_DIR}/3rdparty/cutlass/include)
-set(CUTLASS_EXTENSIONS_DIR ${PROJECT_SOURCE_DIR}/src/turbomind/cutlass_extensions/include)
+    set(CUTLASS_HEADER_DIR ${PROJECT_SOURCE_DIR}/3rdparty/cutlass/include)
+    set(CUTLASS_EXTENSIONS_DIR ${PROJECT_SOURCE_DIR}/src/turbomind/cutlass_extensions/include)
+endif()

option(SPARSITY_SUPPORT "Build project with Ampere sparsity feature support" OFF)

2 changes: 0 additions & 2 deletions MANIFEST.in
@@ -4,5 +4,3 @@ include lmdeploy/lib/*.so*
 include lmdeploy/lib/*.dll
 include lmdeploy/lib/*.pyd
 include lmdeploy/bin/*
-include lmdeploy/serve/turbomind/service_docker_up.sh
-recursive-include lmdeploy/serve/turbomind/triton_models *
2 changes: 1 addition & 1 deletion README.md
@@ -26,7 +26,7 @@ ______________________________________________________________________
 <details open>
 <summary><b>2024</b></summary>

-- \[2024/07\] Support [InternVL2](https://huggingface.co/collections/OpenGVLab/internvl-20-667d3961ab5eb12c7ed1463e) full-serie models, [InternLM-XComposer2.5](docs/en/multi_modal/xcomposer2d5.md) and [function call](docs/en/serving/api_server_tools.md) of InternLM2.5
+- \[2024/07\] Support [InternVL2](https://huggingface.co/collections/OpenGVLab/internvl-20-667d3961ab5eb12c7ed1463e) full-series models, [InternLM-XComposer2.5](docs/en/multi_modal/xcomposer2d5.md) and [function call](docs/en/serving/api_server_tools.md) of InternLM2.5
 - \[2024/06\] PyTorch engine support DeepSeek-V2 and several VLMs, such as CogVLM2, Mini-InternVL, LlaVA-Next
 - \[2024/05\] Balance vision model when deploying VLMs with multiple GPUs
 - \[2024/05\] Support 4-bits weight-only quantization and inference on VLMs, such as InternVL v1.5, LLaVa, InternLMXComposer2
260 changes: 0 additions & 260 deletions benchmark/profile_serving.py

This file was deleted.

58 changes: 0 additions & 58 deletions docs/en/benchmark/profile_triton_server.md

This file was deleted.
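With `benchmark/profile_serving.py` and the Triton server benchmark guide deleted by this PR, serving benchmarks go through the OpenAI-compatible `api_server` path that remains documented elsewhere (e.g. `benchmark/profile_api_server.md`). Below is a minimal sketch of timing a single request against such a server; the port, model id, and payload fields are assumptions for illustration, not content of this PR.

```python
# Minimal sketch (assumptions: an lmdeploy api_server is already running on
# localhost:23333, its default port, and exposes the OpenAI-compatible
# /v1/chat/completions route; the model id and prompt are placeholders).
import time

import requests

URL = "http://localhost:23333/v1/chat/completions"
payload = {
    "model": "internlm2",  # hypothetical model id registered with the server
    "messages": [{"role": "user", "content": "Hello, who are you?"}],
    "max_tokens": 128,
}

start = time.time()
resp = requests.post(URL, json=payload, timeout=60)
resp.raise_for_status()
elapsed = time.time() - start

answer = resp.json()["choices"][0]["message"]["content"]
print(f"round-trip latency: {elapsed:.2f}s")
print(answer)
```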

1 change: 0 additions & 1 deletion docs/en/index.rst
@@ -58,7 +58,6 @@ Documentation
 benchmark/profile_generation.md
 benchmark/profile_throughput.md
 benchmark/profile_api_server.md
-benchmark/profile_triton_server.md
 benchmark/evaluate_with_opencompass.md

 .. _supported_models:
2 changes: 0 additions & 2 deletions docs/en/inference/turbomind.md
@@ -57,8 +57,6 @@ Our implementation of the LLaMa family models is modified from Gpt-NeoX model in

 TurboMind supports a Python API that enables streaming output and tensor parallel mode.

-The ability to use [tritonserver](https://github.com/triton-inference-server/server) for serving is also inherited from FasterTransformer. However, to support submitting concurrent requests into our persistent batch model, we no longer use sequence batching or dynamic batching as FasterTransformer does. The bookkeeping of request and sequence states are managed by TurboMind instead.
-
 ## Difference between FasterTransformer and TurboMind

 Apart of the features described above, there are still many minor differences that we don't cover in this document. Notably, many capabilities of FT are dropped in TurboMind because of the difference in objectives (e.g. prefix prompt, beam search, context embedding, sparse GEMM, GPT/T5/other model families, etc)
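The paragraph removed in the hunk above described the Triton-inherited serving path; the Python API mentioned in the surviving context line is the route that remains. Below is a minimal sketch of streaming generation with tensor parallelism through lmdeploy's pipeline API; the model id, the `tp` value, and the exact fields of the streamed response are assumptions for illustration, not text from this PR.

```python
# Minimal sketch (assumptions: lmdeploy is installed with the TurboMind engine,
# two GPUs are available for tp=2, and the model id below is a placeholder).
from lmdeploy import GenerationConfig, TurbomindEngineConfig, pipeline

pipe = pipeline(
    "internlm/internlm2-chat-7b",                # hypothetical model id
    backend_config=TurbomindEngineConfig(tp=2),  # tensor parallel across 2 GPUs
)

gen_config = GenerationConfig(max_new_tokens=256)

# stream_infer yields incremental responses instead of waiting for the full answer
for chunk in pipe.stream_infer("Explain persistent batching briefly.", gen_config=gen_config):
    print(chunk.text, end="", flush=True)
```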