Update TensorRT-LLM (#1639)
* Update TensorRT-LLM

---------

Co-authored-by: vonjackustc <[email protected]>
kaiyux and vonjackustc authored May 21, 2024
1 parent b189b61 commit 5d8ca2f
Showing 165 changed files with 511,601 additions and 517,367 deletions.
3 changes: 2 additions & 1 deletion benchmarks/python/benchmark.py
@@ -232,7 +232,8 @@ def parse_arguments():
choices=[
'fp8', 'fp8_gemm', 'fp8_kv_cache', 'int8_sq_per_tensor',
'int8_sq_per_token_channel', 'int8_weight_only', 'int4_weight_only',
'int4_weight_only_awq', 'int4_weight_only_gptq'
'int4_weight_only_awq', 'int4_weight_only_gptq',
'int8_sq_per_channel_ootb'
],
help="Optimize the model with specified quantization recipe")
parser.add_argument(
2 changes: 2 additions & 0 deletions benchmarks/python/build.py
@@ -220,6 +220,8 @@ def get_quant_config(quantization: str):
elif quantization == "int8_sq_per_token_channel":
return QuantConfig(
quant_algo=QuantAlgo.W8A8_SQ_PER_CHANNEL_PER_TOKEN_PLUGIN)
elif quantization == "int8_sq_per_channel_ootb":
return QuantConfig(quant_algo=QuantAlgo.W8A8_SQ_PER_CHANNEL)
elif quantization == "int8_weight_only":
return QuantConfig(quant_algo=QuantAlgo.W8A16)
elif quantization == "int4_weight_only":
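The new int8_sq_per_channel_ootb recipe maps to QuantAlgo.W8A8_SQ_PER_CHANNEL, while the existing int8_sq_per_token_channel recipe keeps the plugin variant; "ootb" appears to stand for "out of the box", i.e. the non-plugin path. A hypothetical C++ restatement of that dispatch, only to make the mapping explicit (the enum and function names below are illustrative; the real code is the Python shown above):

#include <stdexcept>
#include <string>

enum class QuantAlgo
{
    kW8A8SqPerChannelPerTokenPlugin,
    kW8A8SqPerChannel,
    kW8A16,
};

QuantAlgo getQuantAlgo(std::string const& quantization)
{
    if (quantization == "int8_sq_per_token_channel")
        return QuantAlgo::kW8A8SqPerChannelPerTokenPlugin;
    if (quantization == "int8_sq_per_channel_ootb") // new choice in this commit
        return QuantAlgo::kW8A8SqPerChannel;
    if (quantization == "int8_weight_only")
        return QuantAlgo::kW8A16;
    throw std::invalid_argument("unsupported quantization: " + quantization);
}
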
8 changes: 4 additions & 4 deletions cpp/include/tensorrt_llm/runtime/decodingOutput.h
@@ -35,14 +35,14 @@ class DecodingOutput
class BeamHypotheses
{
public:
// The same as cpp/tensorrt_llm/kernels/beamSearchKernels.h
// Keep same as cpp/tensorrt_llm/kernels/beamSearchKernels.h
TensorPtr outputIdsCBA; // [BS, BM*2, MSL]
TensorPtr sequenceLengthsCBA; // [BS, BM]
TensorPtr logProbsCBA; // [BS, BM*2, MSL]
TensorPtr sequenceLengthsCBA; // [BS, BM*2]
TensorPtr cumLogProbsCBA; // [BS, BM*2]
TensorPtr normedScoresCBA; // [BS, BM*2]
TensorPtr logProbsCBA; // [BS, BM*2, MSL]
TensorPtr minNormedScoresCBA; // [BS]
TensorPtr numBeamsCBA; // [BS]
TensorPtr minNormedScoresCBA; // [BS]
TensorPtr batchDones; // [BS]

void empty(BufferManager& manager);
version.txt (libtensorrt_llm_batch_manager_static)
@@ -1,3 +1,3 @@
ed2ee8d73a5d374e800f653169bf293e libtensorrt_llm_batch_manager_static.a
ed2ee8d73a5d374e800f653169bf293e libtensorrt_llm_batch_manager_static.pre_cxx11.a
f088526f4bce4b1143c67973b3502734c3491ab9 commit
05aaf1a0fb2f0115af107b00aa839a6601f6a873 commit
3 changes: 3 additions & 0 deletions cpp/tensorrt_llm/common/cudaFp8Utils.cu
@@ -206,6 +206,9 @@ __device__ __nv_bfloat16 atomicMaxExtd(__nv_bfloat16* address, __nv_bfloat16 val
}

return __ushort_as_bfloat16(old);
#else
asm volatile(" brkpt;\n");
return 0;
#endif
}

3 changes: 3 additions & 0 deletions cpp/tensorrt_llm/common/cudaTypeUtils.cuh
@@ -597,6 +597,9 @@ __device__ inline __nv_bfloat16 cuda_max(__nv_bfloat162 val)
{
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800))
return __hmax(val.x, val.y);
#else
asm volatile(" brkpt;\n");
return 0;
#endif
}
#endif
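Both additions above follow the same pattern: when the surrounding function is compiled for an architecture without the required bf16 instructions, the #else branch emits a PTX brkpt so the kernel traps instead of silently returning a wrong value, and the return 0 exists only to satisfy the compiler. A minimal standalone sketch of that guard pattern (the helper and kernel names are illustrative, not TensorRT-LLM code):

#include <cuda_bf16.h>

__device__ inline __nv_bfloat16 bf16MaxOrTrap(__nv_bfloat16 a, __nv_bfloat16 b)
{
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
    return __hmax(a, b);            // native bf16 max, available on SM80 and newer
#else
    asm volatile(" brkpt;\n");      // unsupported architecture: trap instead of mis-computing
    return __float2bfloat16(0.0f);  // unreachable; keeps the compiler satisfied
#endif
}

__global__ void maxKernel(__nv_bfloat16 const* in, __nv_bfloat16* out, int n)
{
    __nv_bfloat16 best = in[0];
    for (int i = 1; i < n; ++i)
    {
        best = bf16MaxOrTrap(best, in[i]);
    }
    *out = best;
}
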
6 changes: 3 additions & 3 deletions cpp/tensorrt_llm/executor/aarch64-linux-gnu/version.txt
@@ -1,3 +1,3 @@
3c423e837e67ce86756ea468438c41a8 libtensorrt_llm_executor_static.a
3c423e837e67ce86756ea468438c41a8 libtensorrt_llm_executor_static.pre_cxx11.a
f088526f4bce4b1143c67973b3502734c3491ab9 commit
54670adde093baff8b031869bdeeeb1b libtensorrt_llm_executor_static.a
54670adde093baff8b031869bdeeeb1b libtensorrt_llm_executor_static.pre_cxx11.a
05aaf1a0fb2f0115af107b00aa839a6601f6a873 commit
6 changes: 6 additions & 0 deletions cpp/tensorrt_llm/executor_worker/executorWorker.cpp
@@ -14,6 +14,7 @@
* limitations under the License.
*/

#include "tensorrt_llm/common/cudaUtils.h"
#include "tensorrt_llm/common/logger.h"
#include "tensorrt_llm/common/mpiUtils.h"
#include "tensorrt_llm/executor/executor.h"
@@ -46,6 +47,11 @@ int main(int argc, char* argv[])
return -1;
}

// TRT-LLM event synchronization sometimes takes extra time to complete
// after the kernel has finished. Using a yield in the wait helps improve
// performance.
TLLM_CUDA_CHECK(::cudaSetDeviceFlags(cudaDeviceScheduleYield));

// Since parentComm is an intercommunicator, input root
// is the rank of the parent process in his group
// (always 0 as the parent size is checked before)
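cudaDeviceScheduleYield changes how the host thread waits on CUDA synchronization calls: instead of spin-waiting for the device, it yields its time slice, which the comment above credits with better performance when event synchronization completes some time after the kernel itself. A minimal standalone sketch of the call (plain CUDA runtime error handling stands in for TLLM_CUDA_CHECK):

#include <cuda_runtime_api.h>
#include <cstdio>

int main()
{
    // Ask the runtime to yield the calling thread while it waits on the device.
    cudaError_t status = cudaSetDeviceFlags(cudaDeviceScheduleYield);
    if (status != cudaSuccess)
    {
        std::fprintf(stderr, "cudaSetDeviceFlags failed: %s\n", cudaGetErrorString(status));
        return 1;
    }

    // Any later event wait on this thread now yields instead of spinning.
    cudaEvent_t event;
    cudaEventCreate(&event);
    cudaEventRecord(event, nullptr); // record on the default stream
    cudaEventSynchronize(event);
    cudaEventDestroy(event);
    return 0;
}
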
19 changes: 10 additions & 9 deletions cpp/tensorrt_llm/kernels/beamSearchKernels.h
@@ -29,18 +29,18 @@ struct BeamHypotheses
{
// clang-format off

// BS: batch_size, BM: beam_width, MSL: max_seq_length
// %%: parameter name when dynamic_decoder.forward() / gather_tree() are called in [generation.py] (python workflow)
// MBS: max_batch_size, BS: batch_size, BM: beam_width, MSL: max_seq_length
// %%: parameter name in file generation.py (python workflow)

// Candidate beams: a beam which generates end_id or its sequence length reaches MSL
// Candidate-Beam-Array (CBA): The arrays (size: BM*2) to place the candidate beams and related information
// Candidate-Beam-Array (CBA): The arrays to place the candidate beams and related information

// Scalar values
bool bReturnNormedScore{false}; // return normed_score / cum_log_probs, useless yet
int nBatchSize{0}; //
int nMaxBatchSize{0}; // max batch size by model configuration
int nBatchSize{0}; // batch size by runtime input data
int nBeamWidth{0}; //
int nIte{0}; // index of local_batch, always be 0 when pp_size==1
int nBatchSizeLocal{0}; //
int nMaxSeqLen{0}; //
int nVocabSize{0}; // vocab_size_padded

@@ -54,8 +54,9 @@ struct BeamHypotheses
int const* endIds{nullptr}; // [BS, BM] %% self.end_ids

// Pointers for output
int* outputIds{nullptr}; // [BS, BM, MSL] %% self.output_ids
float* logProbs{nullptr}; // [MSL, BS, BM] %% self.log_probs_tiled
int* outputIds{nullptr}; // [BS, BM, MSL] %% self.output_ids only used in gather_tree
float* logProbs{nullptr}; // [BS, BM, MSL] %% self.log_probs only used in gather_tree
float* logProbsTiled{nullptr}; // [MSL, MBS, BM] %% self.log_probs_tiled
int* sequenceLengths{nullptr}; // [BS, BM] %% self.sequence_length_buffer
float* cumLogProbs{nullptr}; // [BS, BM] %% self.cum_log_probs

@@ -65,8 +66,8 @@ struct BeamHypotheses
int* sequenceLengthsCBA{nullptr}; // [BS, BM*2] %% self.beam_hyps_seq_len_cba
float* cumLogProbsCBA{nullptr}; // [BS, BM*2] %% self.beam_hyps_cum_log_probs_cba
float* normedScoresCBA{nullptr}; // [BS, BM*2] %% self.beam_hyps_normed_scores_cba
int* numBeamsCBA{nullptr}; // [BS] %% self.beam_hyps_num_beams number of beams in CBA
float* minNormedScoresCBA{nullptr}; // [BS] %% self.beam_hyps_min_normed_scores worst score in CBA
int* numBeamsCBA{nullptr}; // [BS] %% self.beam_hyps_num_beams number of beams in CBA
float* minNormedScoresCBA{nullptr}; // [BS] %% self.beam_hyps_min_normed_scores worst score in CBA

// Pointers related to beam search process, they are initialized in those two functions:
// [gptDecoder.cpp] GptDecoder<T>::forward or [dynamicDecodeOp.cpp] FtDynamicDecode<T>::forward
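The shape comments above all use the same layout convention: CBA buffers hold up to BM*2 candidate beams per batch entry, stored row-major. A small host-side sketch (not TensorRT-LLM code) of how those documented shapes flatten into linear offsets:

#include <cstddef>

// Offset of candidate beam `cba` of batch entry `b` in a [BS, BM*2] buffer,
// e.g. sequenceLengthsCBA, cumLogProbsCBA, normedScoresCBA.
inline std::size_t cbaOffset(std::size_t b, std::size_t cba, std::size_t beamWidth)
{
    return b * (2 * beamWidth) + cba;
}

// Offset of token `t` of candidate beam `cba` of batch entry `b` in a
// [BS, BM*2, MSL] buffer, e.g. outputIdsCBA, logProbsCBA.
inline std::size_t cbaTokenOffset(
    std::size_t b, std::size_t cba, std::size_t t, std::size_t beamWidth, std::size_t maxSeqLen)
{
    return (b * (2 * beamWidth) + cba) * maxSeqLen + t;
}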