Update TensorRT-LLM (#1639)
* Update TensorRT-LLM

---------

Co-authored-by: vonjackustc <[email protected]>
kaiyux and vonjackustc authored May 21, 2024
1 parent b189b61 commit 5d8ca2f
Showing 165 changed files with 511,601 additions and 517,367 deletions.
3 changes: 2 additions & 1 deletion benchmarks/python/benchmark.py
@@ -232,7 +232,8 @@ def parse_arguments():
choices=[
'fp8', 'fp8_gemm', 'fp8_kv_cache', 'int8_sq_per_tensor',
'int8_sq_per_token_channel', 'int8_weight_only', 'int4_weight_only',
'int4_weight_only_awq', 'int4_weight_only_gptq'
'int4_weight_only_awq', 'int4_weight_only_gptq',
'int8_sq_per_channel_ootb'
],
help="Optimize the model with specified quantization recipe")
parser.add_argument(
2 changes: 2 additions & 0 deletions benchmarks/python/build.py
@@ -220,6 +220,8 @@ def get_quant_config(quantization: str):
elif quantization == "int8_sq_per_token_channel":
return QuantConfig(
quant_algo=QuantAlgo.W8A8_SQ_PER_CHANNEL_PER_TOKEN_PLUGIN)
elif quantization == "int8_sq_per_channel_ootb":
return QuantConfig(quant_algo=QuantAlgo.W8A8_SQ_PER_CHANNEL)
elif quantization == "int8_weight_only":
return QuantConfig(quant_algo=QuantAlgo.W8A16)
elif quantization == "int4_weight_only":
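The new int8_sq_per_channel_ootb recipe maps to QuantAlgo.W8A8_SQ_PER_CHANNEL, while the existing int8_sq_per_token_channel recipe keeps the plugin variant; "ootb" appears to stand for "out of the box", i.e. the non-plugin path. A hypothetical C++ restatement of that dispatch, only to make the mapping explicit (the enum and function names below are illustrative; the real code is the Python shown above):

#include <stdexcept>
#include <string>

enum class QuantAlgo
{
    kW8A8SqPerChannelPerTokenPlugin,
    kW8A8SqPerChannel,
    kW8A16,
};

QuantAlgo getQuantAlgo(std::string const& quantization)
{
    if (quantization == "int8_sq_per_token_channel")
        return QuantAlgo::kW8A8SqPerChannelPerTokenPlugin;
    if (quantization == "int8_sq_per_channel_ootb") // new choice in this commit
        return QuantAlgo::kW8A8SqPerChannel;
    if (quantization == "int8_weight_only")
        return QuantAlgo::kW8A16;
    throw std::invalid_argument("unsupported quantization: " + quantization);
}
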
8 changes: 4 additions & 4 deletions cpp/include/tensorrt_llm/runtime/decodingOutput.h
@@ -35,14 +35,14 @@ class DecodingOutput
class BeamHypotheses
{
public:
// The same as cpp/tensorrt_llm/kernels/beamSearchKernels.h
// Keep same as cpp/tensorrt_llm/kernels/beamSearchKernels.h
TensorPtr outputIdsCBA; // [BS, BM*2, MSL]
TensorPtr sequenceLengthsCBA; // [BS, BM]
TensorPtr logProbsCBA; // [BS, BM*2, MSL]
TensorPtr sequenceLengthsCBA; // [BS, BM*2]
TensorPtr cumLogProbsCBA; // [BS, BM*2]
TensorPtr normedScoresCBA; // [BS, BM*2]
TensorPtr logProbsCBA; // [BS, BM*2, MSL]
TensorPtr minNormedScoresCBA; // [BS]
TensorPtr numBeamsCBA; // [BS]
TensorPtr minNormedScoresCBA; // [BS]
TensorPtr batchDones; // [BS]

void empty(BufferManager& manager);
version.txt (libtensorrt_llm_batch_manager_static)
@@ -1,3 +1,3 @@
ed2ee8d73a5d374e800f653169bf293e libtensorrt_llm_batch_manager_static.a
ed2ee8d73a5d374e800f653169bf293e libtensorrt_llm_batch_manager_static.pre_cxx11.a
f088526f4bce4b1143c67973b3502734c3491ab9 commit
05aaf1a0fb2f0115af107b00aa839a6601f6a873 commit
3 changes: 3 additions & 0 deletions cpp/tensorrt_llm/common/cudaFp8Utils.cu
@@ -206,6 +206,9 @@ __device__ __nv_bfloat16 atomicMaxExtd(__nv_bfloat16* address, __nv_bfloat16 val
}

return __ushort_as_bfloat16(old);
#else
asm volatile(" brkpt;\n");
return 0;
#endif
}

3 changes: 3 additions & 0 deletions cpp/tensorrt_llm/common/cudaTypeUtils.cuh
@@ -597,6 +597,9 @@ __device__ inline __nv_bfloat16 cuda_max(__nv_bfloat162 val)
{
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800))
return __hmax(val.x, val.y);
#else
asm volatile(" brkpt;\n");
return 0;
#endif
}
#endif
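Both additions above follow the same pattern: when the surrounding function is compiled for an architecture without the required bf16 instructions, the #else branch emits a PTX brkpt so the kernel traps instead of silently returning a wrong value, and the return 0 exists only to satisfy the compiler. A minimal standalone sketch of that guard pattern (the helper and kernel names are illustrative, not TensorRT-LLM code):

#include <cuda_bf16.h>

__device__ inline __nv_bfloat16 bf16MaxOrTrap(__nv_bfloat16 a, __nv_bfloat16 b)
{
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
    return __hmax(a, b);            // native bf16 max, available on SM80 and newer
#else
    asm volatile(" brkpt;\n");      // unsupported architecture: trap instead of mis-computing
    return __float2bfloat16(0.0f);  // unreachable; keeps the compiler satisfied
#endif
}

__global__ void maxKernel(__nv_bfloat16 const* in, __nv_bfloat16* out, int n)
{
    __nv_bfloat16 best = in[0];
    for (int i = 1; i < n; ++i)
    {
        best = bf16MaxOrTrap(best, in[i]);
    }
    *out = best;
}
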
6 changes: 3 additions & 3 deletions cpp/tensorrt_llm/executor/aarch64-linux-gnu/version.txt
@@ -1,3 +1,3 @@
3c423e837e67ce86756ea468438c41a8 libtensorrt_llm_executor_static.a
3c423e837e67ce86756ea468438c41a8 libtensorrt_llm_executor_static.pre_cxx11.a
f088526f4bce4b1143c67973b3502734c3491ab9 commit
54670adde093baff8b031869bdeeeb1b libtensorrt_llm_executor_static.a
54670adde093baff8b031869bdeeeb1b libtensorrt_llm_executor_static.pre_cxx11.a
05aaf1a0fb2f0115af107b00aa839a6601f6a873 commit
6 changes: 6 additions & 0 deletions cpp/tensorrt_llm/executor_worker/executorWorker.cpp
@@ -14,6 +14,7 @@
* limitations under the License.
*/

#include "tensorrt_llm/common/cudaUtils.h"
#include "tensorrt_llm/common/logger.h"
#include "tensorrt_llm/common/mpiUtils.h"
#include "tensorrt_llm/executor/executor.h"
@@ -46,6 +47,11 @@ int main(int argc, char* argv[])
return -1;
}

// TRT-LLM event synchronization sometimes takes extra time to complete
// after the kernel has finished. Using a yield in the wait helps improve
// performance.
TLLM_CUDA_CHECK(::cudaSetDeviceFlags(cudaDeviceScheduleYield));

// Since parentComm is an intercommunicator, input root
// is the rank of the parent process in his group
// (always 0 as the parent size is checked before)
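cudaDeviceScheduleYield changes how the host thread waits on CUDA synchronization calls: instead of spin-waiting for the device, it yields its time slice, which the comment above credits with better performance when event synchronization completes some time after the kernel itself. A minimal standalone sketch of the call (plain CUDA runtime error handling stands in for TLLM_CUDA_CHECK):

#include <cuda_runtime_api.h>
#include <cstdio>

int main()
{
    // Ask the runtime to yield the calling thread while it waits on the device.
    cudaError_t status = cudaSetDeviceFlags(cudaDeviceScheduleYield);
    if (status != cudaSuccess)
    {
        std::fprintf(stderr, "cudaSetDeviceFlags failed: %s\n", cudaGetErrorString(status));
        return 1;
    }

    // Any later event wait on this thread now yields instead of spinning.
    cudaEvent_t event;
    cudaEventCreate(&event);
    cudaEventRecord(event, nullptr); // record on the default stream
    cudaEventSynchronize(event);
    cudaEventDestroy(event);
    return 0;
}
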
19 changes: 10 additions & 9 deletions cpp/tensorrt_llm/kernels/beamSearchKernels.h
@@ -29,18 +29,18 @@ struct BeamHypotheses
{
// clang-format off

// BS: batch_size, BM: beam_width, MSL: max_seq_length
// %%: parameter name when dynamic_decoder.forward() / gather_tree() are called in [generation.py] (python workflow)
// MBS: max_batch_size, BS: batch_size, BM: beam_width, MSL: max_seq_length
// %%: parameter name in file generation.py (python workflow)

// Candidate beams: a beam which generates end_id or its sequence length reaches MSL
// Candidate-Beam-Array (CBA): The arrays (size: BM*2) to place the candidate beams and related information
// Candidate-Beam-Array (CBA): The arrays to place the candidate beams and related information

// Scalar values
bool bReturnNormedScore{false}; // return normed_score / cum_log_probs, useless yet
int nBatchSize{0}; //
int nMaxBatchSize{0}; // max batch size by model configuration
int nBatchSize{0}; // batch size by runtime input data
int nBeamWidth{0}; //
int nIte{0}; // index of local_batch, always be 0 when pp_size==1
int nBatchSizeLocal{0}; //
int nMaxSeqLen{0}; //
int nVocabSize{0}; // vocab_size_padded

@@ -54,8 +54,9 @@ struct BeamHypotheses
int const* endIds{nullptr}; // [BS, BM] %% self.end_ids

// Pointers for output
int* outputIds{nullptr}; // [BS, BM, MSL] %% self.output_ids
float* logProbs{nullptr}; // [MSL, BS, BM] %% self.log_probs_tiled
int* outputIds{nullptr}; // [BS, BM, MSL] %% self.output_ids only used in gather_tree
float* logProbs{nullptr}; // [BS, BM, MSL] %% self.log_probs only used in gather_tree
float* logProbsTiled{nullptr}; // [MSL, MBS, BM] %% self.log_probs_tiled
int* sequenceLengths{nullptr}; // [BS, BM] %% self.sequence_length_buffer
float* cumLogProbs{nullptr}; // [BS, BM] %% self.cum_log_probs

@@ -65,8 +66,8 @@ struct BeamHypotheses
int* sequenceLengthsCBA{nullptr}; // [BS, BM*2] %% self.beam_hyps_seq_len_cba
float* cumLogProbsCBA{nullptr}; // [BS, BM*2] %% self.beam_hyps_cum_log_probs_cba
float* normedScoresCBA{nullptr}; // [BS, BM*2] %% self.beam_hyps_normed_scores_cba
int* numBeamsCBA{nullptr}; // [BS] %% self.beam_hyps_num_beams number of beams in CBA
float* minNormedScoresCBA{nullptr}; // [BS] %% self.beam_hyps_min_normed_scores worst score in CBA
int* numBeamsCBA{nullptr}; // [BS] %% self.beam_hyps_num_beams number of beams in CBA
float* minNormedScoresCBA{nullptr}; // [BS] %% self.beam_hyps_min_normed_scores worst score in CBA

// Pointers related to beam search process, they are initialized in those two functions:
// [gptDecoder.cpp] GptDecoder<T>::forward or [dynamicDecodeOp.cpp] FtDynamicDecode<T>::forward
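The shape comments above all use the same layout convention: CBA buffers hold up to BM*2 candidate beams per batch entry, stored row-major. A small host-side sketch (not TensorRT-LLM code) of how those documented shapes flatten into linear offsets:

#include <cstddef>

// Offset of candidate beam `cba` of batch entry `b` in a [BS, BM*2] buffer,
// e.g. sequenceLengthsCBA, cumLogProbsCBA, normedScoresCBA.
inline std::size_t cbaOffset(std::size_t b, std::size_t cba, std::size_t beamWidth)
{
    return b * (2 * beamWidth) + cba;
}

// Offset of token `t` of candidate beam `cba` of batch entry `b` in a
// [BS, BM*2, MSL] buffer, e.g. outputIdsCBA, logProbsCBA.
inline std::size_t cbaTokenOffset(
    std::size_t b, std::size_t cba, std::size_t t, std::size_t beamWidth, std::size_t maxSeqLen)
{
    return (b * (2 * beamWidth) + cba) * maxSeqLen + t;
}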