From 8084312b43f15bee17069a6a26aecca5e7058fcc Mon Sep 17 00:00:00 2001 From: Kaiyu Xie <26294424+kaiyux@users.noreply.github.com> Date: Fri, 1 Nov 2024 09:57:55 +0000 Subject: [PATCH] open source cef1070ccdde579844de64f2a2bb8099bc3e5f02 --- .gitignore | 2 +- .gitmodules | 5 +- .pre-commit-config.yaml | 2 +- 3rdparty/pybind11 | 1 + 3rdparty/ucxx | 2 +- README.md | 31 +- benchmarks/README.md | 5 +- benchmarks/cpp/gptManagerBenchmark.cpp | 99 +- benchmarks/cpp/utils/prepare_real_data.py | 2 - benchmarks/python/gpt_benchmark.py | 2 +- cpp/CMakeLists.txt | 8 +- .../batch_manager/capacityScheduler.h | 187 ++++ .../tensorrt_llm/batch_manager/common.h | 118 +++ .../batch_manager/evictionPolicy.h | 74 ++ .../batch_manager/kvCacheConfig.h | 11 +- .../batch_manager/kvCacheManager.h | 282 +++-- .../tensorrt_llm/batch_manager/kvCacheUtils.h | 9 +- .../tensorrt_llm/batch_manager/llmRequest.h | 233 +++-- .../batch_manager/microBatchScheduler.h | 108 ++ .../batch_manager/peftCacheManager.h | 2 + .../batch_manager/trtGptModelOptionalParams.h | 24 +- cpp/include/tensorrt_llm/common/algorithm.h | 32 + cpp/include/tensorrt_llm/common/cudaUtils.h | 9 +- cpp/include/tensorrt_llm/common/mpiUtils.h | 6 +- cpp/include/tensorrt_llm/executor/executor.h | 135 ++- .../tensorrt_llm/executor/serialization.h | 13 +- cpp/include/tensorrt_llm/executor/types.h | 56 + .../tensorrt_llm/runtime/decodingInput.h | 16 + .../tensorrt_llm/runtime/decodingOutput.h | 5 +- cpp/include/tensorrt_llm/runtime/gptDecoder.h | 21 - .../tensorrt_llm/runtime/gptDecoderBatched.h | 27 +- cpp/include/tensorrt_llm/runtime/gptSession.h | 1 - .../tensorrt_llm/runtime/iGptDecoderBatched.h | 16 +- cpp/include/tensorrt_llm/runtime/ipcUtils.h | 2 +- .../tensorrt_llm/runtime/lookaheadBuffers.h | 1 + .../tensorrt_llm/runtime/modelConfig.h | 109 +- .../runtime/speculativeDecodingMode.h | 3 +- cpp/tensorrt_llm/CMakeLists.txt | 2 + .../libtensorrt_llm_batch_manager_static.a | 4 +- ...sorrt_llm_batch_manager_static.pre_cxx11.a | 4 +- .../aarch64-linux-gnu/version.txt | 6 +- .../libtensorrt_llm_batch_manager_static.a | 4 +- ...sorrt_llm_batch_manager_static.pre_cxx11.a | 4 +- .../x86_64-linux-gnu/version.txt | 6 +- .../tensorrt_llm_batch_manager_static.lib | 4 +- .../x86_64-windows-msvc/version.txt | 4 +- .../common/customAllReduceUtils.h | 2 +- cpp/tensorrt_llm/common/envUtils.cpp | 38 + cpp/tensorrt_llm/common/envUtils.h | 5 + cpp/tensorrt_llm/common/mpiUtils.cpp | 67 +- cpp/tensorrt_llm/common/reduceKernelUtils.cuh | 6 + .../libtensorrt_llm_executor_static.a | 4 +- ...ibtensorrt_llm_executor_static.pre_cxx11.a | 4 +- .../executor/aarch64-linux-gnu/version.txt | 6 +- .../libtensorrt_llm_executor_static.a | 4 +- ...ibtensorrt_llm_executor_static.pre_cxx11.a | 4 +- .../executor/x86_64-linux-gnu/version.txt | 6 +- .../tensorrt_llm_executor_static.lib | 4 +- .../executor/x86_64-windows-msvc/version.txt | 4 +- .../beamSearchKernelsTemplate.h | 4 +- .../fmhaRunner.cpp | 16 +- .../kernels/customAllReduceKernels.cu | 613 +++++++++-- .../kernels/customAllReduceKernels.h | 18 +- .../fused_moe_gemm_launcher_sm80.inl | 1 - .../moe_gemm/moe_gemm_kernels_template.h | 3 +- .../libtensorrt_llm_nvrtc_wrapper.so | 4 +- .../aarch64-linux-gnu/version.txt | 4 +- .../libtensorrt_llm_nvrtc_wrapper.so | 2 +- .../nvrtcWrapper/x86_64-linux-gnu/version.txt | 4 +- .../tensorrt_llm_nvrtc_wrapper.dll | 2 +- .../tensorrt_llm_nvrtc_wrapper.lib | 2 +- .../x86_64-windows-msvc/version.txt | 6 +- cpp/tensorrt_llm/kernels/decodingKernels.cu | 91 ++ cpp/tensorrt_llm/kernels/decodingKernels.h | 
20 + cpp/tensorrt_llm/kernels/gptKernels.cu | 2 +- ...orrt_llm_internal_cutlass_kernels_static.a | 2 +- ...nternal_cutlass_kernels_static.pre_cxx11.a | 2 +- .../aarch64-linux-gnu/version.txt | 6 +- ...orrt_llm_internal_cutlass_kernels_static.a | 2 +- ...nternal_cutlass_kernels_static.pre_cxx11.a | 2 +- .../x86_64-linux-gnu/version.txt | 6 +- ...rt_llm_internal_cutlass_kernels_static.lib | 2 +- .../x86_64-windows-msvc/version.txt | 4 +- .../kernels/mixtureOfExperts/moe_kernels.cu | 2 +- .../kernels/samplingAirTopPKernels.cu | 2 +- .../kernels/samplingTopKKernels.cu | 29 +- .../kernels/samplingTopKKernels.h | 10 +- .../kernels/samplingTopPKernels.cu | 74 +- .../kernels/samplingTopPKernels.h | 21 +- .../externalDraftTokensKernels.cu | 369 ++++--- .../externalDraftTokensKernels.h | 123 +-- .../kernels/stopCriteriaKernels.h | 2 +- .../unfusedAttentionKernels_2_template.h | 16 +- cpp/tensorrt_llm/layers/decodingLayer.cpp | 50 + cpp/tensorrt_llm/layers/decodingLayer.h | 2 +- cpp/tensorrt_llm/layers/decodingParams.h | 40 +- .../layers/externalDraftTokensLayer.cpp | 514 +++++++++ .../layers/externalDraftTokensLayer.h | 100 ++ .../layers/lookaheadAlgorithm.cpp | 328 ++++-- cpp/tensorrt_llm/layers/lookaheadAlgorithm.h | 78 +- .../layers/lookaheadDecodingLayer.cpp | 123 +-- .../layers/lookaheadDecodingLayer.h | 6 +- .../layers/lookaheadDecodingUtils.h | 88 +- .../layers/medusaDecodingLayer.cpp | 2 +- cpp/tensorrt_llm/layers/topPSamplingLayer.cpp | 2 +- .../gptAttentionCommon/gptAttentionCommon.cpp | 33 +- .../gptAttentionCommon/gptAttentionCommon.h | 4 +- .../gptAttentionPlugin/gptAttentionPlugin.cpp | 44 +- .../gptAttentionPlugin/gptAttentionPlugin.h | 3 +- .../plugins/loraPlugin/loraPlugin.cpp | 4 +- .../plugins/ncclPlugin/allreducePlugin.cpp | 16 +- cpp/tensorrt_llm/pybind/CMakeLists.txt | 43 +- .../pybind/batch_manager/algorithms.cpp | 55 + .../pybind/batch_manager/algorithms.h | 28 + .../pybind/batch_manager/bindings.cpp | 41 + .../pybind/batch_manager/bindings.h | 28 + .../pybind/batch_manager/gptManager.h | 1 + .../pybind/batch_manager/inferenceRequest.h | 1 + .../pybind/batch_manager/kvCacheManager.cpp | 29 + .../pybind/batch_manager/kvCacheManager.h | 36 + .../pybind/batch_manager/llmRequest.cpp | 145 ++- .../pybind/batch_manager/llmRequest.h | 11 + .../pybind/batch_manager/namedTensor.h | 1 + cpp/tensorrt_llm/pybind/bindings.cpp | 59 +- .../pybind/common/algorithmBindings.h | 39 + .../pybind/common/opaqueBindings.h | 18 + cpp/tensorrt_llm/pybind/executor/bindings.cpp | 78 +- cpp/tensorrt_llm/pybind/executor/bindings.h | 2 + cpp/tensorrt_llm/pybind/executor/executor.cpp | 1 + cpp/tensorrt_llm/pybind/executor/executor.h | 7 + .../pybind/executor/streamCaster.h | 4 +- .../pybind/executor/tensorCaster.h | 4 +- cpp/tensorrt_llm/pybind/utils/bindTypes.h | 69 ++ cpp/tensorrt_llm/pybind/utils/pathCaster.h | 1 + cpp/tensorrt_llm/runtime/gptDecoder.cpp | 250 +---- .../runtime/gptDecoderBatched.cpp | 225 ++-- cpp/tensorrt_llm/runtime/gptJsonConfig.cpp | 47 +- cpp/tensorrt_llm/runtime/gptSession.cpp | 14 +- cpp/tensorrt_llm/runtime/ipcUtils.cpp | 38 +- cpp/tensorrt_llm/runtime/lookaheadBuffers.cpp | 58 +- cpp/tensorrt_llm/runtime/loraUtils.cpp | 4 + cpp/tensorrt_llm/runtime/medusaModule.cpp | 2 +- cpp/tensorrt_llm/runtime/rnnStateBuffers.cpp | 10 +- .../runtime/statefulGptDecoder.cpp | 9 +- cpp/tensorrt_llm/runtime/tllmBuffers.h | 2 +- cpp/tensorrt_llm/runtime/tllmRuntime.cpp | 212 ++-- cpp/tensorrt_llm/runtime/tllmRuntime.h | 23 +- .../runtime/transformerBuffers.cpp | 36 +- 
cpp/tensorrt_llm/runtime/transformerBuffers.h | 8 +- .../runtime/utils/sessionUtils.cpp | 11 + cpp/tensorrt_llm/runtime/utils/sessionUtils.h | 2 + .../kernels/allReduce/allReduceKernelTest.cu | 181 +++- cpp/tests/kernels/decodingKernelTest.cpp | 468 ++++++++- cpp/tests/kernels/mixtureOfExpertsTest.cu | 411 ++------ .../kernels/sampling/samplingAirTopPTest.cpp | 27 +- cpp/tests/kernels/sampling/samplingTest.cpp | 58 +- cpp/tests/kernels/sampling/samplingTest.h | 6 +- .../kernels/sampling/samplingTopKTest.cpp | 21 +- .../kernels/sampling/samplingTopPTest.cpp | 29 +- cpp/tests/layers/baseSamplingLayerTest.cpp | 41 +- cpp/tests/layers/baseSamplingLayerTest.h | 18 +- cpp/tests/layers/lookaheadAlgorithmTest.cpp | 66 +- .../layers/lookaheadDecodingLayerTest.cpp | 22 +- cpp/tests/layers/lookaheadRandomLlmTest.cpp | 2 +- cpp/tests/layers/randomLlm.cpp | 4 +- cpp/tests/layers/samplingLayerTest.cpp | 2 +- cpp/tests/layers/topKSamplingLayerTest.cpp | 2 +- cpp/tests/layers/topPSamplingLayerTest.cpp | 29 +- .../data/test_model_lora_config.json | 1 - .../scripts/build_chatglm_engines.py | 1 - .../resources/scripts/build_gpt_engines.py | 1 - .../resources/scripts/build_llama_engines.py | 2 +- .../scripts/generate_expected_gpt_output.py | 10 +- .../scripts/generate_expected_llama_output.py | 2 +- cpp/tests/resources/scripts/test_cpp.py | 178 +++- cpp/tests/runtime/gptDecoderBatchedTest.cpp | 16 +- cpp/tests/runtime/gptDecoderTest.cpp | 2 +- cpp/tests/runtime/loraCacheTest.cpp | 12 +- cpp/tests/runtime/loraManagerTest.cpp | 6 +- cpp/tests/runtime/loraUtilsTest.cpp | 7 +- docker/Dockerfile.multi | 6 +- docker/common/install_pytorch.sh | 35 +- docs/requirements.txt | 1 + docs/source/advanced/batch-manager.md | 4 + docs/source/{ => advanced}/executor.md | 57 +- docs/source/advanced/gpt-runtime.md | 22 +- .../kv-cache-reuse.md} | 2 + .../speculative-decoding.md} | 87 +- docs/source/architecture/core-concepts.md | 132 ++- docs/source/architecture/workflow.md | 23 +- docs/source/blogs/quantization-in-TRT-LLM.md | 60 +- docs/source/conf.py | 7 + docs/source/helper.py | 2 +- docs/source/index.rst | 21 +- .../installation/build-from-source-windows.md | 5 +- docs/source/installation/windows.md | 10 +- docs/source/llm-api-examples/index.md | 14 +- docs/source/llm-api/index.md | 101 ++ docs/source/media/image-09-29-2024.png | Bin 0 -> 178826 bytes .../source/performance/perf-benchmarking.md | 273 ++++- docs/source/performance/perf-overview.md | 331 +++--- docs/source/quick-start-guide.md | 25 +- docs/source/reference/support-matrix.md | 3 +- docs/source/reference/troubleshooting.md | 14 - docs/source/release-notes.md | 54 +- examples/apps/fastapi_server.py | 58 +- examples/apps/openai_server.py | 31 +- examples/baichuan/requirements.txt | 4 +- examples/bindings/executor/README.md | 2 +- .../bindings/executor/example_advanced.py | 2 +- examples/bindings/executor/example_debug.py | 39 +- examples/bloom/requirements.txt | 4 +- examples/chatglm/requirements.txt | 4 +- examples/dbrx/convert_checkpoint.py | 7 +- examples/dbrx/requirements.txt | 2 +- examples/deepseek_v1/README.md | 77 ++ .../deci => examples/deepseek_v1}/__init__.py | 0 examples/deepseek_v1/convert_checkpoint.py | 215 ++++ examples/deepseek_v1/requirements.txt | 5 + examples/draft_target_model/README.md | 86 ++ examples/draft_target_model/requirements.txt | 6 + examples/enc_dec/README.md | 2 + examples/falcon/requirements.txt | 4 +- examples/gemma/requirements.txt | 4 +- examples/gpt/README.md | 3 +- examples/gpt/requirements.txt | 4 +- 
examples/gptj/requirements.txt | 2 +- examples/gptneox/requirements.txt | 2 +- examples/grok/requirements.txt | 2 +- examples/internlm/requirements.txt | 4 +- examples/jais/requirements.txt | 4 +- examples/llama/README.md | 33 +- examples/llama/convert_checkpoint.py | 15 + examples/llama/requirements.txt | 5 +- examples/llm-api/README.md | 330 +----- .../{llm_generate.py => llm_inference.py} | 0 ...nerate_async.py => llm_inference_async.py} | 0 ...ng.py => llm_inference_async_streaming.py} | 0 examples/llm-api/llm_inference_customize.py | 47 + ...ibuted.py => llm_inference_distributed.py} | 6 +- examples/llm-api/llm_logits_processor.py | 51 + examples/llm-api/requirements.txt | 2 - examples/mamba/README.md | 6 - examples/mamba/convert_checkpoint.py | 478 ++------- examples/mamba/requirements.txt | 2 +- examples/medusa/requirements.txt | 4 +- examples/mixtral/README.md | 2 +- examples/mixtral/requirements.txt | 2 +- examples/model_api/README.md | 2 +- examples/model_api/llama.py | 2 - examples/model_api/llama_multi_gpu.py | 1 - examples/mpt/README.md | 56 - examples/mpt/convert_checkpoint.py | 30 +- examples/mpt/requirements.txt | 2 +- examples/multimodal/README.md | 2 +- examples/nemotron/requirements.txt | 2 +- examples/nemotron_nas/README.md | 102 ++ examples/nemotron_nas/calibration_utils.py | 39 + examples/nemotron_nas/convert_checkpoint.py | 162 +++ examples/opt/requirements.txt | 2 +- examples/phi/README.md | 33 +- examples/phi/convert_checkpoint.py | 48 +- examples/phi/requirements.txt | 2 +- examples/quantization/quantize.py | 42 +- examples/quantization/requirements.txt | 2 +- examples/qwen/requirements.txt | 4 +- examples/qwenvl/requirements.txt | 4 +- examples/recurrentgemma/requirements.txt | 2 +- examples/redrafter/requirements.txt | 4 +- examples/run.py | 467 +++++++-- examples/skywork/requirements.txt | 2 +- examples/smaug/requirements.txt | 4 +- examples/summarize.py | 4 +- examples/utils.py | 29 +- examples/whisper/requirements.txt | 2 +- requirements-dev.txt | 2 + requirements-windows.txt | 13 +- requirements.txt | 7 +- scripts/build_wheel.py | 41 +- tensorrt_llm/_utils.py | 28 + .../plugin_nodes/gpt_attention_node.py | 3 + .../bench/{run => benchmark}/__init__.py | 0 .../bench/{run => benchmark}/dataclasses.py | 102 +- tensorrt_llm/bench/benchmark/low_latency.py | 336 ++++++ .../{run/run.py => benchmark/throughput.py} | 83 +- .../bench/{run => benchmark}/utils.py | 68 +- tensorrt_llm/bench/build/benchmark_config.yml | 60 +- tensorrt_llm/bench/build/build.py | 23 +- tensorrt_llm/bench/dataclasses.py | 19 +- tensorrt_llm/bench/utils/data.py | 41 +- tensorrt_llm/bench/utils/tokenize.py | 105 -- tensorrt_llm/builder.py | 40 +- tensorrt_llm/commands/bench.py | 6 +- tensorrt_llm/commands/build.py | 13 +- tensorrt_llm/executor.py | 273 +++-- tensorrt_llm/functional.py | 90 +- tensorrt_llm/hlapi/__init__.py | 7 +- tensorrt_llm/hlapi/llm.py | 2 +- tensorrt_llm/hlapi/llm_utils.py | 148 ++- tensorrt_llm/hlapi/utils.py | 27 +- tensorrt_llm/layers/__init__.py | 3 +- tensorrt_llm/layers/attention.py | 107 +- tensorrt_llm/layers/embedding.py | 9 +- tensorrt_llm/layers/linear.py | 18 +- tensorrt_llm/layers/mlp.py | 5 +- tensorrt_llm/layers/moe.py | 57 +- tensorrt_llm/lora_manager.py | 9 + tensorrt_llm/models/__init__.py | 7 +- tensorrt_llm/models/automodel.py | 18 +- tensorrt_llm/models/chatglm/convert.py | 5 +- tensorrt_llm/models/convert_utils.py | 5 +- tensorrt_llm/models/deepseek_v1/__init__.py | 14 + tensorrt_llm/models/deepseek_v1/convert.py | 361 +++++++ 
tensorrt_llm/models/deepseek_v1/model.py | 257 +++++ tensorrt_llm/models/enc_dec/model.py | 65 +- tensorrt_llm/models/falcon/model.py | 3 +- tensorrt_llm/models/gemma/model.py | 3 +- tensorrt_llm/models/gemma/smoothquant.py | 4 +- tensorrt_llm/models/generation_mixin.py | 131 ++- tensorrt_llm/models/gpt/config.py | 5 + tensorrt_llm/models/gpt/model.py | 6 + tensorrt_llm/models/grok/convert.py | 6 +- tensorrt_llm/models/grok/model.py | 6 +- tensorrt_llm/models/llama/convert.py | 25 +- tensorrt_llm/models/llama/model.py | 18 +- tensorrt_llm/models/mamba/config.py | 340 ++++++ tensorrt_llm/models/mamba/convert.py | 245 +++++ tensorrt_llm/models/mamba/model.py | 49 +- tensorrt_llm/models/model_weights_loader.py | 79 +- tensorrt_llm/models/modeling_utils.py | 291 +++++- tensorrt_llm/models/nemotron_nas/__init__.py | 14 + .../models/{deci => nemotron_nas}/config.py | 39 +- .../models/{deci => nemotron_nas}/convert.py | 157 +-- .../{deci => nemotron_nas}/layer_config.py | 0 .../models/{deci => nemotron_nas}/model.py | 286 ++++- tensorrt_llm/models/phi3/config.py | 17 +- tensorrt_llm/models/phi3/convert.py | 40 +- tensorrt_llm/models/phi3/model.py | 69 +- tensorrt_llm/models/phi3/split_weights.py | 53 +- tensorrt_llm/models/qwen/model.py | 9 +- tensorrt_llm/models/recurrentgemma/model.py | 19 +- .../models/redrafter/redrafter_helper.py | 136 ++- tensorrt_llm/module.py | 61 ++ tensorrt_llm/parameter.py | 6 +- tensorrt_llm/plugin/plugin.py | 21 +- tensorrt_llm/quantization/layers.py | 60 +- tensorrt_llm/quantization/mode.py | 10 +- tensorrt_llm/quantization/quantize.py | 90 +- .../quantization/quantize_by_modelopt.py | 187 +++- tensorrt_llm/runtime/generation.py | 194 ++-- tensorrt_llm/runtime/kv_cache_manager.py | 21 +- tensorrt_llm/runtime/memory_pools/__init__.py | 0 .../memory_pools/memory_pools_allocator.py | 80 ++ tensorrt_llm/runtime/memory_pools/pool.py | 7 + .../memory_pools/pools_kv_cache_manager.py | 67 ++ tensorrt_llm/runtime/model_runner_cpp.py | 235 +++-- tensorrt_llm/tools/multimodal_builder.py | 31 +- tensorrt_llm/version.py | 2 +- tests/attention/test_gpt_attention.py | 107 +- tests/attention/test_gpt_attention_IFB.py | 80 +- tests/bindings/test_bindings_ut.py | 29 +- tests/bindings/test_executor_bindings.py | 60 +- tests/conftest.py | 44 + tests/functional/test_moe.py | 3 +- tests/hlapi/apps/_test_llm_server.py | 10 +- tests/hlapi/test_llm.py | 183 +++- tests/hlapi/test_llm_models.py | 60 +- tests/hlapi/test_llm_multi_gpu.py | 83 +- tests/hlapi/test_llm_quant.py | 25 +- tests/hlapi/test_llm_utils.py | 2 +- tests/model/test_decilm.py | 602 ----------- tests/model/test_gpt.py | 59 +- tests/model/test_gpt_e2e.py | 1 - tests/model/test_llama.py | 3 +- tests/model/test_mamba.py | 11 +- tests/model/test_mistral.py | 3 +- tests/model/test_nemotron_nas.py | 989 ++++++++++++++++++ tests/model_api/test_model_quantization.py | 7 +- tests/test_graph_rewriter.py | 2 +- tests/test_layer.py | 5 +- tests/test_model_runner_cpp.py | 84 ++ tests/test_module.py | 2 + tests/utils/cpp_paths.py | 5 + tests/utils/util.py | 10 +- 384 files changed, 14603 insertions(+), 5392 deletions(-) create mode 160000 3rdparty/pybind11 create mode 100644 cpp/include/tensorrt_llm/batch_manager/capacityScheduler.h create mode 100644 cpp/include/tensorrt_llm/batch_manager/common.h create mode 100644 cpp/include/tensorrt_llm/batch_manager/evictionPolicy.h create mode 100644 cpp/include/tensorrt_llm/batch_manager/microBatchScheduler.h create mode 100644 cpp/include/tensorrt_llm/common/algorithm.h create mode 100644 
cpp/tensorrt_llm/layers/externalDraftTokensLayer.cpp create mode 100644 cpp/tensorrt_llm/layers/externalDraftTokensLayer.h mode change 100644 => 100755 cpp/tensorrt_llm/pybind/CMakeLists.txt create mode 100644 cpp/tensorrt_llm/pybind/batch_manager/algorithms.cpp create mode 100644 cpp/tensorrt_llm/pybind/batch_manager/algorithms.h create mode 100644 cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp create mode 100644 cpp/tensorrt_llm/pybind/batch_manager/bindings.h create mode 100644 cpp/tensorrt_llm/pybind/batch_manager/kvCacheManager.cpp create mode 100644 cpp/tensorrt_llm/pybind/batch_manager/kvCacheManager.h create mode 100644 cpp/tensorrt_llm/pybind/common/algorithmBindings.h create mode 100644 cpp/tensorrt_llm/pybind/common/opaqueBindings.h create mode 100644 cpp/tensorrt_llm/pybind/utils/bindTypes.h rename docs/source/{ => advanced}/executor.md (81%) rename docs/source/{kv_cache_reuse.md => advanced/kv-cache-reuse.md} (99%) rename docs/source/{speculative_decoding.md => advanced/speculative-decoding.md} (91%) create mode 100644 docs/source/llm-api/index.md create mode 100644 docs/source/media/image-09-29-2024.png rename benchmarks/Suite.md => docs/source/performance/perf-benchmarking.md (52%) create mode 100644 examples/deepseek_v1/README.md rename {tensorrt_llm/models/deci => examples/deepseek_v1}/__init__.py (100%) create mode 100644 examples/deepseek_v1/convert_checkpoint.py create mode 100644 examples/deepseek_v1/requirements.txt create mode 100644 examples/draft_target_model/README.md create mode 100644 examples/draft_target_model/requirements.txt rename examples/llm-api/{llm_generate.py => llm_inference.py} (100%) rename examples/llm-api/{llm_generate_async.py => llm_inference_async.py} (100%) rename examples/llm-api/{llm_generate_async_streaming.py => llm_inference_async_streaming.py} (100%) create mode 100644 examples/llm-api/llm_inference_customize.py rename examples/llm-api/{llm_generate_distributed.py => llm_inference_distributed.py} (91%) create mode 100644 examples/llm-api/llm_logits_processor.py delete mode 100644 examples/llm-api/requirements.txt create mode 100644 examples/nemotron_nas/README.md create mode 100644 examples/nemotron_nas/calibration_utils.py create mode 100644 examples/nemotron_nas/convert_checkpoint.py rename tensorrt_llm/bench/{run => benchmark}/__init__.py (100%) rename tensorrt_llm/bench/{run => benchmark}/dataclasses.py (62%) create mode 100644 tensorrt_llm/bench/benchmark/low_latency.py rename tensorrt_llm/bench/{run/run.py => benchmark/throughput.py} (89%) rename tensorrt_llm/bench/{run => benchmark}/utils.py (60%) delete mode 100644 tensorrt_llm/bench/utils/tokenize.py create mode 100644 tensorrt_llm/models/deepseek_v1/__init__.py create mode 100644 tensorrt_llm/models/deepseek_v1/convert.py create mode 100644 tensorrt_llm/models/deepseek_v1/model.py create mode 100644 tensorrt_llm/models/mamba/config.py create mode 100644 tensorrt_llm/models/mamba/convert.py create mode 100644 tensorrt_llm/models/nemotron_nas/__init__.py rename tensorrt_llm/models/{deci => nemotron_nas}/config.py (86%) rename tensorrt_llm/models/{deci => nemotron_nas}/convert.py (77%) rename tensorrt_llm/models/{deci => nemotron_nas}/layer_config.py (100%) rename tensorrt_llm/models/{deci => nemotron_nas}/model.py (67%) create mode 100644 tensorrt_llm/runtime/memory_pools/__init__.py create mode 100644 tensorrt_llm/runtime/memory_pools/memory_pools_allocator.py create mode 100644 tensorrt_llm/runtime/memory_pools/pool.py create mode 100644 
tensorrt_llm/runtime/memory_pools/pools_kv_cache_manager.py create mode 100644 tests/conftest.py delete mode 100644 tests/model/test_decilm.py create mode 100644 tests/model/test_nemotron_nas.py create mode 100644 tests/test_model_runner_cpp.py diff --git a/.gitignore b/.gitignore index d9463eeb2..ca13276bd 100644 --- a/.gitignore +++ b/.gitignore @@ -37,7 +37,7 @@ tensorrt_llm/bindings.pyi tensorrt_llm/bindings/*.pyi *docs/cpp_docs* *docs/source/_cpp_gen* -docs/source/llm-api +docs/source/llm-api/*.rst docs/source/llm-api-examples/llm_*.rst *.swp diff --git a/.gitmodules b/.gitmodules index 6fdb69781..f9208d5de 100644 --- a/.gitmodules +++ b/.gitmodules @@ -13,4 +13,7 @@ url = https://github.com/NVIDIA/NVTX.git [submodule "3rdparty/ucxx"] path = 3rdparty/ucxx - url = https://github.com/GuanLuo/ucxx.git + url = https://github.com/rapidsai/ucxx.git +[submodule "3rdparty/pybind11"] + path = 3rdparty/pybind11 + url = https://github.com/pybind/pybind11.git diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index e4219d974..2dc60bd2d 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -46,5 +46,5 @@ repos: args: - --skip=".git,3rdparty" - --exclude-file=examples/whisper/tokenizer.py - - --ignore-words-list=rouge,inout,atleast,strat,nd,subtile,thrid + - --ignore-words-list=rouge,inout,atleast,strat,nd,subtile,thrid,improbe exclude: 'tests/llm-test-defs/turtle/test_input_files' diff --git a/3rdparty/pybind11 b/3rdparty/pybind11 new file mode 160000 index 000000000..f99ffd7e0 --- /dev/null +++ b/3rdparty/pybind11 @@ -0,0 +1 @@ +Subproject commit f99ffd7e03001810a3e722bf48ad1a9e08415d7d diff --git a/3rdparty/ucxx b/3rdparty/ucxx index b99181779..5c745102c 160000 --- a/3rdparty/ucxx +++ b/3rdparty/ucxx @@ -1 +1 @@ -Subproject commit b99181779672965c6f325a95a29eb433b6e9cbbd +Subproject commit 5c745102c26df11e68f11368bcd9649e81e981da diff --git a/README.md b/README.md index df0cdf10a..64c2c49a8 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ TensorRT-LLM [![python](https://img.shields.io/badge/python-3.10.12-green)](https://www.python.org/downloads/release/python-31012/) [![cuda](https://img.shields.io/badge/cuda-12.5.1-green)](https://developer.nvidia.com/cuda-downloads) [![trt](https://img.shields.io/badge/TRT-10.4.0-green)](https://developer.nvidia.com/tensorrt) -[![version](https://img.shields.io/badge/release-0.13.0-green)](./tensorrt_llm/version.py) +[![version](https://img.shields.io/badge/release-0.14.0-green)](./tensorrt_llm/version.py) [![license](https://img.shields.io/badge/license-Apache%202-blue)](./LICENSE) [Architecture](./docs/source/architecture/overview.md)   |   [Results](./docs/source/performance/perf-overview.md)   |   [Examples](./examples/)   |   [Documentation](./docs/source/) @@ -17,6 +17,24 @@ TensorRT-LLM
## Latest News +* [2024/09/29] 🌟 AI at Meta PyTorch + TensorRT v2.4 🌟 ⚡TensorRT 10.1 ⚡PyTorch 2.4 ⚡CUDA 12.4 ⚡Python 3.12 +[➡️ link](https://github.com/pytorch/TensorRT/releases/tag/v2.4.0) +
+ +
+ +* [2024/09/17] ✨ NVIDIA TensorRT-LLM Meetup +[➡️ link](https://drive.google.com/file/d/1RR8GqC-QbuaKuHj82rZcXb3MS20SWo6F/view?usp=share_link) + +* [2024/09/17] ✨ Accelerating LLM Inference at Databricks with TensorRT-LLM +[➡️ link](https://drive.google.com/file/d/1NeSmrLaWRJAY1rxD9lJmzpB9rzr38j8j/view?usp=sharing) + +* [2024/09/17] ✨ TensorRT-LLM @ Baseten +[➡️ link](https://drive.google.com/file/d/1Y7L2jqW-aRmt31mCdqhwvGMmCSOzBUjG/view?usp=share_link) + +* [2024/09/04] 🏎️🏎️🏎️ Best Practices for Tuning TensorRT-LLM for Optimal Serving with BentoML +[➡️ link](https://www.bentoml.com/blog/tuning-tensor-rt-llm-for-optimal-serving-with-bentoml) + * [2024/08/20] 🏎️SDXL with #TensorRT Model Optimizer ⏱️⚡ 🏁 cache diffusion 🏁 quantization aware training 🏁 QLoRA 🏁 #Python 3.12 [➡️ link](https://developer.nvidia.com/blog/nvidia-tensorrt-model-optimizer-v0-15-boosts-inference-performance-and-expands-model-support/) @@ -43,6 +61,9 @@ TensorRT-LLM * [2024/07/02] Let the @MistralAI MoE tokens fly 📈 🚀 #Mixtral 8x7B with NVIDIA #TensorRT #LLM on #H100. [➡️ Tech blog](https://developer.nvidia.com/blog/achieving-high-mixtral-8x7b-performance-with-nvidia-h100-tensor-core-gpus-and-tensorrt-llm?ncid=so-twit-928467) +
+Previous News + * [2024/06/24] Enhanced with NVIDIA #TensorRT #LLM, @upstage.ai’s solar-10.7B-instruct is ready to power your developer projects through our API catalog 🏎️. ✨[➡️ link](https://build.nvidia.com/upstage/solar-10_7b-instruct?snippet_tab=Try ) * [2024/06/18] CYMI: 🤩 Stable Diffusion 3 dropped last week 🎊 🏎️ Speed up your SD3 with #TensorRT INT8 Quantization[➡️ link](https://build.nvidia.com/upstage/solar-10_7b-instruct?snippet_tab=Try ) @@ -55,10 +76,6 @@ Technical Deep Dive for serious coders ✅+99% compression ✅1 set of weights * [2024/06/04] ✨ #TensorRT and GeForce #RTX unlock ComfyUI SD superhero powers 🦸⚡ 🎥 Demo: [➡️ link](https://youtu.be/64QEVfbPHyg) 📗 DIY notebook: [➡️ link](https://console.brev.dev/launchable/deploy?userID=2x2sil999&orgID=ktj33l4xj&name=ComfyUI_TensorRT&instance=L4%40g2-standard-4%3Anvidia-l4%3A1&diskStorage=500&cloudID=GCP&baseImage=docker.io%2Fpytorch%2Fpytorch%3A2.2.0-cuda12.1-cudnn8-runtime&ports=ComfUI%3A8188&file=https%3A%2F%2Fgithub.com%2Fbrevdev%2Fnotebooks%2Fblob%2Fmain%2Ftensorrt-comfyui.ipynb&launchableID=env-2hQX3n7ae5mq3NjNZ32DfAG0tJf) -
-Previous News - - * [2024/05/28] ✨#TensorRT weight stripping for ResNet-50 ✨ ✅+99% compression ✅1 set of weights → ** GPUs\ ✅0 performance loss ✅** models…LLM, CNN, etc 👀 📚 DIY [➡️ link](https://console.brev.dev/launchable/deploy?userID=2x2sil999&orgID=ktj33l4xj&launchableID=env-2h6bym7h5GFNho3vpWQQeUYMwTM&instance=L4%40g6.xlarge&diskStorage=500&cloudID=devplane-brev-1&baseImage=nvcr.io%2Fnvidia%2Ftensorrt%3A24.05-py3&file=https%3A%2F%2Fgithub.com%2FNVIDIA%2FTensorRT%2Fblob%2Frelease%2F10.0%2Fsamples%2Fpython%2Fsample_weight_stripping%2Fnotebooks%2Fweight_stripping.ipynb&name=tensorrt_weight_stripping_resnet50) @@ -68,10 +85,8 @@ Serverless TensorRT-LLM (LLaMA 3 8B) | Modal Docs [➡️ link](https://modal.co * [2024/05/08] NVIDIA TensorRT Model Optimizer -- the newest member of the #TensorRT ecosystem is a library of post-training and training-in-the-loop model optimization techniques ✅quantization ✅sparsity ✅QAT [➡️ blog](https://developer.nvidia.com/blog/accelerate-generative-ai-inference-performance-with-nvidia-tensorrt-model-optimizer-now-publicly-available/) - * [2024/05/07] 🦙🦙🦙 24,000 tokens per second 🛫Meta Llama 3 takes off with #TensorRT #LLM 📚[➡️ link](https://blogs.nvidia.com/blog/meta-llama3-inference-acceleration/) - * [2024/02/06] [🚀 Speed up inference with SOTA quantization techniques in TRT-LLM](./docs/source/blogs/quantization-in-TRT-LLM.md) * [2024/01/30] [ New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget](./docs/source/blogs/XQA-kernel.md) * [2023/12/04] [Falcon-180B on a single H200 GPU with INT4 AWQ, and 6.7x faster Llama-70B over A100](./docs/source/blogs/Falcon180B-H200.md) @@ -88,7 +103,7 @@ Serverless TensorRT-LLM (LLaMA 3 8B) | Modal Docs [➡️ link](https://modal.co ## TensorRT-LLM Overview TensorRT-LLM is a library for optimizing Large Language Model (LLM) inference. -It provides state-of-the-art optimziations, including custom attention kernels, inflight batching, paged KV caching, quantization (FP8, INT4 [AWQ](https://arxiv.org/abs/2306.00978), INT8 [SmoothQuant](https://arxiv.org/abs/2211.10438), ++) and much more, to perform inference efficiently on NVIDIA GPUs +It provides state-of-the-art optimizations, including custom attention kernels, inflight batching, paged KV caching, quantization (FP8, INT4 [AWQ](https://arxiv.org/abs/2306.00978), INT8 [SmoothQuant](https://arxiv.org/abs/2211.10438), ++) and much more, to perform inference efficiently on NVIDIA GPUs TensorRT-LLM provides a Python API to build LLMs into optimized [TensorRT](https://developer.nvidia.com/tensorrt) engines. diff --git a/benchmarks/README.md b/benchmarks/README.md index 00f450319..b368a6621 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -7,5 +7,6 @@ There are currently three workflows to benchmark TensorRT-LLM: - The recommended workflow that uses TensorRT-LLM C++ API and can take advantage of the latest features of TensorRT-LLM. * [Python benchmarks](./python) - The Python benchmarking scripts can only benchmark the Python runtime, which do not support the latest features, such as in-flight batching. -* [The Python benchmarking suite](./Suite.md) - - This benchmarking suite is a current work in progress and is prone to large changes. +* [The Python benchmarking suite](../docs/source/performance/perf-benchmarking.md) + - This benchmarker is native to TensorRT-LLM and is a Python benchmarker for reproducing and testing the performance of TensorRT-LLM. 
+ - _NOTE_: This benchmarking suite is a current work in progress and is prone to large changes. diff --git a/benchmarks/cpp/gptManagerBenchmark.cpp b/benchmarks/cpp/gptManagerBenchmark.cpp index 45632350c..8e5d94a12 100644 --- a/benchmarks/cpp/gptManagerBenchmark.cpp +++ b/benchmarks/cpp/gptManagerBenchmark.cpp @@ -145,6 +145,7 @@ struct BenchmarkParams { std::optional maxTokensInPagedKvCache{std::nullopt}; std::optional freeGpuMemoryFraction{std::nullopt}; + std::optional crossKvCacheFraction{std::nullopt}; bool enableTrtOverlap{false}; bool enableBlockReuse{false}; bool enableChunkedContext{false}; @@ -159,6 +160,8 @@ struct BenchmarkParams std::optional sinkTokenLength{std::nullopt}; bool multiBlockMode{true}; bool enableContextFMHAFP32Acc{false}; + bool cudaGraphMode{false}; + SizeType32 cudaGraphCacheSize{0}; // lora / peft params std::optional loraDir{std::nullopt}; @@ -470,7 +473,38 @@ class Recorder mRequestBenchInfos[requestId].firstTokenSeen = true; } - mRequestBenchInfos[requestId].outputLength += 1; + mRequestBenchInfos[requestId].decodingIter += 1; + } + + void recordToken(uint64_t requestId, std::list const& responseTensors) + { + int32_t outputLength = 1; + for (auto& tensor : responseTensors) + { + if (tensor.name == inference_request::kSequenceLengthTensorName) + { + // Tensor of shape nBeams, and we only need the first one + outputLength = *(bufferCast(*(tensor.tensor))); + break; + } + } + + mRequestBenchInfos[requestId].outputLength += outputLength; + this->recordToken(requestId); + } + + void recordToken(uint64_t requestId, texec::Response const& response) + { + auto outputTokenIds = response.getResult().outputTokenIds; + + int32_t outputLength = 1; + for (auto const& beam : outputTokenIds) + { + outputLength = std::max(static_cast(beam.size()), outputLength); + } + + mRequestBenchInfos[requestId].outputLength += outputLength; + this->recordToken(requestId); } void recordEnd(uint64_t requestId, std::list const& responseTensors, bool hasError) @@ -500,7 +534,7 @@ class Recorder } else { - this->recordToken(requestId); + this->recordToken(requestId, responseTensors); } } @@ -532,7 +566,7 @@ class Recorder } else { - this->recordToken(requestId); + this->recordToken(requestId, response); } } } @@ -818,11 +852,13 @@ class ExecutorServer texec::SchedulerConfig schedulerConfig(capacitySchedulerPolicy); texec::KvCacheConfig kvCacheConfig(benchmarkParams.enableBlockReuse, benchmarkParams.maxTokensInPagedKvCache, benchmarkParams.maxAttentionWindowVec, benchmarkParams.sinkTokenLength, - benchmarkParams.freeGpuMemoryFraction, benchmarkParams.kvHostCacheSize, benchmarkParams.kvOnboardBlocks); + benchmarkParams.freeGpuMemoryFraction, benchmarkParams.kvHostCacheSize, benchmarkParams.kvOnboardBlocks, + benchmarkParams.crossKvCacheFraction); texec::PeftCacheConfig peftCacheConfig(0, benchmarkParams.loraDeviceNumModLayers, 8, 64, 4, 4, 4, 24, 8, std::nullopt, benchmarkParams.loraHostCacheSize); - texec::ExtendedRuntimePerfKnobConfig extendedRuntimePerfKnobConfig( - benchmarkParams.multiBlockMode, benchmarkParams.enableContextFMHAFP32Acc); + texec::ExtendedRuntimePerfKnobConfig extendedRuntimePerfKnobConfig(benchmarkParams.multiBlockMode, + benchmarkParams.enableContextFMHAFP32Acc, benchmarkParams.cudaGraphMode, + benchmarkParams.cudaGraphCacheSize); texec::ExecutorConfig executorConfig( maxBeamWidth, schedulerConfig, kvCacheConfig, benchmarkParams.enableChunkedContext, true); executorConfig.setGpuWeightsPercent(benchmarkParams.gpuWeightsPercent); @@ -940,7 +976,7 @@ class 
ExecutorServer { if (!warmup && !response.hasError()) { - mRecorder->recordToken(reqId); + mRecorder->recordToken(reqId, response); } } } @@ -1228,7 +1264,7 @@ class GptServer { if (errMsg.empty()) { - mRecorder->recordToken(requestId); + mRecorder->recordToken(requestId, response_tensors); } } } @@ -1430,6 +1466,10 @@ void benchmarkGptManager(std::filesystem::path const& engineDir, TrtGptModelType { optionalParams.kvCacheConfig.freeGpuMemoryFraction = benchmarkParams.freeGpuMemoryFraction; } + if (benchmarkParams.crossKvCacheFraction) + { + optionalParams.kvCacheConfig.crossKvCacheFraction = benchmarkParams.crossKvCacheFraction; + } if (benchmarkParams.maxAttentionWindowVec) { optionalParams.kvCacheConfig.maxAttentionWindowVec = benchmarkParams.maxAttentionWindowVec; @@ -1458,8 +1498,8 @@ void benchmarkGptManager(std::filesystem::path const& engineDir, TrtGptModelType : benchmarkParams.executorLookaheadConfig.has_value() ? texec::DecodingMode::Lookahead() : texec::DecodingMode::Auto(), benchmarkParams.executorLookaheadConfig, benchmarkParams.medusaChoices); - optionalParams.extendedRuntimePerfKnobConfig = texec::ExtendedRuntimePerfKnobConfig( - benchmarkParams.multiBlockMode, benchmarkParams.enableContextFMHAFP32Acc); + optionalParams.extendedRuntimePerfKnobConfig = texec::ExtendedRuntimePerfKnobConfig(benchmarkParams.multiBlockMode, + benchmarkParams.enableContextFMHAFP32Acc, benchmarkParams.cudaGraphMode, benchmarkParams.cudaGraphCacheSize); auto const jsonConfig = GptJsonConfig::parse(engineDir / "config.json"); auto const worldConfig = WorldConfig::mpi(jsonConfig.getGpusPerNode(), jsonConfig.getTensorParallelism(), @@ -1874,6 +1914,8 @@ int main(int argc, char* argv[]) "random_seed", "integer random seed for exponential time delays.", cxxopts::value()->default_value("420")); options.add_options()( "kv_cache_free_gpu_mem_fraction", "K-V Cache Free Gpu Mem Fraction.", cxxopts::value()); + options.add_options()( + "cross_kv_cache_fraction", "Cross K-V Cache Fraction (from 0.0 to 1.0).", cxxopts::value()); options.add_options()("request_rate", "request rate in reqs/sec. Skipping this arg or negative value will trigger offline/0-delay.", cxxopts::value()); @@ -1895,7 +1937,8 @@ int main(int argc, char* argv[]) options.add_options()("return_generation_logits", "Whether to return generation logits.", cxxopts::value()->default_value("false")); - options.add_options()("scheduler_policy", "Choose scheduler policy between max_utilization/guaranteed_no_evict.", + options.add_options()("scheduler_policy", + "Choose scheduler policy between max_utilization/guaranteed_no_evict/static_batch.", cxxopts::value()->default_value("guaranteed_no_evict")); options.add_options()("first_batch_delay", @@ -1946,6 +1989,12 @@ int main(int argc, char* argv[]) cxxopts::value()->default_value("true")); options.add_options()( "encoder_engine_dir", "Directory that store the engines of the encoder models.", cxxopts::value()); + options.add_options()("cuda_graph_mode", "When enabled, inference is executed with cuda graph.", + cxxopts::value()->default_value("false")); + options.add_options()("cuda_graph_cache_size", + "Specify how many cuda graphs are cached in the runtime. 
Larger cache gives better perf, but consumes more GPU " + "memory.", + cxxopts::value()->default_value("0")); options.add_options()("enable_context_fmha_fp32_acc", "Enable FMHA runner FP32 accumulation", cxxopts::value()->default_value("false")); @@ -2040,6 +2089,20 @@ int main(int argc, char* argv[]) { benchmarkParams.freeGpuMemoryFraction = result["kv_cache_free_gpu_mem_fraction"].as(); } + // Argument: K-V Cache Cross Attention Fraction. Only applicable to enc-dec models. + if (result.count("encoder_engine_dir") && result.count("decoder_engine_dir")) + { + if (result.count("cross_kv_cache_fraction")) + { + benchmarkParams.crossKvCacheFraction = result["cross_kv_cache_fraction"].as(); + } + else + { + benchmarkParams.crossKvCacheFraction + = 0.5f; // default value if not set. but non enc-dec should not even have this param set + } + } + // Argument: Enable TRT overlap benchmarkParams.enableTrtOverlap = result["enable_trt_overlap"].as(); @@ -2131,6 +2194,12 @@ int main(int argc, char* argv[]) // Argument: enable_context_fmha_fp32_acc benchmarkParams.enableContextFMHAFP32Acc = result["enable_context_fmha_fp32_acc"].as(); + // Argument: cuda_graph_mode + benchmarkParams.cudaGraphMode = result["cuda_graph_mode"].as(); + + // Argument: cuda_graph_mode + benchmarkParams.cudaGraphCacheSize = result["cuda_graph_cache_size"].as(); + std::optional padId; // Argument: Padding token id if (result.count("pad_id")) @@ -2168,6 +2237,10 @@ int main(int argc, char* argv[]) { capacitySchedulerPolicy = texec::CapacitySchedulerPolicy::kGUARANTEED_NO_EVICT; } + else if (capacitySchedulerPolicyArg == "static_batch") + { + capacitySchedulerPolicy = texec::CapacitySchedulerPolicy::kSTATIC_BATCH; + } else { TLLM_LOG_ERROR("Unexpected scheduler policy: " + capacitySchedulerPolicyArg); @@ -2246,14 +2319,14 @@ int main(int argc, char* argv[]) { texec::ModelType executorModelType; std::optional decoderEngineDir = std::nullopt, encoderEngineDir = std::nullopt; - if (result.count("encoder_engine_dir") && result.count("engine_dir")) + if (result.count("encoder_engine_dir") && result.count("decoder_engine_dir")) { TLLM_CHECK_WITH_INFO(api == "executor", "encoder-decoder only support executor api."); TLLM_CHECK_WITH_INFO( modelType == TrtGptModelType::InflightFusedBatching, "encoder-decoder only support inflight batching."); executorModelType = texec::ModelType::kENCODER_DECODER; - decoderEngineDir = result["engine_dir"].as(); encoderEngineDir = result["encoder_engine_dir"].as(); + decoderEngineDir = result["decoder_engine_dir"].as(); } else if (result.count("engine_dir")) { diff --git a/benchmarks/cpp/utils/prepare_real_data.py b/benchmarks/cpp/utils/prepare_real_data.py index 5f14f6747..94383cfa2 100644 --- a/benchmarks/cpp/utils/prepare_real_data.py +++ b/benchmarks/cpp/utils/prepare_real_data.py @@ -231,8 +231,6 @@ def dataset(root_args, **kwargs): }, root_args.output) else: print_dataset( - task_ids, input_ids, output_lens, - tokenizer=None, ) diff --git a/benchmarks/python/gpt_benchmark.py b/benchmarks/python/gpt_benchmark.py index 04ba2ab0f..ce06c9f9f 100644 --- a/benchmarks/python/gpt_benchmark.py +++ b/benchmarks/python/gpt_benchmark.py @@ -80,7 +80,7 @@ def __init__(self, args, batch_sizes, in_out_lens, gpu_weights_percents, kv_cache_type = KVCacheType.CONTINUOUS if hasattr(self, 'kv_cache_type'): - kv_cache_type = self.kv_cache_type + kv_cache_type = KVCacheType(self.kv_cache_type) else: if hasattr(self, 'paged_kv_cache'): kv_cache_type = KVCacheType.PAGED if self.paged_kv_cache == True else 
KVCacheType.CONTINUOUS diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 32e89ae17..125526f7e 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -316,6 +316,8 @@ endif() get_filename_component(TRT_LLM_ROOT_DIR ${CMAKE_CURRENT_SOURCE_DIR} PATH) set(3RDPARTY_DIR ${TRT_LLM_ROOT_DIR}/3rdparty) +add_subdirectory(${3RDPARTY_DIR}/pybind11 ${CMAKE_CURRENT_BINARY_DIR}/pybind11) + include_directories( ${CUDAToolkit_INCLUDE_DIRS} ${CUDNN_ROOT_DIR}/include @@ -323,7 +325,8 @@ include_directories( ${3RDPARTY_DIR}/cutlass/include ${3RDPARTY_DIR}/cutlass/tools/util/include ${3RDPARTY_DIR}/NVTX/include - ${3RDPARTY_DIR}/json/include) + ${3RDPARTY_DIR}/json/include + ${3RDPARTY_DIR}/pybind11/include) # TRT dependencies set_ifndef(TRT_LIB_DIR ${CMAKE_BINARY_DIR}) @@ -381,7 +384,7 @@ endif() # set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -G") set(CMAKE_CXX_FLAGS - "${CMAKE_CXX_FLAGS} -DBUILD_SYSTEM=cmake_oss -DENABLE_MULTI_DEVICE=${ENABLE_MULTI_DEVICE} -DENABLE_UCX=${ENABLE_UCX}" + "${CMAKE_CXX_FLAGS} -DBUILD_SYSTEM=cmake_oss -DENABLE_MULTI_DEVICE=${ENABLE_MULTI_DEVICE}" ) # Fix linking issue with TRT 10, the detailed description about `--mcmodel` can @@ -561,6 +564,7 @@ if(ENABLE_UCX) NO_DEFAULT_PATH) endif() endif() +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DENABLE_UCX=${ENABLE_UCX}") file(STRINGS "${TRT_INCLUDE_DIR}/NvInferVersion.h" VERSION_STRINGS REGEX "#define NV_TENSORRT_.*") diff --git a/cpp/include/tensorrt_llm/batch_manager/capacityScheduler.h b/cpp/include/tensorrt_llm/batch_manager/capacityScheduler.h new file mode 100644 index 000000000..a08544e2a --- /dev/null +++ b/cpp/include/tensorrt_llm/batch_manager/capacityScheduler.h @@ -0,0 +1,187 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "common.h" +#include "tensorrt_llm/batch_manager/llmRequest.h" +#include "tensorrt_llm/common/algorithm.h" +#include "tensorrt_llm/runtime/common.h" +#include + +namespace tensorrt_llm::batch_manager +{ +namespace kv_cache_manager +{ +class KVCacheManager; +} +class BasePeftCacheManager; +} // namespace tensorrt_llm::batch_manager + +namespace tensorrt_llm::batch_manager +{ + +using tensorrt_llm::runtime::SizeType32; + +/// @brief This scheduler takes into account the given request capacity and the KV cache capacity. +/// Depending on the CapacitySchedulerPolicy it will schedule already started and new requests, +/// or even pause previously started requests. 
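The comment above describes the policy-driven capacity schedulers that this patch promotes to public headers, and the benchmark now accepts a third `static_batch` policy next to `max_utilization` and `guaranteed_no_evict`. As a rough usage sketch only, assuming `operator()` returns two request vectors (requests to schedule this iteration and requests to pause) and using an arbitrary limit of 64 in-flight requests:

```cpp
// Illustrative sketch: the request limit and the two-vector return shape are assumptions.
#include "tensorrt_llm/batch_manager/capacityScheduler.h"
#include "tensorrt_llm/executor/types.h"

#include <memory>
#include <utility>

namespace tb = tensorrt_llm::batch_manager;
namespace texec = tensorrt_llm::executor;

tb::RequestVector scheduleOneStep(tb::RequestList const& activeRequests,
    std::shared_ptr<tb::kv_cache_manager::KVCacheManager> kvCacheManager,
    std::shared_ptr<tb::BasePeftCacheManager> peftCacheManager)
{
    // Pick the new STATIC_BATCH policy; MAX_UTILIZATION or GUARANTEED_NO_EVICT are wired the same way.
    auto const scheduler = tb::CapacityScheduler::make(/*maxNumRequests=*/64, std::move(kvCacheManager),
        /*crossKvCacheManager=*/nullptr, std::move(peftCacheManager),
        texec::CapacitySchedulerPolicy::kSTATIC_BATCH);

    // Assumed result shape: requests that fit this iteration, and requests to pause.
    auto [requestsToRun, requestsToPause] = scheduler(activeRequests);
    return requestsToRun;
}
```

For encoder-decoder models a cross KV cache manager would be passed instead of `nullptr`, so the scheduler can account for both the self-attention and the cross-attention cache capacity.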
+class BaseCapacityScheduler +{ +public: + explicit BaseCapacityScheduler(LlmRequestState noScheduleUntilState, LlmRequestState noScheduleAfterState) + : mNoScheduleUntilState(noScheduleUntilState) + , mNoScheduleAfterState(noScheduleAfterState) + { + } + + [[nodiscard]] LlmRequestState constexpr getNoScheduleUntilState() const noexcept + { + return mNoScheduleUntilState; + } + + [[nodiscard]] LlmRequestState constexpr getNoScheduleAfterState() const noexcept + { + return mNoScheduleAfterState; + } + +private: + /// The state until/after which the scheduler should not schedule requests + LlmRequestState mNoScheduleUntilState; + LlmRequestState mNoScheduleAfterState; +}; + +/// @brief Schedule up to maxNumRequests requests +class MaxRequestsScheduler : public BaseCapacityScheduler +{ +public: + explicit MaxRequestsScheduler(SizeType32 maxNumRequests, + std::shared_ptr kvCacheManager, + std::shared_ptr crossKvCacheManager, + LlmRequestState noScheduleUntilState = LlmRequestState::kCONTEXT_INIT, + LlmRequestState noScheduleAfterState = LlmRequestState::kGENERATION_COMPLETE); + + /// @brief Takes as input a sorted list of requests and outputs a sorted lists of requests + /// to update for this current iteration, and a map of requests to pause + [[nodiscard]] std::tuple operator()(RequestList const& activeRequests) const; + +private: + SizeType32 mMaxNumRequests; + std::shared_ptr mKvCacheManager{nullptr}; + std::shared_ptr mCrossKvCacheManager{nullptr}; +}; + +/// @brief Schedule requests using the MAX_UTILIZATION policy +/// @details Try reserving resources to advance requests by one step, +/// may pause previously started requests. +class MaxUtilizationScheduler : public BaseCapacityScheduler +{ +public: + MaxUtilizationScheduler(SizeType32 maxNumRequests, std::shared_ptr kvCacheManager, + std::shared_ptr crossKvCacheManager, + std::shared_ptr peftCacheManager, bool manyMicroBatches, + LlmRequestState noScheduleUntilState = LlmRequestState::kCONTEXT_INIT, + LlmRequestState noScheduleAfterState = LlmRequestState::kGENERATION_COMPLETE); + + [[nodiscard]] std::tuple operator()(RequestList const& activeRequests) const; + +private: + /// @return {fitsKvCache, fitsPeft} + std::pair trySchedulingRequestMaxUtilization(std::shared_ptr const& req, + RequestVector& scheduledRequests, SizeType32& numScheduledBlocks, SizeType32& numScheduledPeftPages, + std::unordered_set& seenTaskIds) const; + + SizeType32 mMaxNumRequests; + std::shared_ptr mKvCacheManager{nullptr}; + std::shared_ptr mCrossKvCacheManager{nullptr}; + std::shared_ptr mPeftCacheManager{nullptr}; + /// @brief Boolean that indicates if multiple micro batches might be in flight + bool mManyMicroBatches; +}; + +/// @brief Schedule requests using the GUARANTEED_NO_EVICT policy +class GuaranteedNoEvictScheduler : public BaseCapacityScheduler +{ +public: + GuaranteedNoEvictScheduler(SizeType32 maxNumRequests, + std::shared_ptr kvCacheManager, + std::shared_ptr crossKvCacheManager, + std::shared_ptr peftCacheManager, + LlmRequestState noScheduleUntilState = LlmRequestState::kCONTEXT_INIT, + LlmRequestState noScheduleAfterState = LlmRequestState::kGENERATION_COMPLETE); + + [[nodiscard]] std::tuple operator()(RequestList const& activeRequests) const; + +protected: + [[nodiscard]] std::tuple forwardImpl( + RequestList const& activeRequests, bool staticBatchScheduling) const; + +private: + SizeType32 mMaxNumRequests; + std::shared_ptr mKvCacheManager{nullptr}; + std::shared_ptr mCrossKvCacheManager{nullptr}; + std::shared_ptr 
mPeftCacheManager{nullptr}; +}; + +/// @brief Schedule requests using the STATIC_BATCH policy +class StaticBatchScheduler : public GuaranteedNoEvictScheduler +{ +public: + StaticBatchScheduler(SizeType32 maxNumRequests, std::shared_ptr kvCacheManager, + std::shared_ptr crossKvCacheManager, + std::shared_ptr peftCacheManager, + LlmRequestState noScheduleUntilState = LlmRequestState::kCONTEXT_INIT, + LlmRequestState noScheduleAfterState = LlmRequestState::kGENERATION_COMPLETE); + + [[nodiscard]] std::tuple operator()(RequestList const& activeRequests) const; +}; + +class CapacityScheduler : public Algorithm +{ +public: + constexpr static auto name{"CapacityScheduler"}; + + CapacityScheduler() = default; + + CapacityScheduler(SizeType32 maxNumRequests, std::shared_ptr kvCacheManager, + std::shared_ptr crossKvCacheManager, + std::shared_ptr peftCacheManager, + executor::CapacitySchedulerPolicy capacitySchedulerPolicy, bool manyMicroBatches = false, + LlmRequestState noScheduleUntilState = LlmRequestState::kCONTEXT_INIT, + LlmRequestState noScheduleAfterState = LlmRequestState::kGENERATION_COMPLETE); + + static CapacityScheduler make(SizeType32 maxNumRequests, + std::shared_ptr kvCacheManager, + std::shared_ptr crossKvCacheManager, + std::shared_ptr peftCacheManager, + executor::CapacitySchedulerPolicy capacitySchedulerPolicy, bool manyMicroBatches = false, + LlmRequestState noScheduleUntilState = LlmRequestState::kCONTEXT_INIT, + LlmRequestState noScheduleAfterState = LlmRequestState::kGENERATION_COMPLETE) + { + return CapacityScheduler{maxNumRequests, std::move(kvCacheManager), std::move(crossKvCacheManager), + std::move(peftCacheManager), capacitySchedulerPolicy, manyMicroBatches, noScheduleUntilState, + noScheduleAfterState}; + } + + [[nodiscard]] std::tuple operator()(RequestList const& activeRequests) const; + +private: + std::variant + mScheduler; +}; + +} // namespace tensorrt_llm::batch_manager diff --git a/cpp/include/tensorrt_llm/batch_manager/common.h b/cpp/include/tensorrt_llm/batch_manager/common.h new file mode 100644 index 000000000..6e4a76bc4 --- /dev/null +++ b/cpp/include/tensorrt_llm/batch_manager/common.h @@ -0,0 +1,118 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "tensorrt_llm/runtime/common.h" +#include +#include +#include +#include +#include +#include + +namespace tensorrt_llm::executor +{ +class RequestWithId; +} + +namespace tensorrt_llm::batch_manager +{ +class LlmRequest; + +using RequestList = std::list>; +using RequestIdType = std::uint64_t; +using RequestVector = std::vector>; +using ReqIdsSet = std::unordered_set; + +class ScheduledRequests +{ +public: + /// @brief context phase requests (for decoder-only models) or encoder phase requests (for encoder-decoder models + /// and encoder-only models) + RequestVector contextRequests; + + /// @brief generation phase requests (for decoder-only models) or empty for others + RequestVector generationRequests; + + ScheduledRequests() = default; + + explicit ScheduledRequests(RequestVector contextRequests, RequestVector generationRequests) + : contextRequests{std::move(contextRequests)} + , generationRequests{std::move(generationRequests)} + { + } + + [[nodiscard]] bool empty() const + { + return contextRequests.empty() && generationRequests.empty(); + } + + [[nodiscard]] std::size_t size() const + { + return contextRequests.size() + generationRequests.size(); + } +}; + +class BatchState +{ +public: + BatchState() = default; + + BatchState(runtime::SizeType32 numCtxRequests, runtime::SizeType32 numGenRequests, runtime::SizeType32 numTokens, + runtime::SizeType32 maxKvCacheLength) + : mNumCtxRequests{numCtxRequests} + , mNumGenRequests{numGenRequests} + , mNumTokens{numTokens} + , mMaxKvCacheLength{maxKvCacheLength} + { + } + + bool isAnyContext() const + { + return mNumCtxRequests > 0; + } + + bool operator==(BatchState const& other) const + { + return mNumCtxRequests == other.mNumCtxRequests && mNumGenRequests == other.mNumGenRequests + && mNumTokens == other.mNumTokens && mMaxKvCacheLength == other.mMaxKvCacheLength; + } + + size_t hash() const + { + size_t h1 = std::hash{}(mNumCtxRequests); + size_t h2 = std::hash{}(mNumGenRequests); + size_t h3 = std::hash{}(mNumTokens); + size_t h4 = std::hash{}(mMaxKvCacheLength); + return h1 ^ h2 ^ h3 ^ h4; + } + + runtime::SizeType32 mNumCtxRequests; + runtime::SizeType32 mNumGenRequests; + runtime::SizeType32 mNumTokens; + runtime::SizeType32 mMaxKvCacheLength; +}; + +struct BatchStateHash +{ + size_t operator()(BatchState const& bs) const + { + return bs.hash(); + } +}; + +} // namespace tensorrt_llm::batch_manager diff --git a/cpp/include/tensorrt_llm/batch_manager/evictionPolicy.h b/cpp/include/tensorrt_llm/batch_manager/evictionPolicy.h new file mode 100644 index 000000000..a7326eee7 --- /dev/null +++ b/cpp/include/tensorrt_llm/batch_manager/evictionPolicy.h @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "tensorrt_llm/batch_manager/kvCacheManager.h" + +#include + +using namespace tensorrt_llm::batch_manager::kv_cache_manager; + +namespace tensorrt_llm::batch_manager::eviction_policy +{ + +class BaseEvictionPolicy +{ +public: + virtual ~BaseEvictionPolicy() = default; + + virtual void initialize( + std::vector& mAllBlocksById, SizeType32 numPrimaryBlocks, SizeType32 numSecondaryBlocks) + = 0; + + // Get a free block from the primary memory pool + virtual BlockPtr getFreePrimaryBlock() = 0; + // Get a free block from the secondary memory pool + virtual BlockPtr getFreeSecondaryBlock() = 0; + // Release a block. Prioritize the block for eviction if toFront=true + virtual void releaseBlock(BlockPtr block, bool toFront = false) = 0; + // Get the amount of free blocks in the primary memory pool + virtual SizeType32 getNumFreePrimaryBlocks() = 0; + // Get the amount of free blocks in the secondary memory pool + virtual SizeType32 getNumFreeSecondaryBlocks() = 0; + // Claim a free block. Called when the cache manager allocates or reuses a new block + virtual void claimBlock(KVCacheBlock block) = 0; +}; + +class LRUEvictionPolicy : public BaseEvictionPolicy +{ +public: + void initialize( + std::vector& mAllBlocksById, SizeType32 numPrimaryBlocks, SizeType32 numSecondaryBlocks) override; + BlockPtr getFreePrimaryBlock() override; + BlockPtr getFreeSecondaryBlock() override; + void releaseBlock(BlockPtr block, bool toFront = false) override; + SizeType32 getNumFreePrimaryBlocks() override; + SizeType32 getNumFreeSecondaryBlocks() override; + + void claimBlock(KVCacheBlock block); + +private: + FreeBlocksQueue mFreePrimaryBlocks; + FreeBlocksQueue mFreeSecondaryBlocks; + + std::vector> mFreeBlockIterators; + + SizeType32 mFreePrimaryBlocksSize; + SizeType32 mFreeSecondaryBlocksSize; +}; + +} // namespace tensorrt_llm::batch_manager::eviction_policy diff --git a/cpp/include/tensorrt_llm/batch_manager/kvCacheConfig.h b/cpp/include/tensorrt_llm/batch_manager/kvCacheConfig.h index 0aa80adfe..b7295650a 100644 --- a/cpp/include/tensorrt_llm/batch_manager/kvCacheConfig.h +++ b/cpp/include/tensorrt_llm/batch_manager/kvCacheConfig.h @@ -41,7 +41,8 @@ class KvCacheConfig std::optional> maxAttentionWindowVec = std::nullopt, std::optional sinkTokenLength = std::nullopt, std::optional freeGpuMemoryFraction = std::nullopt, bool enableBlockReuse = false, bool useUvm = false, - std::optional hostCacheSize = std::nullopt, bool onboardBlocks = true) + std::optional hostCacheSize = std::nullopt, bool onboardBlocks = true, + std::optional crossKvCacheFraction = std::nullopt) : maxTokens{maxTokens} , maxAttentionWindowVec{maxAttentionWindowVec} , sinkTokenLength{sinkTokenLength} @@ -50,6 +51,7 @@ class KvCacheConfig , useUvm(useUvm) , hostCacheSize(hostCacheSize) , onboardBlocks(onboardBlocks) + , crossKvCacheFraction{crossKvCacheFraction} { } @@ -57,7 +59,7 @@ class KvCacheConfig : KvCacheConfig(kvCacheConfig.getMaxTokens(), kvCacheConfig.getMaxAttentionWindowVec(), kvCacheConfig.getSinkTokenLength(), kvCacheConfig.getFreeGpuMemoryFraction(), kvCacheConfig.getEnableBlockReuse(), false, kvCacheConfig.getHostCacheSize(), - kvCacheConfig.getOnboardBlocks()) + kvCacheConfig.getOnboardBlocks(), kvCacheConfig.getCrossKvCacheFraction()) { } @@ -66,7 +68,8 @@ class KvCacheConfig return maxTokens == other.maxTokens && maxAttentionWindowVec == other.maxAttentionWindowVec && sinkTokenLength == other.sinkTokenLength && freeGpuMemoryFraction == other.freeGpuMemoryFraction && enableBlockReuse == 
other.enableBlockReuse && useUvm == other.useUvm - && hostCacheSize == other.hostCacheSize && onboardBlocks == other.onboardBlocks; + && hostCacheSize == other.hostCacheSize && onboardBlocks == other.onboardBlocks + && crossKvCacheFraction == other.crossKvCacheFraction; } friend std::ostream& operator<<(std::ostream& os, KvCacheConfig const& self); @@ -80,5 +83,7 @@ class KvCacheConfig bool useUvm; std::optional hostCacheSize; bool onboardBlocks; + // Cross will use crossKvCacheFraction of KV Cache and self attention will use the rest. + std::optional crossKvCacheFraction; }; } // namespace tensorrt_llm::batch_manager::kv_cache_manager diff --git a/cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h b/cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h index 38b49bd23..cc7aa9374 100644 --- a/cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h +++ b/cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h @@ -22,6 +22,7 @@ #include "tensorrt_llm/runtime/bufferManager.h" #include "tensorrt_llm/runtime/common.h" #include "tensorrt_llm/runtime/cudaStream.h" +#include "tensorrt_llm/runtime/iBuffer.h" #include "tensorrt_llm/runtime/iTensor.h" #include "tensorrt_llm/runtime/modelConfig.h" #include "tensorrt_llm/runtime/worldConfig.h" @@ -29,13 +30,18 @@ #include #include -#include +#include #include #include #include #include #include +namespace tensorrt_llm::batch_manager::eviction_policy +{ +class BaseEvictionPolicy; +} + namespace tensorrt_llm::batch_manager::kv_cache_manager { @@ -124,6 +130,8 @@ class KVCacheBlock [[nodiscard]] IdType getBlockId() const; + [[nodiscard]] NextBlockMap getNextBlocks() const; + [[nodiscard]] kernels::KVCacheIndex::UnderlyingType getMemoryPoolBlockIndex() const; [[nodiscard]] bool isPrimary() const; @@ -144,22 +152,12 @@ class KVCacheBlock [[nodiscard]] VecUniqueTokens const& getUniqueTokens() const; - void setFreeBlockIterator(FreeBlocksQueue::iterator freeBlockIterator); - - void resetFreeBlockIterator(); - - [[nodiscard]] std::optional const& getFreeBlockIterator() const; - void setPrevBlock(BlockPtr prevBlock); void addNextBlock(BlockKey const& blockKey, BlockPtr block); void removeNextBlock(BlockKey const& blockKey); - static std::shared_ptr findBestGPUBlockToFree(std::shared_ptr searchStart); - - static std::shared_ptr findLeafBlock(std::shared_ptr searchStart); - [[nodiscard]] BlockPtr findMatchingBlock(BlockKey const& blockKey) const; //! \brief Free block from previous block if present. 
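The `crossKvCacheFraction` option added above carves out a share of the paged KV cache for cross attention in encoder-decoder models, while the remaining fraction serves self attention; `gptManagerBenchmark` defaults it to 0.5 when both encoder and decoder engines are given. A minimal sketch of building the executor-level configs with this option and the new CUDA graph knobs, keeping the argument order used by the benchmark calls above (all literal values are placeholders, not tuned settings):

```cpp
// Sketch under assumptions: argument order mirrors the gptManagerBenchmark calls shown
// earlier in this patch; the numeric values are placeholders.
#include "tensorrt_llm/executor/executor.h"

#include <optional>

namespace texec = tensorrt_llm::executor;

// Paged KV cache split for an encoder-decoder engine: cross attention gets half of the
// cache, self attention the rest. Non enc-dec models leave crossKvCacheFraction unset.
texec::KvCacheConfig makeEncDecKvCacheConfig()
{
    return texec::KvCacheConfig(/*enableBlockReuse=*/false, /*maxTokens=*/std::nullopt,
        /*maxAttentionWindowVec=*/std::nullopt, /*sinkTokenLength=*/std::nullopt,
        /*freeGpuMemoryFraction=*/0.9f, /*hostCacheSize=*/std::nullopt, /*onboardBlocks=*/true,
        /*crossKvCacheFraction=*/0.5f);
}

// The two new runtime knobs: run inference with CUDA graphs and keep up to 8 graphs
// cached. A larger cache can improve performance but consumes more GPU memory.
texec::ExtendedRuntimePerfKnobConfig makeCudaGraphPerfKnobs()
{
    return texec::ExtendedRuntimePerfKnobConfig(/*multiBlockMode=*/true,
        /*enableContextFMHAFP32Acc=*/false, /*cudaGraphMode=*/true, /*cudaGraphCacheSize=*/8);
}
```

Depending on the chosen API, the benchmark feeds these either into `texec::ExecutorConfig` or into `TrtGptModelOptionalParams`.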
@@ -203,14 +201,21 @@ class GenerationRequest { public: using SizeType32 = tensorrt_llm::runtime::SizeType32; - using SharedPtr = std::shared_ptr; - explicit GenerationRequest(SizeType32 seqSlotIdx, SizeType32 numTokens, SizeType32 beamWidth) - : mSeqSlotIdx(seqSlotIdx) + explicit GenerationRequest(LlmRequest::RequestIdType requestId, SizeType32 numTokens, SizeType32 beamWidth, + SizeType32 maxBlocks, SizeType32 numPools = 1) + : mRequestId(requestId) , mNumTokens(numTokens) , mBeamWidth(beamWidth) , mCacheBlockIds(beamWidth) + , mCacheBlockIndices{ + runtime::BufferManager::cpu(runtime::ITensor::makeShape({numPools, beamWidth, 2, maxBlocks}), + runtime::TRTDataType::value)} { + auto cacheBlockIdsRange = runtime::BufferRange(*mCacheBlockIndices); + std::fill(cacheBlockIdsRange.begin(), cacheBlockIdsRange.end(), + tensorrt_llm::kernels::KVCacheIndex{ + std::numeric_limits::max()}); } void addNewTokens(SizeType32 n) @@ -225,9 +230,9 @@ class GenerationRequest mNumTokens -= n; } - [[nodiscard]] SizeType32 getSequenceSlotIdx() const + [[nodiscard]] LlmRequest::RequestIdType getRequestId() const { - return mSeqSlotIdx; + return mRequestId; } [[nodiscard]] SizeType32 getNumTokens() const @@ -245,6 +250,16 @@ class GenerationRequest return mCacheBlockIds; } + [[nodiscard]] runtime::ITensor& getCacheBlockIndices() + { + return *mCacheBlockIndices; + } + + [[nodiscard]] runtime::ITensor const& getCacheBlockIndices() const + { + return *mCacheBlockIndices; + } + void addCacheBlock(SizeType32 beamIdx, KVCacheBlock::IdType blockId) { mCacheBlockIds.at(beamIdx).push_back(blockId); @@ -272,37 +287,64 @@ class GenerationRequest } private: - // Slot id of the sequence - SizeType32 mSeqSlotIdx; + // Request id of the sequence + LlmRequest::RequestIdType mRequestId; // Current number of generated tokens SizeType32 mNumTokens; // Number of beams SizeType32 mBeamWidth; - // List of blocks allocated for each beam of the sequence + // List of block ids allocated for each beam of the sequence std::vector> mCacheBlockIds; + // Tensor of block indices allocated for each beam of the sequence + runtime::ITensor::SharedPtr mCacheBlockIndices; }; -// BlockManager manages overall metadata of KVCacheBlocks in a layer of the -// network. Layers are expected to be symmetric, so the metadata can be -// reused for all layers of the network. -// The array of cache blocks for a layer is called a pool. -// Each pool has shape [max_blocks, 2, num_heads, tokens_per_block, head_size]. -// Size per block and number of blocks per pool are pre-determined and set in -// constructor. These should not be changed after. -// Block shape is [2, num_heads, tokens_per_block, head_size]. +// attach metadata to a pool pointer +class KVCacheBlockPool +{ +public: + SizeType32 numKvHeads; + SizeType32 numLayers; + SizeType32 blockSize; + + // Memory pools. Primary is fast memory, secondary is slower memory used for offloading. + runtime::ITensor::SharedPtr primaryPtr; + runtime::ITensor::SharedPtr secondaryPtr; + + KVCacheBlockPool(SizeType32 numKvHeads, SizeType32 numLayers, SizeType32 blockSize, + runtime::ITensor::SharedPtr primaryPtr = nullptr, runtime::ITensor::SharedPtr secondaryPtr = nullptr) + : numKvHeads(numKvHeads) + , numLayers(numLayers) + , blockSize(blockSize) + , primaryPtr(std::move(primaryPtr)) + , secondaryPtr(std::move(secondaryPtr)) + { + } +}; + +// The BlockManager manages the metadata of KVCacheBlocks. +// It manages multiple arrays of cache blocks called pools. 
+// Layers with the same number of kv heads are grouped under the same pool. +// Each pool has shape [max_blocks, num_layers, 2, num_kv_heads, tokens_per_block, head_size], where num_layers refers +// to the number of layers with the same num_kv_heads that share that pool. +// The metadata of KVCacheBlocks is shared between layers, so each block spans all of the managed pools - an allocated +// block matches some chunk of memory in each pool. The shape of the chunk in every pool is [2, num_kv_heads, +// tokens_per_block, head_size]. The size per block and number of blocks are pre-determined and set in the constructor. // BlockManager maintains a list of free blocks at any time. // Alloc pops off the block at the front, and Free pushes it back to the vector. -// BlockManager maintains a vector of lists of seqSlotIdx to allocated blocks +// BlockManager maintains a vector of lists of request ids to allocated blocks // per sequence. This can be used to Free all blocks belonging to a sequence. class BlockManager { public: using SizeType32 = tensorrt_llm::runtime::SizeType32; using CacheType = tensorrt_llm::batch_manager::kv_cache_manager::CacheType; + using BaseEvictionPolicy = tensorrt_llm::batch_manager::eviction_policy::BaseEvictionPolicy; - explicit BlockManager(SizeType32 numLayers, SizeType32 numKvHeads, SizeType32 sizePerHead, + explicit BlockManager(std::vector const& numKvHeadsPerLayer, SizeType32 sizePerHead, SizeType32 tokensPerBlock, SizeType32 blocksInPrimaryPool, SizeType32 blocksInSecondaryPool, - std::shared_ptr stream, bool onboardBlocks, CacheType cacheType = CacheType::kSELF); + SizeType32 maxNumSequences, std::shared_ptr stream, bool onboardBlocks, + CacheType cacheType = CacheType::kSELF); ~BlockManager(); @@ -317,10 +359,6 @@ class BlockManager //! \brief Assign blocks for new sequence. Does not try to reuse blocks. void addSequence(GenerationRequest& sequence, SizeType32 numBlocks, SizeType32 unsharedBlockIdx); - //! \brief Release block, which puts it back onto free blocks queue. - //! \details Block appended by default, will be put at front if toFront is true. - void releaseBlock(std::shared_ptr block, bool toFront = false); - //! \brief Allocate new block for each beam of the sequence. //! \details Might free cached blocks if no free blocks are available. void allocateBlock(GenerationRequest& sequence, bool shareAmongBeams = false); @@ -336,10 +374,7 @@ class BlockManager //! \brief Release last block in the sequence void releaseLastBlock(GenerationRequest& sequence); - [[nodiscard]] SizeType32 getNumFreeBlocks() const noexcept - { - return mFreePrimaryBlocks.size(); - } + [[nodiscard]] SizeType32 getNumFreeBlocks() const noexcept; [[nodiscard]] SizeType32 getNumAllocTotalBlocks() const { @@ -381,21 +416,26 @@ class BlockManager return mTokensPerBlock; } - //! \brief Get size of one K/V cache block in one layer. - //! @details Volume of [numKvHeads, tokensPerBlock, sizePerHead] + //! \brief Get size of one K/V cache block in one layer for the specified pool. + //! @details Volume of [numKvHeads, tokensPerBlock, sizePerHead] in the specified pool. 
+ [[nodiscard]] SizeType32 getBlockSize(SizeType32 poolIdx) const { - return mBlockSize; + return mPools.at(poolIdx).blockSize; } - [[nodiscard]] runtime::ITensor::SharedPtr getPrimaryPool() const noexcept + [[nodiscard]] SizeType32 getNumPools() const noexcept { - return mPrimaryPool; + return mPools.size(); } - [[nodiscard]] runtime::ITensor::SharedPtr getSecondaryPool() const noexcept + [[nodiscard]] runtime::ITensor::SharedPtr getPrimaryPool(SizeType32 poolIdx) const { - return mSecondaryPool; + return mPools.at(poolIdx).primaryPtr; + } + + [[nodiscard]] runtime::ITensor::SharedPtr getSecondaryPool(SizeType32 poolIdx) const + { + return mPools.at(poolIdx).secondaryPtr; } [[nodiscard]] SizeType32 getNumLayers() const @@ -403,10 +443,32 @@ class BlockManager return mNumLayers; } + [[nodiscard]] SizeType32 getNumPrimaryBlocks() const + { + return mNumPrimaryBlocks; + } + + [[nodiscard]] SizeType32 getNumSecondaryBlocks() const + { + return mNumSecondaryBlocks; + } + + [[nodiscard]] CacheType getCacheType() const + { + return mCacheType; + } + + [[nodiscard]] SizeType32 getLayerPoolIdx(SizeType32 layerIdx) const + { + return mLayerToPool.at(layerIdx); + } + //! \brief Get index in pool to K or V block. //! \param blockId the blockId as returned by getBlockId() //! \param fieldIdx either 0 (K) or 1 (V), - [[nodiscard]] kernels::KVCacheIndex getKOrVBlockIndex(KVCacheBlock::IdType blockId, SizeType32 fieldIdx) const; + //! \param poolIdx the index of the pool for which the index is calculated (each pool has different strides) + [[nodiscard]] kernels::KVCacheIndex getKOrVBlockIndex( + KVCacheBlock::IdType blockId, SizeType32 fieldIdx, SizeType32 poolIdx) const; //! \brief Bring offloaded block from secondary to primary memory. //! \details Does nothing of block is already in primary memory. @@ -417,6 +479,11 @@ class BlockManager BlockKey findNewContextBlock( VecUniqueTokens const& uniqueTokens, std::shared_ptr const& llmRequest) const; + [[nodiscard]] runtime::BufferManager const& getBufferManager() const + { + return mBufferManager; + } + private: //! \brief Add single block to beam of sequence and mAllocatedBlocksPerSeq. void addBlockToBeam(BlockPtr& block, GenerationRequest& sequence, SizeType32 beamIdx); @@ -436,22 +503,15 @@ class BlockManager SizeType32 loadOrAllocateBlocks( std::list const& blockKeys, SizeType32 numContextBlocks, GenerationRequest& sequence); - //! \brief Find best primary block to free. - //! \details The best primary block to free is the primary block that appears first in the queue and have no primary - //! block descendants - [[nodiscard]] std::shared_ptr findBestGPUBlockToFree(); - //! \brief Find block least likely to be reused, free it if necessary and return. [[nodiscard]] BlockPtr getFreeBlock(); - //! \brief Claim block if it is in free blocks list. - void claimBlock(KVCacheBlock& block); - //! \brief Free block from previous block and claim it from free blocks list. void claimLeafBlock(KVCacheBlock& block); //! \brief Compute pointer to raw KV block (K & V, all layers). - [[nodiscard]] runtime::ITensor::SharedPtr computeBlockPointer(std::shared_ptr block) const; + [[nodiscard]] runtime::ITensor::SharedPtr computeBlockPointer( + std::shared_ptr block, SizeType32 poolIdx) const; //! \brief Copy content of src block to dst. void copyBlock(BlockPtr src, BlockPtr dst); @@ -460,23 +520,24 @@ class BlockManager // Number of blocks in pools SizeType32 mNumPrimaryBlocks; SizeType32 mNumSecondaryBlocks; - // List of free blocks. 
Blocks are either backed by fast primary memory or slow secondary memory, - // we maintain separate queues for these. - FreeBlocksQueue mFreePrimaryBlocks; - FreeBlocksQueue mFreeSecondaryBlocks; + // List of allocated blocks for each sequences - std::vector> mAllocatedBlocksPerSeq; - // Memory pools. Primary is fast memory, secondary is slower memory used for offloading. - runtime::ITensor::SharedPtr mPrimaryPool; - runtime::ITensor::SharedPtr mSecondaryPool; + std::unordered_map> mAllocatedBlocksPerSeq; + + // Pool per unique numKvHeads in the model + std::vector mPools; + // Matching of model layers to their pools + std::vector mLayerToPool; + // Whether offloaded blocks should be onboarded before reuse. bool mOnboardBlocks; // Buffer manager runtime::BufferManager mBufferManager; + + // Size of a single KV heads + SizeType32 mSizePerHead; // Number of layers SizeType32 mNumLayers; - // Volume of [numKvHeads, tokensPerBlock, sizePerHead] - SizeType32 mBlockSize; // Used to keep track of number of free blocks during scheduling SizeType32 mSchedulingNumFreeBlocks; // Number of tokens per one block @@ -489,6 +550,8 @@ class BlockManager std::size_t mAllocTotalBlocks, mAllocNewBlocks, mReusedBlocks; // KV cache type (self or cross) CacheType mCacheType; + // Eviction Policy + std::shared_ptr mEvictionPolicy; private: friend class KVCacheManager; @@ -497,17 +560,24 @@ class BlockManager class KVCacheManager { public: + friend class KVCacheManagerBindings; + using SizeType32 = tensorrt_llm::runtime::SizeType32; - using SequencesPtr = GenerationRequest::SharedPtr; using CudaStreamPtr = std::shared_ptr; using CacheType = tensorrt_llm::batch_manager::kv_cache_manager::CacheType; - KVCacheManager(SizeType32 numLayers, SizeType32 numKvHeads, SizeType32 sizePerHead, SizeType32 tokensPerBlock, + KVCacheManager(std::vector const& numKvHeadsPerLayer, SizeType32 sizePerHead, SizeType32 tokensPerBlock, SizeType32 blocksInPrimaryPool, SizeType32 blocksInSecondaryPool, SizeType32 maxNumSequences, SizeType32 maxBeamWidth, SizeType32 maxAttentionWindow, SizeType32 sinkTokenLength, bool useOneMoreBlock, CudaStreamPtr stream, bool enableBlockReuse = false, bool onboardBlocks = true, CacheType cacheType = CacheType::kSELF); + KVCacheManager(SizeType32 numLayers, SizeType32 numKvHeads, SizeType32 sizePerHead, SizeType32 tokensPerBlock, + SizeType32 blocksInPrimaryPool, SizeType32 blocksInSecondaryPool, SizeType32 maxNumSequences, + SizeType32 maxBeamWidth, SizeType32 maxAttentionWindow, SizeType32 sinkTokenLength, bool useOneMoreBlock, + CudaStreamPtr stream, bool enableBlockReuse = true, bool onboardBlocks = true, + CacheType cacheType = CacheType::kSELF); + void allocatePools(nvinfer1::DataType dtype, bool useUvm = false); void startScheduling(); @@ -583,10 +653,10 @@ class KVCacheManager /// @return The number of blocks [[nodiscard]] SizeType32 getRemainingBlocksToCompletion(LlmRequest const& req) const; - void addContextTokens(SizeType32 seqSlotIdx, SizeType32 numTokens); + void addContextTokens(LlmRequest::RequestIdType requestId, SizeType32 numTokens); - /// @brief Increase size for request at seqSlotIdx. Allocate new KV cache block(s) if needed. - void addToken(SizeType32 seqSlotIdx); + /// @brief Increase size for request with requestId. Allocate new KV cache block(s) if needed. + void addToken(LlmRequest::RequestIdType requestId); /// @brief Add new request to the KV cache manager. /// @param inputLength Input length for which KV cache need to be allocated. 
@@ -594,34 +664,40 @@ class KVCacheManager /// @param llmRequest Optional request to use for KV cache lookup. /// @details If llmRequest is supplied and KV cache reuse is enabled, try to recover KV cache blocks for /// inputLength - 1 tokens and populate prepopulatedPromptLen. - void addSequence(SizeType32 seqSlotIdx, SizeType32 inputLength, SizeType32 beamWidth, + void addSequence(LlmRequest::RequestIdType requestId, SizeType32 inputLength, SizeType32 beamWidth, std::shared_ptr const& llmRequest = nullptr); - void removeSequence(SizeType32 seqSlotIdx, std::shared_ptr const& llmRequest = nullptr); + void removeSequence(LlmRequest::RequestIdType requestId, std::shared_ptr const& llmRequest = nullptr); - void schedulingRemoveSequence(SizeType32 seqSlotIdx); + void schedulingRemoveSequence(LlmRequest::RequestIdType requestId); - [[nodiscard]] runtime::ITensor::UniquePtr getBlockPoolPointers() const; + [[nodiscard]] runtime::ITensor::SharedPtr getBlockPoolPointers() const + { + return mBlockPoolPointers; + } + + [[nodiscard]] runtime::ITensor::SharedPtr getLayerToPoolMapping() const + { + return mLayerToPoolMapping; + } void getBlockOffsetsOfBatch( runtime::ITensor& output, SizeType32 firstBatchSlotIdx, SizeType32 batchSize, SizeType32 beamWidth) const; //! @return maxBlockCount of all beams SizeType32 copyBlockOffsets( - runtime::ITensor& output, SizeType32 outputSlotOffset, SizeType32 seqSlotIdx, SizeType32 beamWidth) const; - - // Volume of [2, numKvHeads, tokensPerBlock, sizePerHead] - [[nodiscard]] static SizeType32 constexpr calculatePageSize(tensorrt_llm::runtime::ModelConfig const& modelConfig) - { - return 2 * modelConfig.getNbKvHeads() * modelConfig.getTokensPerBlock() * modelConfig.getSizePerHead(); - } + runtime::ITensor& output, SizeType32 outputSlotOffset, LlmRequest::RequestIdType requestId) const; - // numLayers * 2 * numKvHeads * sizePerHead - [[nodiscard]] static SizeType32 constexpr calculateCacheSizePerToken( + // Sum of numLayers * 2 * numKvHeads * sizePerHead for each pool + [[nodiscard]] static SizeType32 calculateCacheSizePerToken( tensorrt_llm::runtime::ModelConfig const& modelConfig, tensorrt_llm::runtime::WorldConfig const& worldConfig) { - return modelConfig.getNbAttentionLayers(worldConfig.getPipelineParallelism()) * 2 * modelConfig.getNbKvHeads() - * modelConfig.getSizePerHead(); + // NOTE: We expect the initialization of modelConfig to have already taken the tp size into account and do not + // address it here + // consider only local layers for the calculation + return modelConfig.getSumLocalKvHeads( + worldConfig.getPipelineParallelism(), worldConfig.getPipelineParallelRank()) + * 2 * modelConfig.getSizePerHead(); } [[nodiscard]] static std::tuple const calculateMaxNumBlocks(KvCacheConfig const& config, @@ -633,14 +709,14 @@ class KVCacheManager return mEnableBlockReuse; } - void removeToken(SizeType32 seqSlotIdx); - void rewindKVCache(SizeType32 seqSlotIdx, SizeType32 rewindLengths); + void removeToken(LlmRequest::RequestIdType requestId); + void rewindKVCache(LlmRequest::RequestIdType requestId, SizeType32 rewindLengths); - [[nodiscard]] GenerationRequest const& getSequence(SizeType32 seqSlotIdx) const; + [[nodiscard]] GenerationRequest const& getSequence(LlmRequest::RequestIdType requestId) const; [[nodiscard]] bool isCrossKv() const { - return mCacheType == CacheType::kCROSS; + return mBlockManager.getCacheType() == CacheType::kCROSS; } //! \brief Find first new block that must be allocated for context phase and return its concatenated token vector. 
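// Editorial sketch (not part of the patch): typical KVCacheManager usage now that sequences are
// keyed by request id instead of a sequence-slot index. kvCacheManager, llmRequest (a shared_ptr
// to LlmRequest) and beamWidth are assumed to exist in the surrounding batch manager code; only
// methods declared above are used.
auto const requestId = llmRequest->mRequestId;
// Context phase: allocate (or reuse, when block reuse is enabled) blocks for the prompt.
kvCacheManager.addSequence(requestId, llmRequest->getPromptLen(), beamWidth, llmRequest);
// Generation phase: each new token may trigger allocation of another block.
kvCacheManager.addToken(requestId);
// Completion: return the request's blocks to the block manager.
kvCacheManager.removeSequence(requestId, llmRequest);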
@@ -650,7 +726,7 @@ class KVCacheManager //! \brief Store full context blocks contributed by llmRequest. //! \details These blocks become reusable from next step. - void storeContextBlocks(SizeType32 seqSlotIdx, std::shared_ptr const& llmRequest); + void storeContextBlocks(std::shared_ptr const& llmRequest); [[nodiscard]] static SizeType32 getSinkBubbleLength(SizeType32 sinkTokenLen, SizeType32 tokensPerBlock); @@ -658,14 +734,13 @@ class KVCacheManager SizeType32 tokensPerBlock, SizeType32 maxBeamWidth, SizeType32 sinkTokenLen, bool useOneMoreBlock); private: - void setOffsets(kernels::KVCacheIndex* offsetsPtr, nvinfer1::Dims const& offsetsShape, SizeType32 seqSlotIdx, - SizeType32 beamIdx, SizeType32 blockIdx, KVCacheBlock::IdType blockId) const; + void setOffsets(kernels::KVCacheIndex* offsetsPtr, nvinfer1::Dims const& offsetsShape, SizeType32 beamIdx, + SizeType32 blockIdx, KVCacheBlock::IdType blockId) const; - void resetBlockOffsets(SizeType32 seqSlotIdx, SizeType32 beamWidth); - void cacheBlockOffsets(GenerationRequest const& seq, SizeType32 seqSlotIdx); - void cacheNewBlockOffsets(GenerationRequest const& seq, SizeType32 seqSlotIdx); - void updateNewBlockPointer(GenerationRequest const& seq, SizeType32 seqSlotIdx, SizeType32 blockIdx); - void updateToken(SizeType32 seqSlotIdx, bool addToken); + void cacheBlockOffsets(GenerationRequest& seq); + void cacheNewBlockOffsets(GenerationRequest& seq); + void updateNewBlockPointer(GenerationRequest& seq, SizeType32 blockIdx); + void updateToken(GenerationRequest& sequence, bool addToken); private: // Maximum number of sequences @@ -685,14 +760,13 @@ class KVCacheManager SizeType32 mSinkBlockTokenLength; // Block manager BlockManager mBlockManager; - // List of all sequences - std::vector mSequences; - // buffer for block indices for all managed sequences - runtime::ITensor::SharedPtr mSequenceBlockIndices; + // Map of all sequences + std::unordered_map mSequences; // Whether to cache KV pages for reuse bool mEnableBlockReuse; - // KV cache type (self or cross) - CacheType mCacheType; + // buffers for static tensors, will be created after allocating pools + runtime::ITensor::SharedPtr mBlockPoolPointers; + runtime::ITensor::SharedPtr mLayerToPoolMapping; }; } // namespace tensorrt_llm::batch_manager::kv_cache_manager diff --git a/cpp/include/tensorrt_llm/batch_manager/kvCacheUtils.h b/cpp/include/tensorrt_llm/batch_manager/kvCacheUtils.h index 81b91e24a..69ca1963b 100644 --- a/cpp/include/tensorrt_llm/batch_manager/kvCacheUtils.h +++ b/cpp/include/tensorrt_llm/batch_manager/kvCacheUtils.h @@ -65,6 +65,11 @@ class BlockIterator return ret; } + operator runtime::ITensor::SharedPtr() + { + return mCurrent; + } + [[nodiscard]] bool operator==(BlockIterator const& other) const { return mIdx == other.mIdx && mPool.get() == other.mPool.get(); @@ -91,9 +96,9 @@ class BlockIterator }; [[nodiscard]] BlockIterator getBlockBeginIt( - KVCacheManager const& cacheManager, LlmRequest const& request, SizeType32 beam); + KVCacheManager const& cacheManager, LlmRequest const& request, SizeType32 beam, SizeType32 poolIdx); [[nodiscard]] BlockIterator getBlockEndIt( - KVCacheManager const& cacheManager, LlmRequest const& request, SizeType32 beam); + KVCacheManager const& cacheManager, LlmRequest const& request, SizeType32 beam, SizeType32 poolIdx); } // namespace tensorrt_llm::batch_manager::kv_cache_manager diff --git a/cpp/include/tensorrt_llm/batch_manager/llmRequest.h b/cpp/include/tensorrt_llm/batch_manager/llmRequest.h index 0124592e8..475970b7b 100644 --- 
a/cpp/include/tensorrt_llm/batch_manager/llmRequest.h +++ b/cpp/include/tensorrt_llm/batch_manager/llmRequest.h @@ -26,6 +26,7 @@ #include "tensorrt_llm/runtime/samplingConfig.h" #include +#include #include #include #include @@ -39,24 +40,22 @@ namespace tensorrt_llm::batch_manager * @brief The state of the request. * * Enum order must follow chronological order for state dependency check, @see hasReachedState(). - * - * @todo(rkobus): refactor */ -enum LlmRequestState_t +enum class LlmRequestState : int32_t { - REQUEST_STATE_UNKNOWN = 0, ///< Unknown state - REQUEST_STATE_ENCODER_INIT = 1, ///< Encoder phase starts (for encoder-decoder models) - REQUEST_STATE_CONTEXT_INIT = 2, ///< Context phase starts - REQUEST_STATE_GENERATION_IN_PROGRESS = 3, ///< Generation phase is in progress - REQUEST_STATE_GENERATION_TO_COMPLETE = 4, ///< Generation phase is to be completed - REQUEST_STATE_GENERATION_COMPLETE = 5, ///< Generation phase completed - REQUEST_STATE_DISAGG_GENERATION_INIT = 6, ///< For disaggregated serving only: - /// new Generation request arrived at generation model - REQUEST_STATE_DISAGG_CONTEXT_TRANS_IN_PROGRESS = 7, ///< For disaggregated serving only: - /// Waiting context-only request transmitting the kv cache - REQUEST_STATE_DISAGG_CONTEXT_COMPLETE = 8, ///< Context-only request finished kv cache transmission. - REQUEST_STATE_DISAGG_GENERATION_TRANS_IN_PROGRESS - = 9, ///< For disaggregated serving only: transmitting the kv cache + kUNKNOWN = 0, ///< Unknown state + kENCODER_INIT = 1, ///< Encoder phase starts (for encoder-decoder models) + kCONTEXT_INIT = 2, ///< Context phase starts + kGENERATION_IN_PROGRESS = 3, ///< Generation phase is in progress + kGENERATION_TO_COMPLETE = 4, ///< Generation phase is to be completed + kGENERATION_COMPLETE = 5, ///< Generation phase completed + kDISAGG_GENERATION_INIT = 6, ///< For disaggregated serving only: + /// new Generation request arrived at generation model + kDISAGG_CONTEXT_TRANS_IN_PROGRESS = 7, ///< For disaggregated serving only: + /// Waiting context-only request transmitting the kv cache + kDISAGG_CONTEXT_COMPLETE = 8, ///< Context-only request finished kv cache transmission. 
+ kDISAGG_GENERATION_TRANS_IN_PROGRESS = 9, ///< For disaggregated serving only: transmitting the kv cache + kWAITING_TO_SEND_LOGITS = 10, ///< Generation phase completed, logits not sent yet }; enum LlmRequestType @@ -114,7 +113,7 @@ class GenericLlmRequest , mPromptLen(inputTokens->size()) , mMaxNewTokens(maxNewTokens) , mSamplingConfig(samplingConfig) - , mState(REQUEST_STATE_CONTEXT_INIT) + , mState(LlmRequestState::kCONTEXT_INIT) , mEndId(endId) , mPadId(padId) , mLogitsPostProcessor(logitsPostProcessor) @@ -134,8 +133,7 @@ class GenericLlmRequest , mLoraWeights(std::move(loraWeights)) , mLoraConfig(std::move(loraConfig)) , mLookaheadConfig(std::move(lookaheadConfig)) - , mContextChunkSize(std::nullopt) - , mContextCurrentPosition(0) + , mContextChunkSize{mPromptLen} , mLogProbs(samplingConfig.beamWidth) , mCumLogProbs(samplingConfig.beamWidth) , mDraftTokens(draftTokens.value_or(std::make_shared())) @@ -159,7 +157,7 @@ class GenericLlmRequest { if (mEncoderTokens.has_value() || encoderInputFeatures.has_value()) { - mState = REQUEST_STATE_ENCODER_INIT; + mState = LlmRequestState::kENCODER_INIT; } initialize(*inputTokens, returnLogProbs); @@ -170,7 +168,7 @@ class GenericLlmRequest , mPromptLen(req.getInputTokenIds().size()) , mMaxNewTokens(req.getMaxTokens()) , mSamplingConfig(req.getSamplingConfig(), req.getExternalDraftTokensConfig()) - , mState(REQUEST_STATE_CONTEXT_INIT) + , mState(LlmRequestState::kCONTEXT_INIT) , mEndId(req.getEndId()) , mPadId(req.getPadId()) , mClientId(req.getClientId()) @@ -188,8 +186,7 @@ class GenericLlmRequest , mLoraWeights(std::nullopt) , mLoraConfig(std::nullopt) , mLookaheadConfig(std::nullopt) - , mContextChunkSize(std::nullopt) - , mContextCurrentPosition(0) + , mContextChunkSize{mPromptLen} , mLogProbs(mSamplingConfig.beamWidth) , mCumLogProbs(mSamplingConfig.beamWidth) , mDraftTokens(std::make_shared()) @@ -212,7 +209,7 @@ class GenericLlmRequest { if (req.getRequestType() == executor::RequestType::REQUEST_TYPE_GENERATION_ONLY) { - mState = REQUEST_STATE_DISAGG_GENERATION_INIT; + mState = LlmRequestState::kDISAGG_GENERATION_INIT; } if (mIsStreaming && mSamplingConfig.beamWidth > 1 && !mReturnAllGeneratedTokens) { @@ -236,7 +233,7 @@ class GenericLlmRequest if (req.getEncoderInputTokenIds().has_value() || req.getEncoderInputFeatures().has_value()) { - mState = REQUEST_STATE_ENCODER_INIT; + mState = LlmRequestState::kENCODER_INIT; if (req.getEncoderInputTokenIds().has_value()) { mEncoderTokens = std::make_shared(req.getEncoderInputTokenIds().value()); @@ -394,6 +391,15 @@ class GenericLlmRequest mMaxNewTokens = maxNewTokens; } + if (mNumReturnSequences > 1 && mSamplingConfig.beamWidth > 1) + { + TLLM_THROW( + "Using mNumReturnSequences (%d) > 1 with beam search is currently disabled, since TensorRT-LLM returns " + "a total of mNumReturnSequences x beamWidth beams, rather than limiting the number of returned beams " + "to mNumReturnSequences. 
This restriction will be removed once the issue is resolved.", + mNumReturnSequences); + } + TLLM_CHECK_WITH_INFO(mSamplingConfig.validate(), "Incorrect sampling config"); // validate extra ids when enabling kv cache reuse with prompt table @@ -402,7 +408,8 @@ class GenericLlmRequest TLLM_CHECK_WITH_INFO(mInputTokenExtraIds.has_value() && mInputTokenExtraIds.value(), "Input token extra ids must be provided when enabling kv cache reuse with prompt table"); TLLM_CHECK_WITH_INFO(mInputTokenExtraIds.value()->size() == static_cast(mOrigPromptLen), - "inputTokenExtraIds vector size must be the same as input token vector size."); + "inputTokenExtraIds vector size (%lu) must be the same as input token vector size (%lu).", + mInputTokenExtraIds.value()->size(), static_cast(mOrigPromptLen)); } } @@ -413,7 +420,7 @@ class GenericLlmRequest /// @brief Get the params of the context /// @return The params of the context - std::optional const& getContextPhaseParams() const noexcept + [[nodiscard]] std::optional const& getContextPhaseParams() const noexcept { return mContextPhaseParams; } @@ -425,10 +432,10 @@ class GenericLlmRequest /// @brief Get the state params of the context /// @return The state params of the context - executor::ContextPhaseState const& getContextPhaseState() const + [[nodiscard]] executor::DataTransceiverState const& getDataTransceiverState() const { TLLM_CHECK(mContextPhaseParams.has_value()); - return *static_cast(mContextPhaseParams.value().getState()); + return *static_cast(mContextPhaseParams.value().getState()); } /// @brief Get total number of tokens for this req (prompt + generated) @@ -661,6 +668,11 @@ class GenericLlmRequest return mSequenceIndex > 0; } + [[nodiscard]] RequestIdType getParentRequestId() const + { + return mParentRequestId; + } + /// @brief Return a vector of the last-generated tokens of shape [num_beams] [[nodiscard]] VecTokens const& getLastTokens() { @@ -715,10 +727,10 @@ class GenericLlmRequest } // for enc-dec models, pause means saving generated tokens to prompt but need to re-do encoder phase - mState = mEncoderTokens.has_value() || mEncoderInputFeatures ? REQUEST_STATE_ENCODER_INIT - : REQUEST_STATE_CONTEXT_INIT; + mState = mEncoderTokens.has_value() || mEncoderInputFeatures ? LlmRequestState::kENCODER_INIT + : LlmRequestState::kCONTEXT_INIT; mContextCurrentPosition = 0; - mContextChunkSize = std::nullopt; + mContextChunkSize = mPromptLen; mSeqSlot.reset(); } @@ -860,9 +872,9 @@ class GenericLlmRequest return mOrigPromptLen; } - void setPrepopulatedPromptLen(SizeType32 prepopulatedPromptLen) + [[nodiscard]] SizeType32 getPromptLen() const { - mPrepopulatedPromptLen = prepopulatedPromptLen; + return mPromptLen; } [[nodiscard]] SizeType32 getPrepopulatedPromptLen() const @@ -870,6 +882,37 @@ class GenericLlmRequest return mPrepopulatedPromptLen; } + void setPrepopulatedPromptLen(SizeType32 prepopulatedPromptLen, SizeType32 kvTokensPerBlock) + { + auto const promptLen = getPromptLen(); + TLLM_CHECK(prepopulatedPromptLen < promptLen); + mPrepopulatedPromptLen = prepopulatedPromptLen; + + if (prepopulatedPromptLen > 0) + { + // Currently, the runtime process is to apply for cache first and then determine prepopulation. + // Use the prepopulated length to advance the context position and decrease chunk size if necessary. 
+ auto chunkSize = getContextChunkSize(); + if (prepopulatedPromptLen + chunkSize < promptLen) + { + // make sure to end at block boundary after current chunk + auto const flooredEndPosition + = (prepopulatedPromptLen + chunkSize) / kvTokensPerBlock * kvTokensPerBlock; + chunkSize = flooredEndPosition - prepopulatedPromptLen; + TLLM_CHECK(chunkSize <= getContextChunkSize()); + } + setContextCurrentPosition(prepopulatedPromptLen); + setContextChunkSize(chunkSize); + + if (!isLastContextChunk()) + { + TLLM_CHECK_WITH_INFO((getContextCurrentPosition() + getContextChunkSize()) % kvTokensPerBlock == 0, + "To prevent cache fragmentation, the context position after current chunk should be divisible " + "by the number of tokens per block, except for the last chunk."); + } + } + } + void setDraftTokens(std::shared_ptr const& draftTokens) { mDraftTokens = draftTokens; @@ -1100,44 +1143,49 @@ class GenericLlmRequest mGenerationLogitsFragments.clear(); } - [[nodiscard]] bool hasReachedState(LlmRequestState_t state) const noexcept + [[nodiscard]] bool hasReachedState(LlmRequestState state) const noexcept { return mState >= state; } [[nodiscard]] bool isEncoderInitState() const noexcept { - return mState == REQUEST_STATE_ENCODER_INIT; + return mState == LlmRequestState::kENCODER_INIT; } [[nodiscard]] bool isContextInitState() const noexcept { - return mState == REQUEST_STATE_CONTEXT_INIT; + return mState == LlmRequestState::kCONTEXT_INIT; } [[nodiscard]] bool isGenerationInProgressState() const noexcept { - return mState == REQUEST_STATE_GENERATION_IN_PROGRESS || mState == REQUEST_STATE_GENERATION_TO_COMPLETE; + return mState == LlmRequestState::kGENERATION_IN_PROGRESS || mState == LlmRequestState::kGENERATION_TO_COMPLETE; } [[nodiscard]] bool isGenerationCompleteState() const noexcept { - return mState == REQUEST_STATE_GENERATION_COMPLETE; + return mState == LlmRequestState::kGENERATION_COMPLETE; } [[nodiscard]] bool isDisaggGenerationInitState() const noexcept { - return mState == REQUEST_STATE_DISAGG_GENERATION_INIT; + return mState == LlmRequestState::kDISAGG_GENERATION_INIT; } [[nodiscard]] bool isDisaggContextTransmissionState() const noexcept { - return mState == REQUEST_STATE_DISAGG_CONTEXT_TRANS_IN_PROGRESS; + return mState == LlmRequestState::kDISAGG_CONTEXT_TRANS_IN_PROGRESS; } [[nodiscard]] bool isDisaggContextCompleteState() const noexcept { - return mState == REQUEST_STATE_DISAGG_CONTEXT_COMPLETE; + return mState == LlmRequestState::kDISAGG_CONTEXT_COMPLETE; + } + + [[nodiscard]] bool isCompleteWaitingToSendLogits() const noexcept + { + return mState == LlmRequestState::kWAITING_TO_SEND_LOGITS; } /// To determine whether the context is unchunked. When a context is chunked into only a part, it @@ -1152,6 +1200,11 @@ class GenericLlmRequest return mLlmRequestType == LlmRequestType::LLMREQUEST_TYPE_CONTEXT_ONLY; } + [[nodiscard]] bool isGenerationOnlyRequest() const noexcept + { + return mLlmRequestType == LlmRequestType::LLMREQUEST_TYPE_GENERATION_ONLY; + } + void setContextCurrentPosition(SizeType32 contextCurrentPosition) { mContextCurrentPosition = contextCurrentPosition; @@ -1170,12 +1223,11 @@ class GenericLlmRequest return mPromptLen - getContextCurrentPosition(); } - /// To retrieve the context chunk size, throw an exception when the context is not chunked. 
[[nodiscard]] SizeType32 getContextChunkSize() const { - TLLM_CHECK_WITH_INFO( - isContextInitState() && mContextChunkSize, "The current request is not in context chunking state."); - return mContextChunkSize.value(); + TLLM_CHECK_WITH_INFO(isContextInitState() || isDisaggGenerationInitState(), + "getContextChunkSize is only possible during the context phase."); + return mContextChunkSize; } /// To set the context chunk size, throw an exception when the chunk size is negative. If the chunk @@ -1183,45 +1235,34 @@ class GenericLlmRequest /// remaining length. void setContextChunkSize(SizeType32 size) { - TLLM_CHECK_WITH_INFO(isContextInitState(), "Chunking is only possible during the context phase."); + TLLM_CHECK_WITH_INFO(isContextInitState(), "setContextChunkSize is only possible during the context phase."); TLLM_CHECK_WITH_INFO(size >= 0, "The chunk size of context (%d) can't be negative.", size); mContextChunkSize = std::min(size, getContextRemainingLength()); } /// Determines whether the current position is only one chunk away from the end of the context. - /// It will return true when the context is not chunked. [[nodiscard]] bool isLastContextChunk() const noexcept { - return isFullContextRequest() - || (isContextInitState() && getContextCurrentPosition() + getContextChunkSize() == mPromptLen); + return isDisaggGenerationInitState() || getContextCurrentPosition() + getContextChunkSize() == mPromptLen; } - /// Returns whether the position is at the beginning of the context. It will return true when the - /// context is not chunked. + /// Returns whether the position is at the beginning of the context. [[nodiscard]] bool isFirstContextChunk() const noexcept { - return isFullContextRequest() || getContextCurrentPosition() == 0; - } - - [[nodiscard]] executor::PriorityType priority() const noexcept - { - return mPriority; + return getContextCurrentPosition() == 0; } /// Move the cursor forward one chunk. When not chunked, move forward to the end of the context. void moveToNextContextChunk() { TLLM_CHECK_WITH_INFO(isContextInitState(), "Chunking is only possible during the context phase."); - if (mContextChunkSize) - { - mContextCurrentPosition += getContextChunkSize(); - setContextChunkSize(0); - } - else - { - TLLM_CHECK_WITH_INFO(mContextCurrentPosition == 0, "Full context out of bounds."); - mContextCurrentPosition = mPromptLen; - } + mContextCurrentPosition += getContextChunkSize(); + setContextChunkSize(0); + } + + [[nodiscard]] executor::PriorityType priority() const noexcept + { + return mPriority; } /// Increment the counter of decoding iterations. 
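// Editorial sketch (not part of the patch): driving chunked context with the accessors above.
// llmRequest is assumed to point to a request in context init state; the chunk size of 128 and
// block size of 64 are illustrative values chosen so every chunk but the last ends on a KV cache
// block boundary.
while (llmRequest->getContextRemainingLength() > 0)
{
    llmRequest->setContextChunkSize(128); // clamped internally to the remaining context length
    // ... run one context step over [getContextCurrentPosition(),
    //      getContextCurrentPosition() + getContextChunkSize()) ...
    llmRequest->moveToNextContextChunk();  // advance the position and reset the chunk size to 0
}
// With KV cache reuse, setPrepopulatedPromptLen(80, /*kvTokensPerBlock=*/64) on a 300-token prompt
// floors the first chunk to a block boundary: (80 + 128) / 64 * 64 = 192, so the chunk shrinks
// from 128 to 112 and the following chunk starts block-aligned.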
@@ -1241,20 +1282,24 @@ class GenericLlmRequest return static_cast(getMaxNumGeneratedTokens()) / mDecodingIter; } + [[nodiscard]] bool isFinished() const noexcept + { + return isGenerationCompleteState() || isDisaggContextTransmissionState() || isCompleteWaitingToSendLogits(); + } + /// @brief Create a Response from the current state of the request /// @return An optional Response - std::optional createResponse() + std::optional createResponse(bool useFastLogits = false, int32_t mpiWorldRank = 0) { TLLM_CHECK(!isDisaggContextCompleteState()); - if (isGenerationCompleteState() || (mIsStreaming && isGenerationInProgressState()) - || isDisaggContextTransmissionState()) + if (isFinished() || (mIsStreaming && mState == LlmRequestState::kGENERATION_IN_PROGRESS)) { TLLM_LOG_DEBUG("Creating response for request %lu", mRequestId); executor::Result result; result.sequenceIndex = mSequenceIndex; - result.isSequenceFinal = isGenerationCompleteState() || isDisaggContextTransmissionState(); + result.isSequenceFinal = isFinished(); mSequenceFinalVec->at(mSequenceIndex) = result.isSequenceFinal; result.isFinal = std::all_of(mSequenceFinalVec->begin(), mSequenceFinalVec->end(), @@ -1273,7 +1318,7 @@ class GenericLlmRequest } // TODO: fill the rank ids result.contextPhaseParams = executor::ContextPhaseParams{ - std::move(firstGenTokens), mContextPhaseParams.value().releaseState()}; + std::move(firstGenTokens), mRequestId, mContextPhaseParams.value().releaseState()}; } auto const calculateNbTokensOut = [this](SizeType32 maxNbTokens) @@ -1292,8 +1337,7 @@ class GenericLlmRequest auto const startTokenPos = maxNbTokens - maxNbTokensOut; - auto const shouldSendResponse = isGenerationCompleteState() - || (mIsStreaming && maxNbTokens > getMaxSentTokenLen()) || isDisaggContextTransmissionState(); + auto const shouldSendResponse = isFinished() || (mIsStreaming && maxNbTokens > getMaxSentTokenLen()); if (!shouldSendResponse) { @@ -1333,6 +1377,11 @@ class GenericLlmRequest = runtime::ITensor::slice(getGenerationLogitsHost(), startGenTokenPos, maxNbTokensOut); result.generationLogits = executor::detail::ofITensor(generationLogitsHostCurrentStep); } + else if (useFastLogits) + { + result.specDecFastLogitsInfo + = executor::SpeculativeDecodingFastLogitsInfo{mRequestId, mpiWorldRank}; + } else { result.generationLogits = executor::detail::ofITensor(getGenerationLogitsHost()); @@ -1351,7 +1400,7 @@ class GenericLlmRequest setMaxSentTokenLen(maxNbTokens); auto requestId = isChild() ? 
mParentRequestId : mRequestId; - auto response = executor::Response(requestId, std::move(result)); + auto response = executor::Response(requestId, std::move(result), mClientId); return response; } @@ -1372,12 +1421,29 @@ class GenericLlmRequest mDecodingIter = iter; } + void setKvCacheTransferStart(std::chrono::time_point const& time) + { + mKvCacheTransferStart = time; + } + + void setKvCacheTransferEnd(std::chrono::time_point const& time) + { + mKvCacheTransferEnd = time; + } + + [[nodiscard]] double getKvCacheTransferTimeMS() const + { + // get max with 0 in case this function is called while end time is not recorded + return std::max( + 0.0, std::chrono::duration(mKvCacheTransferEnd - mKvCacheTransferStart).count()); + } + RequestIdType mRequestId; SizeType32 mPromptLen; SizeType32 mMaxNewTokens; // Tokens [beam_size, mPromptLen + getMaxNumGeneratedTokens()] runtime::SamplingConfig mSamplingConfig; - LlmRequestState_t mState; + LlmRequestState mState; std::optional mEndId; std::optional mPadId; std::optional mSeqSlot; @@ -1425,8 +1491,8 @@ class GenericLlmRequest // To enable chunked context, the FHMA paged kv-cache also needs to be enabled. Except for the last one, // the size of the context chunk needs to be an integer multiple of the kv-cache block size. The meaning // of null value is that the context is not chunked. - std::optional mContextChunkSize; - SizeType32 mContextCurrentPosition; + SizeType32 mContextChunkSize{0}; + SizeType32 mContextCurrentPosition{0}; std::vector mLogProbs; // [beamSize, seqLen] VecLogProbs mCumLogProbs; // [beamSize] @@ -1476,6 +1542,9 @@ class GenericLlmRequest RequestIdType mParentRequestId; std::shared_ptr> mSequenceFinalVec; // Indicators whether each sibling completes generation. + std::chrono::time_point mKvCacheTransferStart; + std::chrono::time_point mKvCacheTransferEnd; + private: void initialize(VecTokens const& inputTokens, bool outputLogProbs) { @@ -1490,8 +1559,8 @@ class GenericLlmRequest { if (mInputTokenExtraIds.value()->size() != inputTokens.size()) { - std::string errStr = "inputTokenExtraIds vector size must be the same as input token vector size."; - TLLM_THROW(errStr); + TLLM_THROW("inputTokenExtraIds vector size (%lu) must be the same as input token vector size (%lu).", + mInputTokenExtraIds.value()->size(), inputTokens.size()); } VecTokenExtraIds tokenExtraIds = *mInputTokenExtraIds.value(); for (std::size_t i = 0; i < inputTokens.size(); ++i) @@ -1575,6 +1644,8 @@ class GenericLlmRequest class LlmRequest : public GenericLlmRequest { + friend class LlmRequestBindings; + public: using Base = GenericLlmRequest; using TensorPtr = Base::TensorPtr; diff --git a/cpp/include/tensorrt_llm/batch_manager/microBatchScheduler.h b/cpp/include/tensorrt_llm/batch_manager/microBatchScheduler.h new file mode 100644 index 000000000..2e932ba23 --- /dev/null +++ b/cpp/include/tensorrt_llm/batch_manager/microBatchScheduler.h @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "common.h" +#include "tensorrt_llm/batch_manager/llmRequest.h" +#include "tensorrt_llm/common/algorithm.h" +#include "tensorrt_llm/runtime/common.h" + +namespace tensorrt_llm::batch_manager +{ + +namespace batch_scheduler +{ + +struct ContextChunkingConfig +{ + ContextChunkingConfig() = default; + + executor::ContextChunkingPolicy chunkingPolicy; + /// The minimum size, also known as the chunk unit size. It generally + /// needs to be equal to the size of the kv cache block or its integer + /// multiples (except for the last context chunk) to avoid fragmentation. + /// When set to null, it indicates that the context chunk is disabled. + tensorrt_llm::runtime::SizeType32 chunkUnitSize; +}; + +} // namespace batch_scheduler + +/// @brief This scheduler takes into account the desired batch size and limits of the TRT engine to schedule requests. +class MicroBatchScheduler : Algorithm +{ +public: + constexpr static auto name{"MicroBatchScheduler"}; + + using SizeType32 = tensorrt_llm::runtime::SizeType32; + using ContextChunkingPolicy = tensorrt_llm::executor::ContextChunkingPolicy; + + MicroBatchScheduler() = default; + + explicit MicroBatchScheduler(SizeType32 maxBatchSize, std::optional maxNumTokens = std::nullopt, + std::optional ctxChunkConfig = std::nullopt, + std::optional maxContextLength = std::nullopt, + LlmRequestState noScheduleUntilState = LlmRequestState::kCONTEXT_INIT, + LlmRequestState noScheduleAfterState = LlmRequestState::kGENERATION_COMPLETE); + + static MicroBatchScheduler make(SizeType32 maxBatchSize, std::optional maxNumTokens = std::nullopt, + std::optional ctxChunkConfig = std::nullopt, + std::optional maxContextLength = std::nullopt, + LlmRequestState noScheduleUntilState = LlmRequestState::kCONTEXT_INIT, + LlmRequestState noScheduleAfterState = LlmRequestState::kGENERATION_COMPLETE) + { + return MicroBatchScheduler{ + maxBatchSize, maxNumTokens, ctxChunkConfig, maxContextLength, noScheduleUntilState, noScheduleAfterState}; + } + + std::tuple operator()( + RequestVector const& activeRequests, ReqIdsSet const& inflightReqIds); + + static void setCtxRequestsChunkSize(RequestVector const& contextsToBeChunked, ContextChunkingPolicy ctxChunkPolicy, + std::optional ctxTokensCapacity, SizeType32 chunkUnitSize, + std::optional const& maxContextLength); + +private: + template + static void setCtxRequestsChunkSize(RequestVector const& contextsToBeChunked, + std::optional ctxTokensCapacity, SizeType32 chunkUnitSize, + std::optional const& maxContextLength); + + /// After the chunk sizes have been determined, this function will discard + /// any draft tokens that don't fit. + static void fitDraftTokens(RequestVector const& contextsToBeChunked, std::optional ctxTokensCapacity, + SizeType32 chunkUnitSize, std::optional const& maxContextLength); + + /// The maximum number of requests returned by scheduleRequests + SizeType32 mMaxBatchSize; + + /// The maximum number of tokens to include in a batch + std::optional mMaxNumTokens; + + /// The maximum length of the context. If the context exceeds this length, + /// it must be chunked, otherwise it cannot be processed. Therefore, it + /// needs to be set together with the chunk unit size to make sense. + /// When set to null, it indicates that context length is unlimited. 
+ std::optional mMaxContextLength; + + std::optional mCtxChunkConfig; + + /// The state until/after which the scheduler should not schedule requests + LlmRequestState mNoScheduleUntilState; + LlmRequestState mNoScheduleAfterState; +}; + +} // namespace tensorrt_llm::batch_manager diff --git a/cpp/include/tensorrt_llm/batch_manager/peftCacheManager.h b/cpp/include/tensorrt_llm/batch_manager/peftCacheManager.h index 65808134b..f86e76b4b 100644 --- a/cpp/include/tensorrt_llm/batch_manager/peftCacheManager.h +++ b/cpp/include/tensorrt_llm/batch_manager/peftCacheManager.h @@ -51,6 +51,8 @@ class PeftTaskNotCachedException : public runtime::LoraExpectedException class BasePeftCacheManager { public: + friend class BasePeftCacheManagerBindings; + using LlmRequestPtr = std::shared_ptr; using RequestVector = std::vector; using PeftTable = std::map>>; diff --git a/cpp/include/tensorrt_llm/batch_manager/trtGptModelOptionalParams.h b/cpp/include/tensorrt_llm/batch_manager/trtGptModelOptionalParams.h index fc61fd581..4a430d8c1 100644 --- a/cpp/include/tensorrt_llm/batch_manager/trtGptModelOptionalParams.h +++ b/cpp/include/tensorrt_llm/batch_manager/trtGptModelOptionalParams.h @@ -46,7 +46,9 @@ class TrtGptModelOptionalParams executor::SchedulerConfig const& schedulerConfig = executor::SchedulerConfig{}, executor::ExtendedRuntimePerfKnobConfig const& extendedRuntimePerfKnobConfig = executor::ExtendedRuntimePerfKnobConfig{}, - std::optional debugConfig = std::nullopt, uint64_t maxSeqIdleMicroseconds = 180000000) + std::optional debugConfig = std::nullopt, uint64_t maxSeqIdleMicroseconds = 180000000, + std::optional specDecConfig = std::nullopt, + bool isLeaderInOrchMode = false) : kvCacheConfig{kvCacheConfig} , enableTrtOverlap{enableTrtOverlap} , deviceIds(deviceIds) @@ -62,10 +64,12 @@ class TrtGptModelOptionalParams , extendedRuntimePerfKnobConfig(extendedRuntimePerfKnobConfig) , debugConfig{std::move(debugConfig)} , maxSeqIdleMicroseconds{maxSeqIdleMicroseconds} + , speculativeDecodingConfig{std::move(specDecConfig)} + , isLeaderInOrchMode{isLeaderInOrchMode} { } - explicit TrtGptModelOptionalParams(executor::ExecutorConfig const& executorConfig) + explicit TrtGptModelOptionalParams(executor::ExecutorConfig const& executorConfig, bool isLeaderInOrchMode) : TrtGptModelOptionalParams(KvCacheConfig(executorConfig.getKvCacheConfig()), false, executorConfig.getParallelConfig().value_or(executor::ParallelConfig()).getDeviceIds(), executorConfig.getNormalizeLogProbs(), executorConfig.getEnableChunkedContext(), @@ -74,16 +78,7 @@ class TrtGptModelOptionalParams executorConfig.getGpuWeightsPercent(), executorConfig.getMaxBeamWidth(), executorConfig.getMaxBatchSize(), executorConfig.getMaxNumTokens(), executorConfig.getSchedulerConfig(), executorConfig.getExtendedRuntimePerfKnobConfig(), executorConfig.getDebugConfig(), - executorConfig.getMaxSeqIdleMicroseconds()) - { - } - - // Copy constructor - TrtGptModelOptionalParams(TrtGptModelOptionalParams const& other) - : TrtGptModelOptionalParams(other.kvCacheConfig, other.enableTrtOverlap, other.deviceIds, - other.normalizeLogProbs, other.enableChunkedContext, other.peftCacheManagerConfig, other.decodingConfig, - other.gpuWeightsPercent, other.maxBeamWidth, other.maxBatchSize, other.maxNumTokens, other.schedulerConfig, - other.extendedRuntimePerfKnobConfig, other.debugConfig, other.maxSeqIdleMicroseconds) + executorConfig.getMaxSeqIdleMicroseconds(), executorConfig.getSpecDecConfig(), isLeaderInOrchMode) { } @@ -103,6 +98,8 @@ class TrtGptModelOptionalParams && 
extendedRuntimePerfKnobConfig == other.extendedRuntimePerfKnobConfig // && debugConfig == other.debugConfig // && maxSeqIdleMicroseconds == other.maxSeqIdleMicroseconds // + && speculativeDecodingConfig == other.speculativeDecodingConfig // + && isLeaderInOrchMode == other.isLeaderInOrchMode // ; } @@ -126,6 +123,9 @@ class TrtGptModelOptionalParams std::optional debugConfig; // Sequence is considered idle if not updated for this amount of time. uint64_t maxSeqIdleMicroseconds; + std::optional speculativeDecodingConfig; + // This rank is the leader worker in orchestrator mode + bool isLeaderInOrchMode; }; } // namespace tensorrt_llm::batch_manager diff --git a/cpp/include/tensorrt_llm/common/algorithm.h b/cpp/include/tensorrt_llm/common/algorithm.h new file mode 100644 index 000000000..9363504f7 --- /dev/null +++ b/cpp/include/tensorrt_llm/common/algorithm.h @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +namespace tensorrt_llm +{ + +// Base class for algorithms +struct Algorithm +{ + Algorithm() = default; + Algorithm(Algorithm&&) = default; + Algorithm& operator=(Algorithm&&) = default; + Algorithm(Algorithm const&) = delete; + Algorithm& operator=(Algorithm const&) = delete; +}; + +} // namespace tensorrt_llm diff --git a/cpp/include/tensorrt_llm/common/cudaUtils.h b/cpp/include/tensorrt_llm/common/cudaUtils.h index 71657c0bb..023f97d87 100644 --- a/cpp/include/tensorrt_llm/common/cudaUtils.h +++ b/cpp/include/tensorrt_llm/common/cudaUtils.h @@ -161,7 +161,7 @@ inline std::optional isCudaLaunchBlocking() return result; } -inline void syncAndCheck(char const* const file, int const line) +inline bool doCheckError() { auto const cudaLaunchBlocking = isCudaLaunchBlocking(); #ifndef NDEBUG @@ -170,7 +170,12 @@ inline void syncAndCheck(char const* const file, int const line) bool const checkError = cudaLaunchBlocking.value_or(false); #endif - if (checkError) + return checkError; +} + +inline void syncAndCheck(char const* const file, int const line) +{ + if (doCheckError()) { check(cudaGetLastError(), "cudaGetLastError", file, line); check(cudaDeviceSynchronize(), "cudaDeviceSynchronize", file, line); diff --git a/cpp/include/tensorrt_llm/common/mpiUtils.h b/cpp/include/tensorrt_llm/common/mpiUtils.h index edf3da004..d5801f36c 100644 --- a/cpp/include/tensorrt_llm/common/mpiUtils.h +++ b/cpp/include/tensorrt_llm/common/mpiUtils.h @@ -99,7 +99,6 @@ struct MpiTypeConverter }; template <> - struct MpiTypeConverter { @@ -380,9 +379,14 @@ class MpiComm void allreduce(void const* sendbuf, void* recvbuf, int count, MpiType dtype, MpiOp op) const; void allgather(void const* sendbuf, void* recvbuf, int count, MpiType dtype) const; + + void allgatherv(void const* sendbuf, int sendcount, MpiType sendtype, void* recvbuf, + std::vector const& recvcounts, std::vector const& displs, MpiType recvtype) const; + void barrier() const; void mprobe(int source, int tag, MPI_Message* msg, MPI_Status* 
status) const; + bool improbe(int source, int tag, MPI_Message* msg, MPI_Status* status) const; //! \brief Returns if a message with the specified source and tag is available bool iprobe(int source, int tag, MPI_Status* status) const; diff --git a/cpp/include/tensorrt_llm/executor/executor.h b/cpp/include/tensorrt_llm/executor/executor.h index a96c24d43..e6e5e1e0e 100644 --- a/cpp/include/tensorrt_llm/executor/executor.h +++ b/cpp/include/tensorrt_llm/executor/executor.h @@ -43,7 +43,7 @@ char const* version() noexcept; class Model; class Serialization; -class ContextPhaseState; +class DataTransceiverState; /// @brief Sampling configuration class SamplingConfig @@ -186,11 +186,13 @@ class ExternalDraftTokensConfig { public: explicit ExternalDraftTokensConfig(VecTokens tokens, std::optional logits = std::nullopt, - std::optional const& acceptanceThreshold = std::nullopt); + std::optional const& acceptanceThreshold = std::nullopt, + std::optional const& fastLogits = std::nullopt); [[nodiscard]] VecTokens getTokens() const; [[nodiscard]] std::optional getLogits() const; [[nodiscard]] std::optional getAcceptanceThreshold() const; + [[nodiscard]] std::optional getFastLogits() const; private: friend class Serialization; @@ -200,6 +202,8 @@ class ExternalDraftTokensConfig std::optional mLogits; /// @brief The acceptance threshold. Must be > 0.f and <= 1.f std::optional mAcceptanceThreshold; + /// @brief Use direct transfer for draft logits + std::optional mFastLogits; }; /// @brief Configuration for prompt tuning @@ -283,8 +287,10 @@ struct LookaheadDecodingConfig class ContextPhaseParams { public: - explicit ContextPhaseParams(VecTokens firstGenTokens); - ContextPhaseParams(VecTokens firstGenTokens, void* state); + using RequestIdType = std::uint64_t; + + explicit ContextPhaseParams(VecTokens firstGenTokens, RequestIdType reqId); + ContextPhaseParams(VecTokens firstGenTokens, RequestIdType reqId, void* state); ContextPhaseParams(ContextPhaseParams const&); ContextPhaseParams(ContextPhaseParams&&); @@ -295,6 +301,8 @@ class ContextPhaseParams [[nodiscard]] VecTokens const& getFirstGenTokens() const& noexcept; [[nodiscard]] VecTokens popFirstGenTokens() && noexcept; + [[nodiscard]] RequestIdType getReqId() const noexcept; + [[nodiscard]] void const* getState() const noexcept; [[nodiscard]] void* getState() noexcept; [[nodiscard]] void* releaseState() noexcept; @@ -304,6 +312,9 @@ class ContextPhaseParams static void deleter(void const* data); using StatePtr = std::unique_ptr; + /// @brief This request corresponds to the request ID in the context phase. + RequestIdType mReqId{0}; + /// @brief The first tokens generated by context executor VecTokens mFirstGenTokens; @@ -311,6 +322,18 @@ class ContextPhaseParams StatePtr mState{nullptr, deleter}; }; +/// @brief Configuration for speculative decoding (both draft and target models) +class SpeculativeDecodingConfig +{ +public: + explicit SpeculativeDecodingConfig(bool fastLogits); + + bool operator==(SpeculativeDecodingConfig const& other) const; + + /// @brief Send logits tensor directly from draft to target model. 
+ bool fastLogits; +}; + /// @brief A class that holds information about the request class Request { @@ -430,6 +453,16 @@ class Request std::unique_ptr mImpl; }; +/// @brief Struct that holds the logits information when using direct transfer +struct SpeculativeDecodingFastLogitsInfo +{ + /// @brief Draft request id + uint64_t draftRequestId; + + /// @brief MPI world rank of the draft model leader + int32_t draftParticipantId; +}; + /// @brief Struct that holds the generation result struct Result { @@ -448,11 +481,14 @@ struct Result /// @brief The context logits. Size [promptLen, vocabSizePadded] std::optional contextLogits; - /// @brief The context logits. Size [beamSize, maxNewTokens, vocabSizePadded] (non-streaming) + /// @brief The generation logits. Size [beamSize, maxNewTokens, vocabSizePadded] (non-streaming) /// or [maxNewTokens, beamSize, vocabSizePadded] (streaming and allGeneratedTokens) /// or [1, beamSize, vocabSizePadded] (streaming and non-allGeneratedTokens) std::optional generationLogits; + /// @brief Logits information for direct transfer when using fast logits + std::optional specDecFastLogitsInfo; + /// @brief The encoder output. Size [encoderLen, hiddenSize] std::optional encoderOutput; @@ -477,8 +513,8 @@ struct Result class Response { public: - Response(IdType requestId, std::string errorMsg); - Response(IdType requestId, Result Result); + Response(IdType requestId, std::string errorMsg, std::optional clientId = std::nullopt); + Response(IdType requestId, Result Result, std::optional clientId = std::nullopt); ~Response(); Response(Response const& other); @@ -489,6 +525,9 @@ class Response /// @brief Get the id of the request for which this response was generated [[nodiscard]] IdType getRequestId() const; + /// @brief Get the client id of the request for which this response was generated + [[nodiscard]] std::optional getClientId() const; + /// @brief Indicates if this response has an error or not [[nodiscard]] bool hasError() const; @@ -538,13 +577,15 @@ class KvCacheConfig std::optional> const& maxAttentionWindowVec = std::nullopt, std::optional const& sinkTokenLength = std::nullopt, std::optional const& freeGpuMemoryFraction = std::nullopt, - std::optional const& hostCacheSize = std::nullopt, bool onboardBlocks = true); + std::optional const& hostCacheSize = std::nullopt, bool onboardBlocks = true, + std::optional const& crossKvCacheFraction = std::nullopt); [[nodiscard]] bool getEnableBlockReuse() const; [[nodiscard]] std::optional getMaxTokens() const; [[nodiscard]] std::optional> getMaxAttentionWindowVec() const; [[nodiscard]] std::optional getSinkTokenLength() const; [[nodiscard]] std::optional getFreeGpuMemoryFraction() const; + [[nodiscard]] std::optional getCrossKvCacheFraction() const; [[nodiscard]] std::optional getHostCacheSize() const; [[nodiscard]] bool getOnboardBlocks() const; @@ -553,6 +594,7 @@ class KvCacheConfig void setMaxAttentionWindowVec(std::vector maxAttentionWindowVec); void setSinkTokenLength(SizeType32 sinkTokenLength); void setFreeGpuMemoryFraction(FloatType freeGpuMemoryFraction); + void setCrossKvCacheFraction(FloatType crossKvCacheFraction); void setHostCacheSize(size_t hostCacheSize); void setOnboardBlocks(bool onboardBlocks); @@ -581,6 +623,12 @@ class KvCacheConfig /// allocated. std::optional mFreeGpuMemoryFraction; + /// @brief The fraction of the KV Cache memory should be reserved for cross attention + /// If set to p, self attention will use 1-p of KV Cache memory and cross attention + /// will use p of KV Cache memory. 
Default is 50%. + /// Should only be set when using an encoder-decoder model. + std::optional<FloatType> mCrossKvCacheFraction; + /// @brief Size of secondary memory pool in bytes. Default is 0. /// Having a secondary memory pool increases KV cache block reuse potential. std::optional<size_t> mHostCacheSize; @@ -593,18 +641,24 @@ class ExtendedRuntimePerfKnobConfig { public: - explicit ExtendedRuntimePerfKnobConfig(bool multiBlockMode = true, bool enableContextFMHAFP32Acc = false); + explicit ExtendedRuntimePerfKnobConfig(bool multiBlockMode = true, bool enableContextFMHAFP32Acc = false, + bool cudaGraphMode = false, SizeType32 cudaGraphCacheSize = 0); bool operator==(ExtendedRuntimePerfKnobConfig const& other) const { - return mMultiBlockMode == other.mMultiBlockMode && mEnableContextFMHAFP32Acc == other.mEnableContextFMHAFP32Acc; + return mMultiBlockMode == other.mMultiBlockMode && mEnableContextFMHAFP32Acc == other.mEnableContextFMHAFP32Acc + && mCudaGraphMode == other.mCudaGraphMode && mCudaGraphCacheSize == other.mCudaGraphCacheSize; } [[nodiscard]] bool getMultiBlockMode() const; [[nodiscard]] bool getEnableContextFMHAFP32Acc() const; + [[nodiscard]] bool getCudaGraphMode() const; + [[nodiscard]] SizeType32 getCudaGraphCacheSize() const; void setMultiBlockMode(bool multiBlockMode); void setEnableContextFMHAFP32Acc(bool enableContextFMHAFP32Acc); + void setCudaGraphMode(bool cudaGraphMode); + void setCudaGraphCacheSize(SizeType32 cacheSize); private: friend class Serialization; @@ -614,6 +668,13 @@ class ExtendedRuntimePerfKnobConfig /// @brief If enable FMHA runner FP32 accumulation. bool mEnableContextFMHAFP32Acc; + + /// @brief Whether to enable CUDA graphs at runtime. + bool mCudaGraphMode; + + /// @brief Number of CUDA graphs to cache in the runtime. + /// A larger cache improves performance but consumes more GPU memory. + SizeType32 mCudaGraphCacheSize; }; /// @brief Configuration class for debugging output @@ -622,27 +683,33 @@ class DebugConfig using StringVec = std::vector<std::string>; public: - explicit DebugConfig(bool dumpInputTensors = false, bool dumpOuputTensors = false, StringVec debugTensorNames = {}); + explicit DebugConfig(bool debugInputTensors = false, bool debugOutputTensors = false, + StringVec debugTensorNames = {}, SizeType32 debugTensorsMaxIterations = 0); bool operator==(DebugConfig const& other) const; - [[nodiscard]] bool getDumpInputTensors() const; - [[nodiscard]] bool getDumpOutputTensors() const; + [[nodiscard]] bool getDebugInputTensors() const; + [[nodiscard]] bool getDebugOutputTensors() const; [[nodiscard]] StringVec const& getDebugTensorNames() const; + [[nodiscard]] SizeType32 getDebugTensorsMaxIterations() const; - void setDumpInputTensors(bool dumpInputTensors); - void setDumpOuputTensors(bool dumpOuputTensors); + void setDebugInputTensors(bool debugInputTensors); + void setDebugOutputTensors(bool debugOutputTensors); void setDebugTensorNames(StringVec const& debugTensorNames); + void setDebugTensorsMaxIterations(SizeType32 debugTensorsMaxIterations); private: friend class Serialization; - /// @brief If true, dump all input tensors. - bool mDumpInputTensors; - /// @brief If true, dump all output tensors. - bool mDumpOuputTensors; - /// @brief If not empty, only dump tensors in this list. + /// @brief If true, debug all input tensors. + bool mDebugInputTensors; + /// @brief If true, debug all output tensors. + bool mDebugOutputTensors; + /// @brief If not empty, only debug tensors in this list.
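For illustration, a sketch of how the new CUDA-graph knobs and the iteration-scoped debug tensors might be wired together. Only setters declared in this hunk are used; ExecutorConfig::setExtendedRuntimePerfKnobConfig and the default constructors are assumed from the existing API.

using namespace tensorrt_llm::executor;

ExtendedRuntimePerfKnobConfig perfKnobs;
perfKnobs.setCudaGraphMode(true);   // capture generation steps into CUDA graphs
perfKnobs.setCudaGraphCacheSize(4); // cache up to 4 graphs: faster replay, more GPU memory

DebugConfig debugConfig;
debugConfig.setDebugOutputTensors(true);
debugConfig.setDebugTensorNames({"logits"}); // restrict debugging to named tensors
debugConfig.setDebugTensorsMaxIterations(3); // keep tensors for the 3 most recent iterations

ExecutorConfig executorConfig;
executorConfig.setExtendedRuntimePerfKnobConfig(perfKnobs); // assumed setter, not shown in this hunk
executorConfig.setDebugConfig(debugConfig);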
StringVec mDebugTensorNames; + /// @brief If > 0, provide debug tensors for at most debugTensorsMaxIterations past iterations, + /// else dump them to files. + SizeType32 mDebugTensorsMaxIterations; }; SizeType32 const kDefaultIterStatsMaxIterations = 1000; @@ -847,7 +914,8 @@ class ExecutorConfig std::optional maxQueueSize = std::nullopt, ExtendedRuntimePerfKnobConfig const& extendedRuntimePerfKnobConfig = ExtendedRuntimePerfKnobConfig(), std::optional debugConfig = std::nullopt, SizeType32 recvPollPeriodMs = 0, - uint64_t maxSeqIdleMicroseconds = 180000000); + uint64_t maxSeqIdleMicroseconds = 180000000, + std::optional specDecConfig = std::nullopt); [[nodiscard]] SizeType32 getMaxBeamWidth() const; [[nodiscard]] SchedulerConfig getSchedulerConfig() const; @@ -869,6 +937,7 @@ class ExecutorConfig [[nodiscard]] std::optional getDebugConfig() const; [[nodiscard]] SizeType32 getRecvPollPeriodMs() const; [[nodiscard]] uint64_t getMaxSeqIdleMicroseconds() const; + [[nodiscard]] std::optional getSpecDecConfig() const; void setMaxBeamWidth(SizeType32 maxBeamWidth); void setMaxBatchSize(SizeType32 maxBatchSize); @@ -890,6 +959,7 @@ class ExecutorConfig void setDebugConfig(DebugConfig const& debugConfig); void setRecvPollPeriodMs(SizeType32 const& recvPollPeriodMs); void setMaxSeqIdleMicroseconds(uint64_t maxNumTokens); + void setSpecDecConfig(SpeculativeDecodingConfig const& specDecConfig); private: friend class Serialization; @@ -952,6 +1022,9 @@ class ExecutorConfig /// @brief The maximum time in microseconds a scheduled request can remain idle before getting terminated. Default /// is 3 minutes. uint64_t mMaxSeqIdleMicroseconds; + + /// @brief The speculative decoding configuration + std::optional mSpeculativeDecodingConfig; }; /// @brief The executor is responsible for receiving new requests and sending responses, and running the inference @@ -1032,23 +1105,31 @@ class Executor /// @param id The request id for which to cancel the response void cancelRequest(IdType requestId); - /// @brief Signals the server to shutdown - /// This call is blocking. Only returns when all requests have terminated or timeout has been reached + /// @brief Signals the server to shutdown. + /// @details This call is blocking. Only returns when all requests have terminated or timeout has been reached void shutdown(); - /// @brief Returns the per-iterations statistics computed since last call to getLatestIterationStats - /// Contains at most iterStatsMaxIterations iterations + /// @brief Returns the per-iterations statistics computed since last call to getLatestIterationStats. + /// Contains at most iterStatsMaxIterations iterations. /// @return Iteration stats std::deque getLatestIterationStats(); - /// @brief Returns the request stats of each iteration computed since last call to getLatestRequestStats - /// Contains at most requestStatsMaxIterations iterations + /// @brief Returns the request stats of each iteration computed since last call to getLatestRequestStats. + /// Contains at most requestStatsMaxIterations iterations. /// @return Request stats grouped by iterations std::deque getLatestRequestStats(); + /// @brief Returns the debug tensors of each iteration computed since last call to getLatestDebugTensors. + /// Contains at most debugTensorsMaxIterations iterations. 
+ /// @return Request debug tensors grouped by iterations + std::deque getLatestDebugTensors(); + /// @brief Indicates if the current process is allowed to enqueueRequests [[nodiscard]] bool canEnqueueRequests() const; + /// @brief Indicates if the current process participates in this executor instance + [[nodiscard]] bool isParticipant() const; + private: class Impl; std::unique_ptr mImpl; diff --git a/cpp/include/tensorrt_llm/executor/serialization.h b/cpp/include/tensorrt_llm/executor/serialization.h index 11d22c3f0..28aba9dc1 100644 --- a/cpp/include/tensorrt_llm/executor/serialization.h +++ b/cpp/include/tensorrt_llm/executor/serialization.h @@ -75,10 +75,10 @@ class Serialization static void serialize(kv_cache::CacheState const& state, std::ostream& os); [[nodiscard]] static size_t serializedSize(kv_cache::CacheState const& state); - // ContextPhaseState - [[nodiscard]] static ContextPhaseState deserializeContextPhaseState(std::istream& is); - static void serialize(ContextPhaseState const& contextPhaseState, std::ostream& os); - [[nodiscard]] static size_t serializedSize(ContextPhaseState const& contextPhaseState); + // DataTransceiverState + [[nodiscard]] static DataTransceiverState deserializeDataTransceiverState(std::istream& is); + static void serialize(DataTransceiverState const& dataTransceiverState, std::ostream& os); + [[nodiscard]] static size_t serializedSize(DataTransceiverState const& dataTransceiverState); // ContextPhaseParams [[nodiscard]] static ContextPhaseParams deserializeContextPhaseParams(std::istream& is); @@ -95,6 +95,11 @@ class Serialization static void serialize(Tensor const& tensor, std::ostream& os); [[nodiscard]] static size_t serializedSize(Tensor const& tensor); + // SpeculativeDecodingFastLogitsInfo + [[nodiscard]] static SpeculativeDecodingFastLogitsInfo deserializeSpecDecFastLogitsInfo(std::istream& is); + static void serialize(SpeculativeDecodingFastLogitsInfo const& info, std::ostream& os); + [[nodiscard]] static size_t serializedSize(SpeculativeDecodingFastLogitsInfo const& info); + // Result [[nodiscard]] static Result deserializeResult(std::istream& is); static void serialize(Result const& result, std::ostream& os); diff --git a/cpp/include/tensorrt_llm/executor/types.h b/cpp/include/tensorrt_llm/executor/types.h index e07c539a9..5a8525caf 100644 --- a/cpp/include/tensorrt_llm/executor/types.h +++ b/cpp/include/tensorrt_llm/executor/types.h @@ -18,6 +18,7 @@ #include #include +#include #include #include #include @@ -197,6 +198,10 @@ enum class CapacitySchedulerPolicy /// @brief GUARANTEED_NO_EVICT uses KV cache more conservatively guaranteeing that a request, once started, will run /// to completion without eviction. kGUARANTEED_NO_EVICT = 1, + + /// @brief kSTATIC_BATCH does not schedule new requests until all requests in current batch are completed. + /// Similar to kGUARANTEED_NO_EVICT, requests will run to completion without eviction. 
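For illustration, a sketch of selecting the new static-batch capacity policy (the kSTATIC_BATCH enumerator is added immediately below); the SchedulerConfig constructor and the ExecutorConfig setter are assumed from the existing executor API rather than shown in this hunk.

using namespace tensorrt_llm::executor;

// kSTATIC_BATCH admits no new requests until every request in the current batch has
// completed, so the scheduler behaves like a classic static batch.
SchedulerConfig schedulerConfig(CapacitySchedulerPolicy::kSTATIC_BATCH);

ExecutorConfig executorConfig;
executorConfig.setSchedulerConfig(schedulerConfig); // setter assumed from the existing ExecutorConfig API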
+ kSTATIC_BATCH = 2 }; std::ostream& operator<<(std::ostream& os, CapacitySchedulerPolicy policy); @@ -332,6 +337,13 @@ enum class RequestStage kGENERATION_COMPLETE, }; +/// @brief Struct that holds the request stats in the case of disaggregated serving +struct DisServingRequestStats +{ + /// @brief The total time spent on transferring KV cache from context phase to generation phase (ms) + double kvCacheTransferMS; +}; + /// @brief Struct that holds the stats of a single request struct RequestStats { @@ -350,6 +362,8 @@ struct RequestStats /// @brief Whether the request is being paused at the current iteration due to lack of resources (KV cache blocks /// exhaustion for example) bool paused; + /// @brief Stats specific to disaggregated serving + std::optional disServingStats; }; /// @brief Struct that holds the stats of all requests in an iteration @@ -361,6 +375,15 @@ struct RequestStatsPerIteration std::vector requestStats; }; +/// @brief Struct that holds the debug tensors in an iteration +struct DebugTensorsPerIteration +{ + /// @brief The iteration id for these tensors + IterationType iter; + /// @brief The debug tensors for this iteration + std::map debugTensors; +}; + /// @brief The reason why the model stopped generating tokens for a request. enum class FinishReason { @@ -423,6 +446,11 @@ class DecodingMode return DecodingMode{kExplicitDraftTokens | kStandardStopCriteria | kUseExplicitEosStop}; } + static auto constexpr ExternalDraftTokens() + { + return DecodingMode{kExternalDraftTokens | kUsePenalties | kUseBanTokens | kStandardStopCriteria}; + } + auto constexpr useTemperature(bool useTemp) { mState = setBitTo(kUseTemperature, useTemp); @@ -540,6 +568,11 @@ class DecodingMode return anyBitSet(kExplicitDraftTokens); } + [[nodiscard]] bool constexpr isExternalDraftTokens() const + { + return anyBitSet(kExternalDraftTokens); + } + [[nodiscard]] bool constexpr isUseTemperature() const { return anyBitSet(kUseTemperature); @@ -653,6 +686,7 @@ class DecodingMode static UnderlyingType constexpr kMedusa{1u << (kNumFlags + 4)}; static UnderlyingType constexpr kLookahead{1u << (kNumFlags + 5)}; static UnderlyingType constexpr kExplicitDraftTokens{1u << (kNumFlags + 6)}; + static UnderlyingType constexpr kExternalDraftTokens{1u << (kNumFlags + 7)}; static UnderlyingType constexpr kTopKTopP{kTopK | kTopP}; [[nodiscard]] bool constexpr anyBitSet(UnderlyingType bits) const @@ -683,6 +717,7 @@ static_assert(!DecodingMode::Auto().isBeamSearch()); static_assert(!DecodingMode::Auto().isMedusa()); static_assert(!DecodingMode::Auto().isLookahead()); static_assert(!DecodingMode::Auto().isExplicitDraftTokens()); +static_assert(!DecodingMode::Auto().isExternalDraftTokens()); static_assert(DecodingMode::TopK().isTopK()); static_assert(DecodingMode::TopK().isTopKorTopP()); @@ -703,6 +738,7 @@ static_assert(!DecodingMode::TopK().isBeamSearch()); static_assert(!DecodingMode::TopK().isMedusa()); static_assert(!DecodingMode::TopK().isLookahead()); static_assert(!DecodingMode::TopK().isExplicitDraftTokens()); +static_assert(!DecodingMode::TopK().isExternalDraftTokens()); static_assert(DecodingMode::TopP().isTopP()); static_assert(DecodingMode::TopP().isTopKorTopP()); @@ -716,6 +752,7 @@ static_assert(!DecodingMode::TopP().isBeamSearch()); static_assert(!DecodingMode::TopP().isMedusa()); static_assert(!DecodingMode::TopP().isLookahead()); static_assert(!DecodingMode::TopP().isExplicitDraftTokens()); +static_assert(!DecodingMode::TopP().isExternalDraftTokens()); static_assert(DecodingMode::TopKTopP().isTopK()); 
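For illustration, the new external-draft-tokens mode can be queried with the same constexpr bit tests exercised by the surrounding assertions; a minimal sketch:

// Sketch only: DecodingMode::ExternalDraftTokens() enables penalties, ban-words and the
// standard stop criteria, but none of the top-k/top-p sampling bits.
auto constexpr kExternalDraftMode = tensorrt_llm::executor::DecodingMode::ExternalDraftTokens();
static_assert(kExternalDraftMode.isExternalDraftTokens());
static_assert(kExternalDraftMode.isUseBanWords() && kExternalDraftMode.isUseStopCriteria());
static_assert(!kExternalDraftMode.isTopKorTopP());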
static_assert(DecodingMode::TopKTopP().isTopP()); @@ -729,6 +766,7 @@ static_assert(!DecodingMode::TopKTopP().isBeamSearch()); static_assert(!DecodingMode::TopKTopP().isMedusa()); static_assert(!DecodingMode::TopKTopP().isLookahead()); static_assert(!DecodingMode::TopKTopP().isExplicitDraftTokens()); +static_assert(!DecodingMode::TopKTopP().isExternalDraftTokens()); static_assert(DecodingMode::BeamSearch().isBeamSearch()); static_assert(DecodingMode::BeamSearch().isUseStopCriteria()); @@ -737,6 +775,7 @@ static_assert(!DecodingMode::BeamSearch().isTopKorTopP()); static_assert(!DecodingMode::BeamSearch().isMedusa()); static_assert(!DecodingMode::BeamSearch().isLookahead()); static_assert(!DecodingMode::BeamSearch().isExplicitDraftTokens()); +static_assert(!DecodingMode::BeamSearch().isExternalDraftTokens()); static_assert(!DecodingMode::Medusa().isAuto()); static_assert(!DecodingMode::Medusa().isTopK()); @@ -752,6 +791,7 @@ static_assert(DecodingMode::Medusa().isUseStopCriteria()); static_assert(DecodingMode::Medusa().isUsePenalty()); static_assert(DecodingMode::Medusa().isUseMinLength()); static_assert(DecodingMode::Medusa().isMedusa()); +static_assert(!DecodingMode::Medusa().isExternalDraftTokens()); static_assert(!DecodingMode::Lookahead().isAuto()); static_assert(!DecodingMode::Lookahead().isTopK()); @@ -765,6 +805,7 @@ static_assert(DecodingMode::Lookahead().isUseStopCriteria()); static_assert(DecodingMode::Lookahead().isUseStopWords()); static_assert(DecodingMode::Lookahead().isUseExplicitEosStop()); static_assert(DecodingMode::Lookahead().isLookahead()); +static_assert(!DecodingMode::Lookahead().isExternalDraftTokens()); static_assert(!DecodingMode::ExplicitDraftTokens().isAuto()); static_assert(!DecodingMode::ExplicitDraftTokens().isTopK()); @@ -778,4 +819,19 @@ static_assert(!DecodingMode::ExplicitDraftTokens().isUsePenalty()); static_assert(DecodingMode::ExplicitDraftTokens().isUseStopCriteria()); static_assert(!DecodingMode::ExplicitDraftTokens().isUseBanWords()); static_assert(DecodingMode::ExplicitDraftTokens().isExplicitDraftTokens()); +static_assert(!DecodingMode::ExplicitDraftTokens().isExternalDraftTokens()); + +static_assert(!DecodingMode::ExternalDraftTokens().isTopK()); +static_assert(!DecodingMode::ExternalDraftTokens().isTopP()); +static_assert(!DecodingMode::ExternalDraftTokens().isTopKorTopP()); +static_assert(!DecodingMode::ExternalDraftTokens().isTopKandTopP()); +static_assert(DecodingMode::ExternalDraftTokens().isUseBanWords()); +static_assert(DecodingMode::ExternalDraftTokens().isUseOccurrencePenalty()); +static_assert(DecodingMode::ExternalDraftTokens().isUseStopCriteria()); +static_assert(!DecodingMode::ExternalDraftTokens().isAuto()); +static_assert(!DecodingMode::ExternalDraftTokens().isBeamSearch()); +static_assert(!DecodingMode::ExternalDraftTokens().isMedusa()); +static_assert(!DecodingMode::ExternalDraftTokens().isLookahead()); +static_assert(!DecodingMode::ExternalDraftTokens().isExplicitDraftTokens()); +static_assert(DecodingMode::ExternalDraftTokens().isExternalDraftTokens()); } // namespace tensorrt_llm::executor diff --git a/cpp/include/tensorrt_llm/runtime/decodingInput.h b/cpp/include/tensorrt_llm/runtime/decodingInput.h index 68ebf0547..630617b11 100644 --- a/cpp/include/tensorrt_llm/runtime/decodingInput.h +++ b/cpp/include/tensorrt_llm/runtime/decodingInput.h @@ -108,6 +108,20 @@ class DecodingInput TensorConstPtr medusaTargetTokensPerStep; //!< [batchSize], on gpu }; + class ExternalDraftTokensInputs + { + public: + TensorPtr draftLogits; + 
TensorPtr draftProbs; + TensorPtr targetProbs; + TensorPtr numDraftTokens; + TensorPtr draftTokenIds; + TensorPtr useDraftLogits; + SizeType32 step; + float constantThreshold; + bool useRandomAcceptanceThreshold; + }; + class ExplicitDraftTokensInputs { public: @@ -138,6 +152,8 @@ class DecodingInput std::optional explicitDraftTokensInputs; std::optional lookaheadInputs; + + std::optional externalDraftTokensInputs; }; } // namespace tensorrt_llm::runtime diff --git a/cpp/include/tensorrt_llm/runtime/decodingOutput.h b/cpp/include/tensorrt_llm/runtime/decodingOutput.h index 146db40a4..50a76588a 100644 --- a/cpp/include/tensorrt_llm/runtime/decodingOutput.h +++ b/cpp/include/tensorrt_llm/runtime/decodingOutput.h @@ -95,10 +95,13 @@ class DecodingOutput // mandatory parameters for beam search TensorPtr logProbs; // [BS, BM, MSL], must be float* TensorPtr cumLogProbs; // [BS, BM], optional for sampling - TensorPtr parentIds; // [BS, BM, MSL] + TensorPtr parentIds; // [BS, BM, MSL] index of the beam where the previous token is TensorPtr lengths; // [BS, BM], total sequence lengths including padding TensorPtr cacheIndirection; // [BS, BM, MSL], k/v indirection for next generation step + TensorPtr logProbsTiled; // [MSL, BS, BM] Buffer used to store the transpose of the logProbs. + // Needed because the kernels have been written to use that shape. + BeamHypotheses beamHypotheses; // Speculative decoding diff --git a/cpp/include/tensorrt_llm/runtime/gptDecoder.h b/cpp/include/tensorrt_llm/runtime/gptDecoder.h index 7ed345a8b..f12362ece 100644 --- a/cpp/include/tensorrt_llm/runtime/gptDecoder.h +++ b/cpp/include/tensorrt_llm/runtime/gptDecoder.h @@ -62,23 +62,8 @@ class IGptDecoder virtual void forwardSync(DecodingOutput& output, DecodingInput const& input) = 0; - virtual void gatherTree(DecodingOutput const& decodingOutput, DecodingInput const& decodingInput, - BufferManager const& manager, - std::optional> samplingConfig = std::nullopt) - = 0; - virtual SamplingConfig const& getSamplingConfig() = 0; - static void acceptDraftTokensByIds(ITensor const& targetTokenIds, ITensor const& draftTokenIds, - ITensor const& contextLengths, ITensor const& numDraftTokens, ITensor& sequenceLengths, - ITensor const& finishedVec, ITensor& finishedFinal, ITensor& finishedSum, ITensor const& batchSlots, - BufferManager::CudaStreamPtr const& stream); - - static void acceptDraftTokensByLogits(ITensor& draftLogits, ITensor const& targetLogits, ITensor& draftProbs, - ITensor& targetProbs, ITensor const& numDraftTokens, ITensor& finished, ITensor const& batchSlots, - SizeType32 vocabSize, SizeType32 vocabSizePadded, bool useRandomAcceptThreshold, float randomAcceptThreshold, - curandState_t* curandState, BufferManager::CudaStreamPtr const& stream); - static std::unique_ptr create(executor::DecodingMode const& mode, nvinfer1::DataType dtype, size_t maxBatchSize, size_t maxBeamWidth, size_t vocabSize, size_t vocabSizePadded, size_t maxSequenceLength, BufferManager::CudaStreamPtr const& stream, @@ -105,10 +90,6 @@ class GptDecoder : public virtual IGptDecoder void forwardSync(DecodingOutput& output, DecodingInput const& input) override; - void gatherTree(DecodingOutput const& decodingOutput, DecodingInput const& decodingInput, - BufferManager const& manager, - std::optional> samplingConfig = std::nullopt) override; - SamplingConfig const& getSamplingConfig() override { return mSamplingConfig; @@ -119,8 +100,6 @@ class GptDecoder : public virtual IGptDecoder std::shared_ptr> mDynamicDecodeLayer; std::shared_ptr 
mDecodingLayerWorkspace; - TensorPtr mLogProbsTiled; // Buffer used to store the transpose of the logProbs. Needed because the kernels have - // been written to use that shape. SamplingConfig mSamplingConfig; size_t mMaxBatchSize; diff --git a/cpp/include/tensorrt_llm/runtime/gptDecoderBatched.h b/cpp/include/tensorrt_llm/runtime/gptDecoderBatched.h index 358826f50..50bd89924 100644 --- a/cpp/include/tensorrt_llm/runtime/gptDecoderBatched.h +++ b/cpp/include/tensorrt_llm/runtime/gptDecoderBatched.h @@ -62,12 +62,12 @@ class GptDecoderBatched : public IGptDecoderBatched void newRequests(std::vector const& seqSlots, std::vector const& requests, std::vector const& samplingConfigs) override; - TokenPtr forwardAsync(decoder_batch::Output& output, decoder_batch::Input const& input) override; + DecoderFinishedEventPtr forwardAsync(decoder_batch::Output& output, decoder_batch::Input const& input) override; - void forwardSync(decoder_batch::Token const& token) override; + void forwardSync(decoder_batch::DecoderFinishedEvent const& decoderFinishEvent) override; - void forwardSync( - decoder_batch::Token const& token, decoder_batch::Output& output, decoder_batch::Input const& input) override; + void forwardSync(decoder_batch::DecoderFinishedEvent const& decoderFinishEvent, decoder_batch::Output& output, + decoder_batch::Input const& input) override; void forwardAsync(decoder::Output& output, decoder::Input const& input) override; @@ -245,7 +245,7 @@ class GptDecoderBatched : public IGptDecoderBatched void newRequest(SizeType32 batchSlot, decoder_batch::Request const& request, SamplingConfig const& samplingConfig); //! @brief Allocate buffers for speculative decoding. - void allocateSpeculativeDecodingBuffers(); + void allocateSpeculativeDecodingBuffers(nvinfer1::DataType dtype); //! @brief Setup buffers for speculative decoding. void setupSpeculativeDecoding(ModelConfig const& modelConfig); @@ -271,7 +271,7 @@ class GptDecoderBatched : public IGptDecoderBatched void newRequestExplicitDraftTokens(SizeType32 batchIdx, decoder_batch::Request const& request); //! @brief Updates finished state on host for all active requests - void updateFinished(decoder_batch::Token const& token); + void updateFinished(decoder_batch::DecoderFinishedEvent const& decoderFinishEvent); //! @brief Sets inputs for explicit draft tokens. 
void setExplicitDraftTokensInputs(decoder_batch::Input const& input); @@ -289,7 +289,7 @@ class GptDecoderBatched : public IGptDecoderBatched CudaStreamPtr mRuntimeStream; CudaStreamPtr mDecoderStream; BufferManager mBufferManager; - TokenPtr mForwardToken; + DecoderFinishedEventPtr mDecoderFinishEvent; CudaEvent mForwardEvent; using GptDecoderPtr = std::unique_ptr; @@ -300,10 +300,6 @@ class GptDecoderBatched : public IGptDecoderBatched DecodingInputPtr mJointDecodingInput; DecodingOutputPtr mJointDecodingOutput; - std::vector mAcceptByLogits; - TensorPtr mNumDraftTokens; - TensorPtr mCurandStates; - std::vector mNbSteps; std::vector mFinished; TensorPtr mFinishedSum; @@ -313,18 +309,9 @@ class GptDecoderBatched : public IGptDecoderBatched TensorPtr mFinishedSteps; // [maxTokensPerStep, batchSize, beamWidth] finished states of type FinishedState // for each generated token of maxTokensPerStep, on gpu - TensorPtr mDraftProbs; // [batchSize, maxTokensPerEngineStep, beamWidth, vocabPadded], temporary data for - // speculative decoding accept by logits kernel, on gpu - TensorPtr mTargetProbs; // [batchSize, maxTokensPerEngineStep, beamWidth, vocabPadded], temporary data for - // speculative decoding accept by logits kernel, on gpu - TensorPtr mDraftTokenIds; // [batchSize, maxTokensPerEngineStep], draft token indices, on gpu - TensorPtr mDraftLogits; // [batchSize, maxTokensPerEngineStep, vocabSizePadded], draft token logits, on gpu TensorPtr mBatchSlotsSetup; // [maxBatchSize], int32_t, address map, pinned TensorPtr mBatchSlotsDecoder; // [maxTokensPerEngineStep, maxBatchSize], int32_t, address map, pinned - TensorPtr mBatchSlotsAcceptTokens; // [maxTokensPerEngineStep, maxBatchSize], int32_t, address map, pinned - TensorPtr mBatchSlotsAcceptLogits; // [maxTokensPerEngineStep, maxBatchSize], int32_t, address map, pinned - TensorPtr mTargetLogitsPtrs; // [maxBatchSize], float*, pointers to target logits, pinned SizeType32 mMaxSequenceLength{}; SizeType32 mMaxAttentionWindow{}; SizeType32 mSinkTokenLength{}; diff --git a/cpp/include/tensorrt_llm/runtime/gptSession.h b/cpp/include/tensorrt_llm/runtime/gptSession.h index 46cd19902..a4b8e4cc3 100644 --- a/cpp/include/tensorrt_llm/runtime/gptSession.h +++ b/cpp/include/tensorrt_llm/runtime/gptSession.h @@ -115,7 +115,6 @@ class [[deprecated("Use the executor API instead.")]] GptSession std::optional genMicroBatchSize = std::nullopt; std::optional decodingMode = std::nullopt; bool normalizeLogProbs = true; - std::optional enginePath; }; //! @brief Optional profiler class to profile the generation phase of an inference request diff --git a/cpp/include/tensorrt_llm/runtime/iGptDecoderBatched.h b/cpp/include/tensorrt_llm/runtime/iGptDecoderBatched.h index 11464f80e..048fa05a7 100644 --- a/cpp/include/tensorrt_llm/runtime/iGptDecoderBatched.h +++ b/cpp/include/tensorrt_llm/runtime/iGptDecoderBatched.h @@ -75,11 +75,11 @@ class Input using Output = decoder::Output; -// TODO: is this a bad name to mix up with token concept in LLM? Would 'Event' be better? 
And should move to common.h -class Token +// used just as a container for easy returning / passing to function +class DecoderFinishedEvent { public: - explicit Token(CudaEvent&& event, std::vector const& active) + explicit DecoderFinishedEvent(CudaEvent&& event, std::vector const& active) : event(std::move(event)) , active(active) { @@ -96,7 +96,7 @@ class IGptDecoderBatched : public virtual IStatefulGptDecoder public: using CudaStreamPtr = std::shared_ptr; using TensorPtr = std::shared_ptr; - using TokenPtr = std::unique_ptr; + using DecoderFinishedEventPtr = std::unique_ptr; //! @brief Setup buffers for ExplicitDraftTokens decoding. virtual void setupExplicitDraftTokens(ExplicitDraftTokensBuffers::Inputs explicitDraftTokensBuffers) = 0; @@ -105,15 +105,15 @@ class IGptDecoderBatched : public virtual IStatefulGptDecoder virtual void setupLookahead(LookaheadDecodingBuffers lookaheadDecodingBuffers) = 0; //! @brief Run one step for all requests without blocking the host process and return the token for synchronization. - virtual TokenPtr forwardAsync(decoder_batch::Output& output, decoder_batch::Input const& input) = 0; + virtual DecoderFinishedEventPtr forwardAsync(decoder_batch::Output& output, decoder_batch::Input const& input) = 0; //! @brief Call decoder forwardSync and wait for the call to `forwardAsync` associated with a token to complete. - virtual void forwardSync( - decoder_batch::Token const& token, decoder_batch::Output& output, decoder_batch::Input const& input) + virtual void forwardSync(decoder_batch::DecoderFinishedEvent const& token, decoder_batch::Output& output, + decoder_batch::Input const& input) = 0; //! @brief Wait for the call to `forwardAsync` associated with a token to complete. - virtual void forwardSync(decoder_batch::Token const& token) = 0; + virtual void forwardSync(decoder_batch::DecoderFinishedEvent const& token) = 0; //! @brief Run one step for all requests and wait for completion on the host. 
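For illustration, a minimal sketch of the handshake implied by the renamed DecoderFinishedEvent: forwardAsync launches a step and returns the event, and the matching forwardSync waits on it. Only the interface declared above is used; buffer preparation is elided.

void stepDecoderOnce(tensorrt_llm::runtime::IGptDecoderBatched& decoder,
    tensorrt_llm::runtime::decoder_batch::Output& output,
    tensorrt_llm::runtime::decoder_batch::Input const& input)
{
    // Launch one decoding step for all active requests without blocking the host.
    auto decoderFinishEvent = decoder.forwardAsync(output, input);

    // ... overlap host-side work (e.g. preparing the next batch) here ...

    // Block until the step associated with this event has completed.
    decoder.forwardSync(*decoderFinishEvent);
}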
virtual void forward(decoder_batch::Output& output, decoder_batch::Input const& input) diff --git a/cpp/include/tensorrt_llm/runtime/ipcUtils.h b/cpp/include/tensorrt_llm/runtime/ipcUtils.h index 0dd45531e..76afedca8 100644 --- a/cpp/include/tensorrt_llm/runtime/ipcUtils.h +++ b/cpp/include/tensorrt_llm/runtime/ipcUtils.h @@ -32,7 +32,7 @@ class IpcMemory using BufferPtr = IBuffer::SharedPtr; // MAX_ALL_REDUCE_BLOCKS for block_barrier, 1 for multi_gpu_barrier - size_t static constexpr FLAGS_SIZE = (kernels::MAX_ALL_REDUCE_BLOCKS + 1) * sizeof(uint32_t); + size_t static constexpr FLAGS_SIZE = (tensorrt_llm::kernels::MAX_ALL_REDUCE_BLOCKS + 1) * sizeof(uint32_t); IpcMemory( std::size_t bufferSize, BufferManager const& manager, WorldConfig const& worldConfig, bool openIpc = true); diff --git a/cpp/include/tensorrt_llm/runtime/lookaheadBuffers.h b/cpp/include/tensorrt_llm/runtime/lookaheadBuffers.h index 56504bd94..3c6fe731a 100644 --- a/cpp/include/tensorrt_llm/runtime/lookaheadBuffers.h +++ b/cpp/include/tensorrt_llm/runtime/lookaheadBuffers.h @@ -62,6 +62,7 @@ class LookaheadRuntimeBuffers TensorMap& inputBuffers, TensorMap& outputBuffers, runtime::WorldConfig const& worldConfig) const; public: + TensorPtr cumSumLength; // [1] the cumulative sum of generation length, on pinned TensorPtr packedMasksDevice; // [forwardBatchSize, tokensPerStep, numPackedMasks], on gpu TensorPtr generationLengthsDevice; // [forwardBatchSize], on gpu TensorPtr positionOffsetsDevice; // [forwardBatchSize, tokensPerStep], on gpu diff --git a/cpp/include/tensorrt_llm/runtime/modelConfig.h b/cpp/include/tensorrt_llm/runtime/modelConfig.h index fc3ac2928..b1b495e75 100644 --- a/cpp/include/tensorrt_llm/runtime/modelConfig.h +++ b/cpp/include/tensorrt_llm/runtime/modelConfig.h @@ -60,6 +60,9 @@ class ModelConfig { kATTENTION, kRECURRENT, + // NOTE: Linear and noop are attention alternatives introduced in Nemotron-NAS. They do not use the KV cache. 
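For illustration, a sketch of how the per-layer KV head bookkeeping added further down in this file might be used once some layers no longer hold a KV cache; the constructor arguments are made-up values and only accessors declared in this hunk are called.

using namespace tensorrt_llm::runtime;

// 4 layers in total, of which only 2 are attention layers (the others could be
// kRECURRENT, kLINEAR or kNOOP and hold no KV cache).
ModelConfig modelConfig(/*vocabSize=*/32000, /*nbLayers=*/4, /*nbAttentionLayers=*/2,
    /*nbRnnLayers=*/0, /*nbHeads=*/32, /*hiddenSize=*/4096, nvinfer1::DataType::kHALF);

// One entry per attention layer; the length must equal nbAttentionLayers.
modelConfig.setNumKvHeadsPerLayer({8, 4});

auto const kvHeadsOfFirstAttentionLayer = modelConfig.getNbKvHeads(/*layerIdx=*/0); // 8
// getNumKvHeadsPerLayerLocalRange()/getSumLocalKvHeads() additionally take the pipeline
// parallelism and rank to size the KV cache of the local pipeline stage.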
+ kLINEAR, + kNOOP, }; enum class KVCacheType : std::int32_t @@ -97,13 +100,13 @@ class ModelConfig kEnabled, }; - explicit ModelConfig(SizeType32 vocabSize, SizeType32 nbAttentionLayers, SizeType32 nbRnnLayers, SizeType32 nbHeads, - SizeType32 hiddenSize, nvinfer1::DataType dtype) + explicit ModelConfig(SizeType32 vocabSize, SizeType32 nbLayers, SizeType32 nbAttentionLayers, + SizeType32 nbRnnLayers, SizeType32 nbHeads, SizeType32 hiddenSize, nvinfer1::DataType dtype) : mVocabSize(vocabSize) + , mNbLayers(nbLayers) , mNbAttentionLayers(nbAttentionLayers) , mNbRnnLayers(nbRnnLayers) , mNbHeads(nbHeads) - , mNbKvHeads(nbHeads) , mHiddenSize(hiddenSize) , mSizePerHead(mHiddenSize / mNbHeads) , mDataType(dtype) @@ -134,6 +137,10 @@ class ModelConfig , mUseShapeInference(true) , mManageWeightsType(ManageWeightsType::kDisabled) { + TLLM_CHECK_WITH_INFO(mNbLayers >= mNbAttentionLayers + mNbRnnLayers, + "Number of layers (%d) expected to be >= number of attention (%d) + number of rnn layers (%d)", mNbLayers, + mNbAttentionLayers, mNbRnnLayers); + setNbKvHeads(mNbHeads); } [[nodiscard]] static std::vector getOptProfilesSplitPoints() noexcept @@ -151,14 +158,55 @@ class ModelConfig return (mVocabSize + worldSize - 1) / worldSize * worldSize; } - [[nodiscard]] SizeType32 constexpr getNbAttentionLayers(SizeType32 pipelineParallelism = 1) const + [[nodiscard]] SizeType32 countLocalLayers( + LayerType layerType, SizeType32 pipelineParallelism = 1, SizeType32 pipelineParallelismRank = 0) const { - return mNbAttentionLayers / pipelineParallelism; + TLLM_CHECK_WITH_INFO(pipelineParallelism > 0, "Invalid pipelineParallelism: %d", pipelineParallelism); + auto const numLocalLayers = mNbLayers / pipelineParallelism; // WARNING: assume no remainder + auto const firstLocalLayerIt = mLayerTypes.cbegin() + (numLocalLayers * pipelineParallelismRank); + return std::count(firstLocalLayerIt, firstLocalLayerIt + numLocalLayers, layerType); } - [[nodiscard]] SizeType32 constexpr getNbRnnLayers(SizeType32 pipelineParallelism = 1) const + [[nodiscard]] SizeType32 countLowerRankLayers( + LayerType layerType, SizeType32 pipelineParallelism = 1, SizeType32 pipelineParallelismRank = 0) const { - return mNbRnnLayers / pipelineParallelism; + auto const numLocalLayers = mNbLayers / pipelineParallelism; // WARNING: assume no remainder + auto const firstLocalLayer = numLocalLayers * pipelineParallelismRank; + // count number of previous non-local attention layers + return std::count(mLayerTypes.cbegin(), mLayerTypes.cbegin() + firstLocalLayer, layerType); + } + + [[nodiscard]] SizeType32 getNbLayers(SizeType32 pipelineParallelism = 1) const + { + return mNbLayers / pipelineParallelism; // WARNING: assume no remainder + } + + [[nodiscard]] SizeType32 getNbAttentionLayers( + SizeType32 pipelineParallelism = 1, SizeType32 pipelineParallelismRank = 0) const + { + // TODO(oargov): get rid of this invalid state + if (mLayerTypes.empty()) + { + // this assumption might be wrong in a few cases, for example: + // layer types: [attention, recurrent, recurrent], pp=2 ==> first rank has 1 attention layer, not 0 + TLLM_LOG_DEBUG("Assuming uniform distribution of attention layers between ranks"); + return mNbAttentionLayers / pipelineParallelism; + } + return countLocalLayers(LayerType::kATTENTION, pipelineParallelism, pipelineParallelismRank); + } + + [[nodiscard]] SizeType32 getNbRnnLayers( + SizeType32 pipelineParallelism = 1, SizeType32 pipelineParallelismRank = 0) const + { + // TODO(oargov): get rid of this invalid state + if 
(mLayerTypes.empty()) + { + // this assumption might be wrong in a few cases, for example: + // layer types: [attention, attention, recurrent], pp=2 ==> second rank has 1 rnn layer, not 0 + TLLM_LOG_DEBUG("Assuming uniform distribution of recurrent layers between ranks"); + return mNbRnnLayers / pipelineParallelism; + } + return countLocalLayers(LayerType::kRECURRENT, pipelineParallelism, pipelineParallelismRank); } [[nodiscard]] SizeType32 constexpr getNbHeads() const noexcept @@ -166,14 +214,16 @@ class ModelConfig return mNbHeads; } - [[nodiscard]] SizeType32 constexpr getNbKvHeads() const noexcept + [[nodiscard]] SizeType32 getNbKvHeads(SizeType32 layerIdx) const { - return mNbKvHeads; + TLLM_CHECK_WITH_INFO(layerIdx < mNbAttentionLayers, "Layer index %d is out of bounds", layerIdx); + return mNumKvHeadsPerAttentionLayer[layerIdx]; } - void constexpr setNbKvHeads(SizeType32 nbKvHeads) noexcept + // set the number of kv heads for all layers + void setNbKvHeads(SizeType32 nbKvHeads) { - mNbKvHeads = nbKvHeads; + mNumKvHeadsPerAttentionLayer = std::vector(mNbAttentionLayers, nbKvHeads); } [[nodiscard]] SizeType32 constexpr getHiddenSize() const noexcept @@ -645,12 +695,46 @@ class ModelConfig mModelName = modelName; } + [[nodiscard]] std::vector const& getNumKvHeadsPerLayer() const + { + return mNumKvHeadsPerAttentionLayer; + } + + [[nodiscard]] std::pair::const_iterator, std::vector::const_iterator> + getNumKvHeadsPerLayerLocalRange(SizeType32 pipelineParallelism = 1, SizeType32 pipelineParallelismRank = 0) const + { + TLLM_CHECK_WITH_INFO(pipelineParallelism > 0, "Invalid pipelineParallelism: %d", pipelineParallelism); + // count number of previous non-local attention layers + auto const numPrevAttnLayers + = countLowerRankLayers(LayerType::kATTENTION, pipelineParallelism, pipelineParallelismRank); + auto const firstLocalAttentionLayerIt = mNumKvHeadsPerAttentionLayer.cbegin() + numPrevAttnLayers; + auto const numLocalAttentionLayers + = countLocalLayers(LayerType::kATTENTION, pipelineParallelism, pipelineParallelismRank); + return std::make_pair(firstLocalAttentionLayerIt, firstLocalAttentionLayerIt + numLocalAttentionLayers); + } + + void setNumKvHeadsPerLayer(std::vector const& headsPerLayer) + { + auto const numElems = static_cast(headsPerLayer.size()); + TLLM_CHECK_WITH_INFO(numElems == mNbAttentionLayers, + "Length of head_per_layer (%d) must match number of attention layers (%d)", numElems, mNbAttentionLayers); + mNumKvHeadsPerAttentionLayer = headsPerLayer; + } + + [[nodiscard]] SizeType32 getSumLocalKvHeads( + SizeType32 pipelineParallelism = 1, SizeType32 pipelineParallelismRank = 0) const + { + auto [cbegin, cend] = getNumKvHeadsPerLayerLocalRange(pipelineParallelism, pipelineParallelismRank); + auto const sumLocalHeads = std::reduce(cbegin, cend); + return sumLocalHeads; + } + private: SizeType32 mVocabSize; + SizeType32 mNbLayers; SizeType32 mNbAttentionLayers; SizeType32 mNbRnnLayers; SizeType32 mNbHeads; - SizeType32 mNbKvHeads; SizeType32 mHiddenSize; SizeType32 mSizePerHead; nvinfer1::DataType mDataType; @@ -703,6 +787,7 @@ class ModelConfig bool mUseShapeInference; ManageWeightsType mManageWeightsType; std::string mModelName; + std::vector mNumKvHeadsPerAttentionLayer; }; } // namespace tensorrt_llm::runtime diff --git a/cpp/include/tensorrt_llm/runtime/speculativeDecodingMode.h b/cpp/include/tensorrt_llm/runtime/speculativeDecodingMode.h index 8226c411c..e739e8188 100644 --- a/cpp/include/tensorrt_llm/runtime/speculativeDecodingMode.h +++ 
b/cpp/include/tensorrt_llm/runtime/speculativeDecodingMode.h @@ -97,8 +97,7 @@ class SpeculativeDecodingMode [[nodiscard]] bool constexpr variableDraftLength() const { - // Add Lookahead, when lookahead supports it. - return anyBitSet(kDraftTokensExternal | kExplicitDraftTokens); + return anyBitSet(kDraftTokensExternal | kExplicitDraftTokens | kLookaheadDecoding); } [[nodiscard]] bool constexpr hasDraftLogits() const diff --git a/cpp/tensorrt_llm/CMakeLists.txt b/cpp/tensorrt_llm/CMakeLists.txt index 10debf560..2ff2de09e 100644 --- a/cpp/tensorrt_llm/CMakeLists.txt +++ b/cpp/tensorrt_llm/CMakeLists.txt @@ -348,9 +348,11 @@ endif() if(NOT WIN32) # Unix-like compilers set(UNDEFINED_FLAG "-Wl,--no-undefined") set(AS_NEEDED_FLAG "-Wl,--as-needed") + set(NO_AS_NEEDED_FLAG "-Wl,--no-as-needed") else() # Windows set(UNDEFINED_FLAG "") set(AS_NEEDED_FLAG "") + set(NO_AS_NEEDED_FLAG "") endif() set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON) diff --git a/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_batch_manager_static.a b/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_batch_manager_static.a index a7b70a468..70202b6ff 100644 --- a/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_batch_manager_static.a +++ b/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_batch_manager_static.a @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c2b202970ce1e9ee672df2692cc5bd1676be62b4185878ad8aa1afb0fe342f41 -size 4474050 +oid sha256:96164a1788ee2edfdb9f18906e4c2727d5593274f00f40b065ccefa7b2a71063 +size 5206662 diff --git a/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_batch_manager_static.pre_cxx11.a b/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_batch_manager_static.pre_cxx11.a index 0a4c8f235..a4e75f719 100644 --- a/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_batch_manager_static.pre_cxx11.a +++ b/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_batch_manager_static.pre_cxx11.a @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:52ddca820b9c23d3ce209e3cb321e205fc62563461324e47892b0312d38719ef -size 4573988 +oid sha256:ecf5976593289620ab34b311a7c725ab946510edf610f58e0e0aff86610469a3 +size 5316564 diff --git a/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/version.txt b/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/version.txt index 14737cbef..feddb36b7 100644 --- a/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/version.txt +++ b/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/version.txt @@ -1,3 +1,3 @@ -625e1171d9e44b814e58c12c4ad7eead libtensorrt_llm_batch_manager_static.a -bf803b865d786a024c5e9e2fd9e40791 libtensorrt_llm_batch_manager_static.pre_cxx11.a -867c9da34e774ba1b0cdcd0a7d153a687b8e8dc6 commit \ No newline at end of file +99a273dde85e731ce80079d4769ea45f libtensorrt_llm_batch_manager_static.a +52afd574ed63cd6157fd32c8f95770be libtensorrt_llm_batch_manager_static.pre_cxx11.a +cef1070ccdde579844de64f2a2bb8099bc3e5f02 commit \ No newline at end of file diff --git a/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_batch_manager_static.a b/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_batch_manager_static.a index 2db07ac01..1f78589c5 100644 --- a/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_batch_manager_static.a +++ b/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_batch_manager_static.a @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:f8f5c3fd7a4a0fe2a95d9632ffc1b5fcdf8351cf65483b046115309746ec001a -size 4337326 +oid sha256:e7de37c449d41e183580feebd9bc581e11ad9c19ddde00a2b7f4e3dac6bd5bb6 +size 5048202 diff --git a/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_batch_manager_static.pre_cxx11.a b/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_batch_manager_static.pre_cxx11.a index ab690974e..f81826dc0 100644 --- a/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_batch_manager_static.pre_cxx11.a +++ b/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_batch_manager_static.pre_cxx11.a @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:28f3746c1b3c59f3bfe5afd8f960babddb0a7e31130ce84a5d0e0c20382e86b1 -size 4301934 +oid sha256:a4cdfaa251cf3d3e9219d161ef112af4c691a9e1a2e7791cd806696f9750f725 +size 5009770 diff --git a/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/version.txt b/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/version.txt index 61e5782a8..830a4f130 100644 --- a/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/version.txt +++ b/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/version.txt @@ -1,3 +1,3 @@ -beb06bae50f372d9413d705ae5cc6986 libtensorrt_llm_batch_manager_static.a -ea9db9b655537ba55110102dcbe62733 libtensorrt_llm_batch_manager_static.pre_cxx11.a -867c9da34e774ba1b0cdcd0a7d153a687b8e8dc6 commit \ No newline at end of file +ac3aad947cbaa559cf0c96eca8b91a42 libtensorrt_llm_batch_manager_static.a +03c5875d49ef25f416378a2997cf67ae libtensorrt_llm_batch_manager_static.pre_cxx11.a +cef1070ccdde579844de64f2a2bb8099bc3e5f02 commit \ No newline at end of file diff --git a/cpp/tensorrt_llm/batch_manager/x86_64-windows-msvc/tensorrt_llm_batch_manager_static.lib b/cpp/tensorrt_llm/batch_manager/x86_64-windows-msvc/tensorrt_llm_batch_manager_static.lib index 5fd10d767..603fb407a 100644 --- a/cpp/tensorrt_llm/batch_manager/x86_64-windows-msvc/tensorrt_llm_batch_manager_static.lib +++ b/cpp/tensorrt_llm/batch_manager/x86_64-windows-msvc/tensorrt_llm_batch_manager_static.lib @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d970a69f8de5e8f852a572715d95c2144936464422367e6b35b6e7f99f9097d6 -size 26765420 +oid sha256:0b4eaa12656c06eab96a526aaeebd8292ae91450641e0719827f429dca1af813 +size 32677392 diff --git a/cpp/tensorrt_llm/batch_manager/x86_64-windows-msvc/version.txt b/cpp/tensorrt_llm/batch_manager/x86_64-windows-msvc/version.txt index 45c04b499..27ad16618 100644 --- a/cpp/tensorrt_llm/batch_manager/x86_64-windows-msvc/version.txt +++ b/cpp/tensorrt_llm/batch_manager/x86_64-windows-msvc/version.txt @@ -1,2 +1,2 @@ -845200593f2128c3c25e02e59ee2d115 tensorrt_llm_batch_manager_static.lib -867c9da34e774ba1b0cdcd0a7d153a687b8e8dc6 commit \ No newline at end of file +38905d5e30a4169dfa2a04efdb349d11 tensorrt_llm_batch_manager_static.lib +cef1070ccdde579844de64f2a2bb8099bc3e5f02 commit \ No newline at end of file diff --git a/cpp/tensorrt_llm/common/customAllReduceUtils.h b/cpp/tensorrt_llm/common/customAllReduceUtils.h index 9f2d93316..d7bf43b40 100644 --- a/cpp/tensorrt_llm/common/customAllReduceUtils.h +++ b/cpp/tensorrt_llm/common/customAllReduceUtils.h @@ -21,7 +21,7 @@ namespace tensorrt_llm::utils::customAllReduceUtils { -constexpr size_t NUM_POINTERS_PER_RANK = 4; +constexpr size_t NUM_POINTERS_PER_RANK = 7; // WARNING: MUST BE KEPT IN SYNC with tensorrt_llm/plugin/plugin.py inline size_t getMaxRequiredWorkspaceSize(int worldSize) noexcept diff --git a/cpp/tensorrt_llm/common/envUtils.cpp 
b/cpp/tensorrt_llm/common/envUtils.cpp index 5a0ef3b1d..cfff14f64 100644 --- a/cpp/tensorrt_llm/common/envUtils.cpp +++ b/cpp/tensorrt_llm/common/envUtils.cpp @@ -157,4 +157,42 @@ bool getEnvEnablePDL() return enablePDL; } +bool getEnvUseUCXKvCache() +{ + static bool init = false; + static bool useUCXKVCache = false; + if (!init) + { + init = true; + { + char const* use_ucx_kv_cache = std::getenv("TRTLLM_USE_UCX_KVCACHE"); + if (use_ucx_kv_cache) + { + if (use_ucx_kv_cache[0] == '1' && use_ucx_kv_cache[1] == '\0') + { + useUCXKVCache = true; + } + } + } + } + return useUCXKVCache; +} + +std::string getEnvUCXInterface() +{ + static bool init = false; + static std::string ucxInterface; + if (!init) + { + init = true; + { + char const* ucx_interface = std::getenv("TRTLLM_UCX_INTERFACE"); + if (ucx_interface) + { + ucxInterface = ucx_interface; + } + } + } + return ucxInterface; +} } // namespace tensorrt_llm::common diff --git a/cpp/tensorrt_llm/common/envUtils.h b/cpp/tensorrt_llm/common/envUtils.h index 7aff8d40d..f8b71b00c 100644 --- a/cpp/tensorrt_llm/common/envUtils.h +++ b/cpp/tensorrt_llm/common/envUtils.h @@ -18,6 +18,7 @@ #pragma once #include #include +#include namespace tensorrt_llm::common { @@ -40,4 +41,8 @@ int getEnvMmhaKernelBlockSize(); // Whether PDL is enabled. bool getEnvEnablePDL(); +bool getEnvUseUCXKvCache(); + +std::string getEnvUCXInterface(); + } // namespace tensorrt_llm::common diff --git a/cpp/tensorrt_llm/common/mpiUtils.cpp b/cpp/tensorrt_llm/common/mpiUtils.cpp index c47bdf2ec..be1de0a9e 100644 --- a/cpp/tensorrt_llm/common/mpiUtils.cpp +++ b/cpp/tensorrt_llm/common/mpiUtils.cpp @@ -16,6 +16,7 @@ #include #include +#include #include "tensorrt_llm/common/mpiUtils.h" @@ -127,7 +128,6 @@ std::vector getWorldRanks(MpiComm const& comm) MPICHECK(MPI_Group_translate_ranks(group, groupSize, ranks.data(), worldGroup, worldRanks.data())); MPICHECK(MPI_Group_free(&group)); MPICHECK(MPI_Group_free(&worldGroup)); - std::sort(worldRanks.begin(), worldRanks.end()); #else std::vector worldRanks{0}; #endif @@ -314,6 +314,18 @@ void MpiComm::allgather(void const* sendbuf, void* recvbuf, int count, MpiType d #endif // ENABLE_MULTI_DEVICE } +void MpiComm::allgatherv(void const* sendbuf, int sendcount, MpiType sendtype, void* recvbuf, + std::vector const& recvcounts, std::vector const& displs, MpiType recvtype) const +{ +#if ENABLE_MULTI_DEVICE + MPICHECK(MPI_Allgatherv(sendbuf, sendcount, getMpiDtype(sendtype), recvbuf, recvcounts.data(), displs.data(), + getMpiDtype(recvtype), mComm)); + +#else + TLLM_THROW("Multi device support is disabled."); +#endif // ENABLE_MULTI_DEVICE +} + void MpiComm::mprobe(int source, int tag, MPI_Message* msg, MPI_Status* status) const { #if ENABLE_MULTI_DEVICE @@ -323,6 +335,18 @@ void MpiComm::mprobe(int source, int tag, MPI_Message* msg, MPI_Status* status) #endif // ENABLE_MULTI_DEVICE } +bool MpiComm::improbe(int source, int tag, MPI_Message* msg, MPI_Status* status) const +{ +#if ENABLE_MULTI_DEVICE + int flag{0}; + MPICHECK(MPI_Improbe(source, tag, mComm, &flag, msg, status)); + return flag != 0; +#else + TLLM_THROW("Multi device support is disabled."); + return false; +#endif +} + bool MpiComm::iprobe(int source, int tag, MPI_Status* status) const { #if ENABLE_MULTI_DEVICE @@ -391,31 +415,30 @@ MpiComm& MpiComm::mutableLocalSession() void MpiComm::refreshLocalSession() { #if ENABLE_MULTI_DEVICE - static std::vector initSessionRanks; static std::mutex mutex; std::unique_lock lock(mutex); - if (initSessionRanks.empty()) - { - auto 
initSessionRanks = getWorldRanks(MpiComm::session()); - auto localSessionRanks = getWorldRanks(MpiComm::localSession()); - std::vector intersectionRanks; - std::set_intersection(initSessionRanks.begin(), initSessionRanks.end(), localSessionRanks.begin(), - localSessionRanks.end(), std::back_inserter(intersectionRanks)); - - MPI_Group worldGroup; - MPICHECK(MPI_Comm_group(MPI_COMM_WORLD, &worldGroup)); - MPI_Group localGroup; - MPICHECK(MPI_Group_incl(worldGroup, intersectionRanks.size(), intersectionRanks.data(), &localGroup)); - MPI_Comm localComm; - MPICHECK(MPI_Comm_create_group(MPI_COMM_WORLD, localGroup, intersectionRanks.front(), &localComm)); - MpiComm::mutableLocalSession().mFreeComm = true; - MpiComm::mutableLocalSession() = MpiComm{localComm, false}; - } - else + auto initSessionRanks = getWorldRanks(MpiComm::session()); + auto localSessionRanks = getWorldRanks(MpiComm::localSession()); + + // Add to intersectionRanks in order of initSessionRanks + std::vector intersectionRanks; + std::unordered_set localSessionRanksSet(localSessionRanks.begin(), localSessionRanks.end()); + for (auto rank : initSessionRanks) { - TLLM_CHECK_WITH_INFO(getWorldRanks(MpiComm::session()) == initSessionRanks, - "Executors in the same process must use the same participant IDs."); + if (localSessionRanksSet.find(rank) != localSessionRanksSet.end()) + { + intersectionRanks.push_back(rank); + } } + + MPI_Group worldGroup; + MPICHECK(MPI_Comm_group(MPI_COMM_WORLD, &worldGroup)); + MPI_Group localGroup; + MPICHECK(MPI_Group_incl(worldGroup, intersectionRanks.size(), intersectionRanks.data(), &localGroup)); + MPI_Comm localComm; + MPICHECK(MPI_Comm_create_group(MPI_COMM_WORLD, localGroup, intersectionRanks.front(), &localComm)); + MpiComm::mutableLocalSession().mFreeComm = true; + MpiComm::mutableLocalSession() = MpiComm{localComm, false}; TLLM_LOG_INFO("Refreshed the MPI local session"); #endif // ENABLE_MULTI_DEVICE } diff --git a/cpp/tensorrt_llm/common/reduceKernelUtils.cuh b/cpp/tensorrt_llm/common/reduceKernelUtils.cuh index 979d8dd6f..c5a4fe0e2 100644 --- a/cpp/tensorrt_llm/common/reduceKernelUtils.cuh +++ b/cpp/tensorrt_llm/common/reduceKernelUtils.cuh @@ -38,6 +38,12 @@ namespace common template struct BytesToType; +template <> +struct BytesToType<1> +{ + using type = uint8_t; +}; + template <> struct BytesToType<2> { diff --git a/cpp/tensorrt_llm/executor/aarch64-linux-gnu/libtensorrt_llm_executor_static.a b/cpp/tensorrt_llm/executor/aarch64-linux-gnu/libtensorrt_llm_executor_static.a index e5223bbfb..a27ea7589 100644 --- a/cpp/tensorrt_llm/executor/aarch64-linux-gnu/libtensorrt_llm_executor_static.a +++ b/cpp/tensorrt_llm/executor/aarch64-linux-gnu/libtensorrt_llm_executor_static.a @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2d21eecce6992b0099a580328a4eeeaf9bf4c52076e642a609088dca34c74803 -size 1751564 +oid sha256:3097d831283ec377c42227aed2b62d8fc0a3cd1bd766c730ace5372db3a8778b +size 2214754 diff --git a/cpp/tensorrt_llm/executor/aarch64-linux-gnu/libtensorrt_llm_executor_static.pre_cxx11.a b/cpp/tensorrt_llm/executor/aarch64-linux-gnu/libtensorrt_llm_executor_static.pre_cxx11.a index 247c944cc..2341a015b 100644 --- a/cpp/tensorrt_llm/executor/aarch64-linux-gnu/libtensorrt_llm_executor_static.pre_cxx11.a +++ b/cpp/tensorrt_llm/executor/aarch64-linux-gnu/libtensorrt_llm_executor_static.pre_cxx11.a @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4cf43ade3c597e3258e880fc19d2d98e4974a773d932fc6e99b0bc56af67f750 -size 1781954 +oid 
sha256:cc302b9720b93c583ef0de78225d33bdc070000354e074eb362c6712cc228ed1 +size 2242542 diff --git a/cpp/tensorrt_llm/executor/aarch64-linux-gnu/version.txt b/cpp/tensorrt_llm/executor/aarch64-linux-gnu/version.txt index b71d7b8bc..47ec985c3 100644 --- a/cpp/tensorrt_llm/executor/aarch64-linux-gnu/version.txt +++ b/cpp/tensorrt_llm/executor/aarch64-linux-gnu/version.txt @@ -1,3 +1,3 @@ -7b9dc1aa716176dab6a2efb46c9daf06 libtensorrt_llm_executor_static.a -48d4741a6ef91bc3ed2209fc39d25edc libtensorrt_llm_executor_static.pre_cxx11.a -867c9da34e774ba1b0cdcd0a7d153a687b8e8dc6 commit \ No newline at end of file +ff0d39f3c7a1d2df88b9b281f6c6883c libtensorrt_llm_executor_static.a +bf82afc7a6e6afc288e7a2c7de1c8944 libtensorrt_llm_executor_static.pre_cxx11.a +cef1070ccdde579844de64f2a2bb8099bc3e5f02 commit \ No newline at end of file diff --git a/cpp/tensorrt_llm/executor/x86_64-linux-gnu/libtensorrt_llm_executor_static.a b/cpp/tensorrt_llm/executor/x86_64-linux-gnu/libtensorrt_llm_executor_static.a index c5ca6fe2e..182cb7ab1 100644 --- a/cpp/tensorrt_llm/executor/x86_64-linux-gnu/libtensorrt_llm_executor_static.a +++ b/cpp/tensorrt_llm/executor/x86_64-linux-gnu/libtensorrt_llm_executor_static.a @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e02165792048793f72a67666dbdad274ab52eaec1bad847deb96cb1c6f9900fa -size 1814682 +oid sha256:2aa2554bf7a93a45ae311d0e9ffb2b43b26bd8697437101ced24564c8884995c +size 3297132 diff --git a/cpp/tensorrt_llm/executor/x86_64-linux-gnu/libtensorrt_llm_executor_static.pre_cxx11.a b/cpp/tensorrt_llm/executor/x86_64-linux-gnu/libtensorrt_llm_executor_static.pre_cxx11.a index 241934be8..07bee4a94 100644 --- a/cpp/tensorrt_llm/executor/x86_64-linux-gnu/libtensorrt_llm_executor_static.pre_cxx11.a +++ b/cpp/tensorrt_llm/executor/x86_64-linux-gnu/libtensorrt_llm_executor_static.pre_cxx11.a @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8b148526b181a9eb7c2c0eff11395e64c60526397f2d887b8ace87f3f130c750 -size 1724916 +oid sha256:fa0a8208fbbaa3eaa22ffe98cdba16dffdc112dd7c7273dfc0f4b5805092b65b +size 3212266 diff --git a/cpp/tensorrt_llm/executor/x86_64-linux-gnu/version.txt b/cpp/tensorrt_llm/executor/x86_64-linux-gnu/version.txt index 7cf6fe271..16707d37a 100644 --- a/cpp/tensorrt_llm/executor/x86_64-linux-gnu/version.txt +++ b/cpp/tensorrt_llm/executor/x86_64-linux-gnu/version.txt @@ -1,3 +1,3 @@ -9a82b1a3cc646a499f48d0ca0b154e06 libtensorrt_llm_executor_static.a -ded4c067f44d95ac7ddb683b3b82cbf6 libtensorrt_llm_executor_static.pre_cxx11.a -867c9da34e774ba1b0cdcd0a7d153a687b8e8dc6 commit \ No newline at end of file +9a3657a830e424f26494c81903245bf8 libtensorrt_llm_executor_static.a +af374d630c9fd7da70d7971cd95cccfb libtensorrt_llm_executor_static.pre_cxx11.a +cef1070ccdde579844de64f2a2bb8099bc3e5f02 commit \ No newline at end of file diff --git a/cpp/tensorrt_llm/executor/x86_64-windows-msvc/tensorrt_llm_executor_static.lib b/cpp/tensorrt_llm/executor/x86_64-windows-msvc/tensorrt_llm_executor_static.lib index 29549ac60..645586a70 100644 --- a/cpp/tensorrt_llm/executor/x86_64-windows-msvc/tensorrt_llm_executor_static.lib +++ b/cpp/tensorrt_llm/executor/x86_64-windows-msvc/tensorrt_llm_executor_static.lib @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6244f920c8327495d842d1835b7593e5a8a2fd9a119ec383d95c1794eacf527c -size 19231936 +oid sha256:077d9336da40781c22c8c0da23fb844e91473a0bee9671bf7140c6e9434b3de3 +size 21155258 diff --git a/cpp/tensorrt_llm/executor/x86_64-windows-msvc/version.txt 
b/cpp/tensorrt_llm/executor/x86_64-windows-msvc/version.txt index 204f4631f..3757330bd 100644 --- a/cpp/tensorrt_llm/executor/x86_64-windows-msvc/version.txt +++ b/cpp/tensorrt_llm/executor/x86_64-windows-msvc/version.txt @@ -1,2 +1,2 @@ -502454ea4f6888677f8d91c419def633 tensorrt_llm_executor_static.lib -867c9da34e774ba1b0cdcd0a7d153a687b8e8dc6 commit \ No newline at end of file +04280fbceefc758ac20758609526b354 tensorrt_llm_executor_static.lib +cef1070ccdde579844de64f2a2bb8099bc3e5f02 commit \ No newline at end of file diff --git a/cpp/tensorrt_llm/kernels/beamSearchKernels/beamSearchKernelsTemplate.h b/cpp/tensorrt_llm/kernels/beamSearchKernels/beamSearchKernelsTemplate.h index 8b44a419a..895a91483 100644 --- a/cpp/tensorrt_llm/kernels/beamSearchKernels/beamSearchKernelsTemplate.h +++ b/cpp/tensorrt_llm/kernels/beamSearchKernels/beamSearchKernelsTemplate.h @@ -630,7 +630,7 @@ void topKSoftMaxKernelLauncher(T const* logits, T const* bias, void* workspace, // ┃ pTemp ┃ BS * PAD_K * VP * (2 * (PAD_K * 2) + 2) | | float | // ┗━━━━━━━━━━┛ -------------------------------------------------------------------------------- - // Stage1: gridDim(BS,BM,nVPart), blockDim(nBlockSize,1,1) + // beamStage1Kernel: gridDim(BS,BM,nVPart), blockDim(nBlockSize,1,1) // Each ThreadBlock takes `nVocabChunk` contiguous elements in logits to do TopK and reduce_md, // then writes output into pTemp. // At end of this kernel, each ThreadBlock holds the indices and values of the top 2*BM elements, @@ -647,7 +647,7 @@ void topKSoftMaxKernelLauncher(T const* logits, T const* bias, void* workspace, // ┃ md ┃ 2 | 2 | float | // ┗━━━━━━━━━━┛ ----------------------------------------- - // Stage2: gridDim(BS,BM,1), blockDim(32/64/128,1,1) + // beamStage2Kernel: gridDim(BS,BM,1), blockDim(32/64/128,1,1) // Each TheadBlock takes `nVPart` contiguous Tiles in pTemp to do reduce_topk and reduce_md, // writes output topk_id into in pTempId, writes topk_value + cumLogProbs into pTempVal. diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fmhaRunner.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fmhaRunner.cpp index 5857e927d..0c228692c 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fmhaRunner.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fmhaRunner.cpp @@ -165,7 +165,7 @@ void FusedMHARunnerV2::setupKernelParams(MHARunnerParams runnerParams) // Use exp2f optimization for warp-specialized ws kernels on Hopper. if (mLaunchParams.useBase2ExpTrick) { - // The kernel adopts the log2f optimziation. + // The kernel adopts the log2f optimization. constexpr float kLog2e = 1.4426950408889634074; // log_2(e) = M_LOG2E set_alpha(mKernelParams.scale_bmm1, scale_bmm1 * float(kLog2e), DATA_TYPE_FP32); } @@ -364,8 +364,8 @@ void FusedMHARunnerV2::setupLaunchParams(MHARunnerParams runnerParams) void FusedMHARunnerV2::setPackedQkvTmaDescriptors(MHARunnerParams runnerParams) { // split D into multiple groups in order to match the TMA swizzle mode (128B) - const uint32_t d_in_bytes = get_size_in_bytes(mLaunchParams.padded_d, mFixedParams.dataType); - const uint32_t d_groups = d_in_bytes > 128 ? d_in_bytes / 128 : 1; + uint32_t const d_in_bytes = get_size_in_bytes(mLaunchParams.padded_d, mFixedParams.dataType); + uint32_t const d_groups = d_in_bytes > 128 ? 
d_in_bytes / 128 : 1; // separate q, k, v and o tma descriptors Multiple_tma_descriptor<4> qkv_tma_descriptor; @@ -421,8 +421,8 @@ void FusedMHARunnerV2::setPackedQkvTmaDescriptors(MHARunnerParams runnerParams) uint32_t fp32_to_tf32 = 0; // gmma descriptor mode - const uint32_t d_bytes_per_group = d_in_bytes / d_groups; - const cudaTmaDescSwizzle swizzle_mode = (d_bytes_per_group > 64 + uint32_t const d_bytes_per_group = d_in_bytes / d_groups; + cudaTmaDescSwizzle const swizzle_mode = (d_bytes_per_group > 64 ? cudaTmaDescSwizzle::SWIZZLE_128B : (d_bytes_per_group > 32 ? cudaTmaDescSwizzle::SWIZZLE_64B : cudaTmaDescSwizzle::SWIZZLE_32B)); @@ -474,8 +474,8 @@ void FusedMHARunnerV2::setPackedQkvTmaDescriptors(MHARunnerParams runnerParams) void FusedMHARunnerV2::setSeparateQKvTmaDescriptors(MHARunnerParams runnerParams) { // split D into multiple groups in order to match the TMA swizzle mode (128B) - const uint32_t d_in_bytes = get_size_in_bytes(mLaunchParams.padded_d, mFixedParams.dataType); - const uint32_t d_groups = d_in_bytes > 128 ? d_in_bytes / 128 : 1; + uint32_t const d_in_bytes = get_size_in_bytes(mLaunchParams.padded_d, mFixedParams.dataType); + uint32_t const d_groups = d_in_bytes > 128 ? d_in_bytes / 128 : 1; uint32_t q_step = 0, kv_step = 0; xmmaKernel->getStepSize(q_step, kv_step, mKernelParams, mLaunchParams); @@ -518,7 +518,7 @@ void FusedMHARunnerV2::setSeparateQKvTmaDescriptors(MHARunnerParams runnerParams = (get_size_in_bytes(mFixedParams.dataType) == 1) ? cudaTmaDescFormat::U8 : cudaTmaDescFormat::F16_RN; // gmma descriptor mode - const uint32_t d_bytes_per_group = d_in_bytes / d_groups; + uint32_t const d_bytes_per_group = d_in_bytes / d_groups; cudaTmaDescSwizzle const swizzle_mode = (d_bytes_per_group > 64 ? cudaTmaDescSwizzle::SWIZZLE_128B : (d_bytes_per_group > 32 ? 
cudaTmaDescSwizzle::SWIZZLE_64B : cudaTmaDescSwizzle::SWIZZLE_32B)); diff --git a/cpp/tensorrt_llm/kernels/customAllReduceKernels.cu b/cpp/tensorrt_llm/kernels/customAllReduceKernels.cu index 0f2a514bf..d84188139 100644 --- a/cpp/tensorrt_llm/kernels/customAllReduceKernels.cu +++ b/cpp/tensorrt_llm/kernels/customAllReduceKernels.cu @@ -17,8 +17,11 @@ #include "customAllReduceKernels.h" #include "tensorrt_llm/common/cudaBf16Fallbacks.cuh" #include "tensorrt_llm/common/cudaTypeUtils.cuh" +#include "tensorrt_llm/common/cudaUtils.h" +#include "tensorrt_llm/common/customAllReduceUtils.h" #include "tensorrt_llm/common/dataType.h" #include "tensorrt_llm/common/envUtils.h" +#include #include #include @@ -174,12 +177,6 @@ __inline__ __device__ void block_barrier(uint32_t** signals, uint32_t const flag namespace reduce_fusion { -namespace details -{ -static constexpr int kBytesPerAccess = 16; -static constexpr int kWarpSize = 32; -static constexpr int kMaxCtaSize = 1024; -}; // namespace details inline __device__ float warp_reduce_sum(float val) { @@ -318,7 +315,7 @@ __global__ void rms_norm_kernel(AllReduceParams params) } template -void rms_norm_kernel_launcher(AllReduceParams params, cudaStream_t stream) +void rms_norm_kernel_launcher(AllReduceParams& params, cudaStream_t stream) { static constexpr int kPackedSize = details::kBytesPerAccess / sizeof(T); TLLM_CHECK(params.fusion_params.hidden_size % kPackedSize == 0); @@ -387,6 +384,395 @@ void rms_norm_kernel_launcher(AllReduceParams params, cudaStream_t stream) } } +template +struct NegZero128b +{ + static constexpr int v = static_cast(0x80008000); + static constexpr int4 value = {v, v, v, v}; +}; + +template <> +struct NegZero128b +{ + static constexpr int v = static_cast(0x80000000); + static constexpr int4 value = {v, v, v, v}; +}; + +template +__device__ static constexpr int4 NegZero128b_v = NegZero128b::value; + +template +__device__ __forceinline__ bool is_neg_zero(T& v); + +template <> +__device__ __forceinline__ bool is_neg_zero(float& v) +{ + uint32_t bits = *reinterpret_cast(&v); + return bits == 0x80000000; +} + +template <> +__device__ __forceinline__ bool is_neg_zero(half& v) +{ + uint16_t bits = *reinterpret_cast(&v); + return bits == 0x8000; +} + +template <> +__device__ __forceinline__ bool is_neg_zero<__nv_bfloat16>(__nv_bfloat16& v) +{ + uint16_t bits = *reinterpret_cast(&v); + return bits == 0x8000; +} + +template +__device__ __forceinline__ VecType remove_neg_zero(VecType const& vec) +{ + static constexpr int kIter = sizeof(VecType) / sizeof(ValType); + using ReadOnlyValType = std::add_const_t; + VecType ret; +#pragma unroll + for (int i = 0; i < kIter; ++i) + { + auto val = reinterpret_cast(&vec)[i]; + reinterpret_cast(&ret)[i] = is_neg_zero(val) ? 
static_cast(0.f) : val; + } + return ret; +} + +template +__device__ __forceinline__ bool has_neg_zero(VecType const& vec) +{ + static constexpr int kIter = sizeof(VecType) / sizeof(ValType); + using ReadOnlyValType = std::add_const_t; +#pragma unroll + for (int i = 0; i < kIter; ++i) + { + auto val = reinterpret_cast(&vec)[i]; + if (is_neg_zero(val)) + { + return true; + } + } + return false; +} + +template +__device__ __forceinline__ bool all_neg_zero(VecType const& vec) +{ + static constexpr int kIter = sizeof(VecType) / sizeof(ValType); + using ReadOnlyValType = std::add_const_t; +#pragma unroll + for (int i = 0; i < kIter; ++i) + { + auto val = reinterpret_cast(&vec)[i]; + if (!is_neg_zero(val)) + { + return false; + } + } + return true; +} + +__device__ __forceinline__ void st_global_release(int4 const& val, int4* addr) +{ + asm volatile("st.release.global.sys.v4.b32 [%4], {%0, %1, %2, %3};" ::"r"(val.x), "r"(val.y), "r"(val.z), + "r"(val.w), "l"(addr)); +} + +__device__ __forceinline__ int4 ld_global_acquire(int4* addr) +{ + int4 val; + asm volatile("ld.acquire.global.sys.v4.b32 {%0, %1, %2, %3}, [%4];" + : "=r"(val.x), "=r"(val.y), "=r"(val.z), "=r"(val.w) + : "l"(addr)); + return val; +} + +__device__ __forceinline__ void st_global_volatile(int4 const& val, int4* addr) +{ + asm volatile("st.volatile.global.v4.b32 [%4], {%0, %1, %2, %3};" ::"r"(val.x), "r"(val.y), "r"(val.z), "r"(val.w), + "l"(addr)); +} + +__device__ __forceinline__ int4 ld_global_volatile(int4* addr) +{ + int4 val; + asm volatile("ld.volatile.global.v4.b32 {%0, %1, %2, %3}, [%4];" + : "=r"(val.x), "=r"(val.y), "=r"(val.z), "=r"(val.w) + : "l"(addr)); + return val; +} + +template +__device__ __forceinline__ void set_neg_zero(int4* addr) +{ + st_global_volatile(NegZero128b_v, addr); +} + +template +struct Reducer; + +template +struct Reducer +{ + static __device__ __forceinline__ int4 allreduce(AllReduceParams& params, int global_offset) + { + using PackedStruct = typename PackedOn16Bytes::Type; + int ping = params.barrier_flag % 3; + int pong = (params.barrier_flag + 2) % 3; + T const* local_input_buffer = reinterpret_cast(params.local_input_buffer_ptr); + T* local_shared_buffer = reinterpret_cast( + params.fusion_params.lamport_peer_comm_buffer_ptrs[params.local_rank + ping * MAX_RANKS_PER_NODE]); + T* local_clean_buffer = reinterpret_cast( + params.fusion_params.lamport_peer_comm_buffer_ptrs[params.local_rank + pong * MAX_RANKS_PER_NODE]); + local_input_buffer += global_offset; + local_shared_buffer += global_offset; + local_clean_buffer += global_offset; + T* buffers[RanksPerNode]; +#pragma unroll + for (int ii = 0; ii < RanksPerNode; ++ii) + { + int rank = (params.local_rank + ii) % RanksPerNode; + buffers[ii] = reinterpret_cast( + params.fusion_params.lamport_peer_comm_buffer_ptrs[rank + ping * MAX_RANKS_PER_NODE]) + + global_offset + params.local_rank * params.elts_total; + } + PackedStruct sum_vec, val; + val.packed = remove_neg_zero(*reinterpret_cast(local_input_buffer)); +#pragma unroll + for (int ii = 1; ii < RanksPerNode; ++ii) + { + st_global_volatile(val.packed, reinterpret_cast(buffers[ii])); + } + sum_vec.packed = val.packed; +#pragma unroll + for (int ii = 1; ii < RanksPerNode; ++ii) + { + int rank = (params.local_rank + ii) % RanksPerNode; + set_neg_zero(reinterpret_cast(local_clean_buffer + rank * params.elts_total)); + } + PackedStruct vals[RanksPerNode - 1]; + bool done = false; + while (!done) + { + done = true; +#pragma unroll + for (int ii = 1; ii < RanksPerNode; ++ii) + { + int rank = 
(params.local_rank + ii) % RanksPerNode; + vals[ii - 1].packed + = ld_global_volatile(reinterpret_cast(local_shared_buffer + rank * params.elts_total)); + } +#pragma unroll + for (int ii = 0; ii < RanksPerNode - 1; ii++) + { + done &= !has_neg_zero(vals[ii].packed); + } + } + +#pragma unroll + for (int ii = 1; ii < RanksPerNode; ++ii) + { + sum_vec.packed = add128b(sum_vec, vals[ii - 1]); + } + return sum_vec.packed; + } +}; + +template +struct Reducer +{ + static __device__ __forceinline__ int4 allreduce(AllReduceParams& params, int global_offset) + { + using PackedStruct = typename PackedOn16Bytes::Type; + int ping = params.barrier_flag % 3; + int pong = (params.barrier_flag + 2) % 3; + T const* local_input_buffer = reinterpret_cast(params.local_input_buffer_ptr); + T* local_shared_buffer = reinterpret_cast( + params.fusion_params.lamport_peer_comm_buffer_ptrs[params.local_rank + ping * MAX_RANKS_PER_NODE]); + T* local_clean_buffer = reinterpret_cast( + params.fusion_params.lamport_peer_comm_buffer_ptrs[params.local_rank + pong * MAX_RANKS_PER_NODE]); + local_input_buffer += global_offset; + local_shared_buffer += global_offset; + local_clean_buffer += global_offset; + T* buffers[RanksPerNode]; +#pragma unroll + for (int ii = 0; ii < RanksPerNode; ++ii) + { + int rank = (params.local_rank + ii) % RanksPerNode; + buffers[ii] = reinterpret_cast( + params.fusion_params.lamport_peer_comm_buffer_ptrs[rank + ping * MAX_RANKS_PER_NODE]) + + global_offset; + } + PackedStruct sum_vec, val; + val.packed = remove_neg_zero(*reinterpret_cast(local_input_buffer)); + st_global_volatile(val.packed, reinterpret_cast(local_shared_buffer)); + sum_vec.packed = val.packed; +#pragma unroll + for (int ii = 1; ii < RanksPerNode; ++ii) + { + do + { + val.packed = ld_global_volatile(reinterpret_cast(buffers[ii])); + } while (has_neg_zero(val.packed)); + sum_vec.packed = add128b(sum_vec, val); + } + set_neg_zero(reinterpret_cast(local_clean_buffer)); + return sum_vec.packed; + } +}; + +template +static __global__ void lamport_style_one_shot_all_reduce_norm_kernel(AllReduceParams params) +{ +#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) + namespace cg = cooperative_groups; + static_assert(RanksPerNode <= 8); + static constexpr int kPackedSize = details::kBytesPerAccess / sizeof(T); + using PackedStruct = typename PackedOn16Bytes::Type; + + cg::cluster_group cluster = cg::this_cluster(); + + __shared__ float cluster_acc; + + int bid = blockIdx.x, tid = threadIdx.x; + int cluster_id = bid / ClusterSize, cluster_block_rank = bid % ClusterSize; + + int token_id = cluster_id; + int cluster_offset = token_id * params.fusion_params.hidden_size; + int block_offset = cluster_block_rank * params.fusion_params.hidden_size / ClusterSize; + int thread_offset = tid * kPackedSize; + + int inner_token_offset = block_offset + thread_offset; + int global_offset = cluster_offset + inner_token_offset; + + T const* bias_buffer = reinterpret_cast(params.fusion_params.bias_buffer); + T const* residual_buffer = reinterpret_cast(params.fusion_params.residual_buffer); + T const* weight_buffer = reinterpret_cast(params.fusion_params.weight_buffer); + T* local_final_output_buffer = reinterpret_cast(params.local_output_buffer_ptr); + T* intermediate_buffer = reinterpret_cast(params.fusion_params.intermediate_buffer); + + local_final_output_buffer += global_offset; + intermediate_buffer += global_offset; + residual_buffer += global_offset; + bias_buffer += inner_token_offset; + weight_buffer += inner_token_offset; + + PackedStruct 
weight_vec, bias_vec, residual_vec; + residual_vec.packed = *reinterpret_cast(residual_buffer); + if constexpr (Bias) + { + bias_vec.packed = *reinterpret_cast(bias_buffer); + } + if constexpr (Affine) + { + weight_vec.packed = *reinterpret_cast(weight_buffer); + } + + cudaGridDependencySynchronize(); + + float acc = 0.f; + PackedStruct sum_vec; + sum_vec.packed = Reducer::allreduce(params, global_offset); + + if constexpr (Bias) + { + sum_vec.packed = add128b(sum_vec, bias_vec); + } + sum_vec.packed = add128b(sum_vec, residual_vec); + *reinterpret_cast(intermediate_buffer) = sum_vec.packed; + acc = accumulate(acc, sum_vec); + acc = block_reduce_sum(acc); + if (ClusterSize > 1) + { + if (threadIdx.x == 0) + { + cluster_acc = acc; + } + cluster.sync(); + acc = 0.f; +#pragma unroll + for (int ii = 0; ii < ClusterSize; ++ii) + { + acc += *cluster.map_shared_rank(&cluster_acc, ii); + } + } + + float denom = __fsqrt_rn(__fdividef(acc, params.fusion_params.hidden_size) + params.fusion_params.eps); + sum_vec.packed = rms_norm(denom, sum_vec, weight_vec); + *reinterpret_cast(local_final_output_buffer) = sum_vec.packed; + + cudaTriggerProgrammaticLaunchCompletion(); +#endif +} + +int heuristic_min_warp_number(int tp_size, int hidden_size) +{ + if (hidden_size >= 4096) + { + return 4; + } + if (tp_size == 2) + { + return 32; + } + else + { + return 16; + } +} + +template +void lamport_style_one_shot_all_reduce_norm_kernel_launcher(AllReduceParams params, cudaStream_t stream) +{ + static constexpr int kPackedSize = details::kBytesPerAccess / sizeof(T); + TLLM_CHECK(params.fusion_params.hidden_size % kPackedSize == 0); + int threads_per_token = params.fusion_params.hidden_size / kPackedSize; + int warps_per_token = (threads_per_token + details::kWarpSize - 1) / details::kWarpSize; + int token_num = params.elts_total / params.fusion_params.hidden_size; + int warp_min_number = heuristic_min_warp_number(RanksPerNode, params.fusion_params.hidden_size); + int cluster_size = std::min(((warps_per_token + warp_min_number - 1) / warp_min_number), details::kClusterMaxSize); + int cta_size = warps_per_token / cluster_size * details::kWarpSize; + TLLM_CHECK(cta_size <= details::kMaxCtaSize); + int cta_num = token_num * cluster_size; + cudaLaunchConfig_t kernel_config = {0}; + kernel_config.gridDim = cta_num; + kernel_config.blockDim = cta_size; + kernel_config.dynamicSmemBytes = 0; + kernel_config.stream = stream; + + cudaLaunchAttribute attribute[2]; + attribute[0].id = cudaLaunchAttributeClusterDimension; + attribute[0].val.clusterDim.x = cluster_size; + attribute[0].val.clusterDim.y = 1; + attribute[0].val.clusterDim.z = 1; + kernel_config.attrs = attribute; + kernel_config.numAttrs = 1; + if (tensorrt_llm::common::getEnvEnablePDL()) + { + attribute[1].id = cudaLaunchAttributeProgrammaticStreamSerialization; + attribute[1].val.programmaticStreamSerializationAllowed = 1; + kernel_config.numAttrs++; + } +#define LAUNCH_LAMPORT_KERNEL(CLUSTER_SIZE) \ + if (cluster_size == CLUSTER_SIZE) \ + { \ + TLLM_CUDA_CHECK(cudaLaunchKernelEx(&kernel_config, \ + lamport_style_one_shot_all_reduce_norm_kernel, params)); \ + return; \ + } + LAUNCH_LAMPORT_KERNEL(1); + LAUNCH_LAMPORT_KERNEL(2); + LAUNCH_LAMPORT_KERNEL(3); + LAUNCH_LAMPORT_KERNEL(4); + LAUNCH_LAMPORT_KERNEL(5); + LAUNCH_LAMPORT_KERNEL(6); + LAUNCH_LAMPORT_KERNEL(7); + LAUNCH_LAMPORT_KERNEL(8); +#undef LAUNCH_LAMPORT_KERNEL +} + template static __global__ void __launch_bounds__(1024, 1) one_shot_all_reduce_norm_kernel(AllReduceParams params) { @@ -495,80 +881,145 
@@ static __global__ void __launch_bounds__(1024, 1) one_shot_all_reduce_norm_kerne #endif } -template -void one_shot_all_reduce_norm_kernel_launcher(AllReduceParams params, cudaStream_t stream) +template +bool is_lamport_supported(int token_num) { - static constexpr int kPackedSize = details::kBytesPerAccess / sizeof(T); - TLLM_CHECK(params.fusion_params.hidden_size % kPackedSize == 0); - int need_threads = params.fusion_params.hidden_size / kPackedSize; - int cta_size; - if (need_threads <= details::kMaxCtaSize) + static char* disableLamportReduceNormFusionChar = std::getenv("DISABLE_LAMPORT_REDUCE_NORM_FUSION"); + bool disableLamportReduceNormFusion = (disableLamportReduceNormFusionChar != nullptr); + if (disableLamportReduceNormFusion) + return false; + static int sm = tensorrt_llm::common::getSMVersion(); + if (sm < 90) { - cta_size = (need_threads + details::kWarpSize - 1) / details::kWarpSize * details::kWarpSize; + return false; } - else + if (!std::is_same_v && !std::is_same_v) { - cta_size = details::kMaxCtaSize; + return false; } - int norm_num = params.elts_total / params.fusion_params.hidden_size; - int cta_num = std::min(norm_num, static_cast(MAX_ALL_REDUCE_BLOCKS)); - int smem_size = 0; - - if (cta_size * kPackedSize < params.fusion_params.hidden_size) + if (token_num > details::kLamportTokenNumThreshold) { - smem_size = params.fusion_params.hidden_size * sizeof(T); - if (tensorrt_llm::common::getEnvEnablePDL()) - { - TLLM_LOG_DEBUG("Enable PDL in one_shot_all_reduce_norm_kernel"); - - cudaLaunchConfig_t kernelConfig = {0}; - kernelConfig.gridDim = cta_num; - kernelConfig.blockDim = cta_size; - kernelConfig.dynamicSmemBytes = smem_size; - kernelConfig.stream = stream; + return false; + } + return true; +} - cudaLaunchAttribute attribute[1]; - attribute[0].id = cudaLaunchAttributeProgrammaticStreamSerialization; - attribute[0].val.programmaticStreamSerializationAllowed = 1; - kernelConfig.attrs = attribute; - kernelConfig.numAttrs = 1; +bool is_lamport_supported(nvinfer1::DataType dataType, int token_num) +{ + switch (dataType) + { + case nvinfer1::DataType::kFLOAT: return is_lamport_supported(token_num); + case nvinfer1::DataType::kHALF: return is_lamport_supported(token_num); +#ifdef ENABLE_BF16 + case nvinfer1::DataType::kBF16: return is_lamport_supported<__nv_bfloat16>(token_num); +#endif + default: return false; + } +} - TLLM_CUDA_CHECK(cudaLaunchKernelEx( - &kernelConfig, one_shot_all_reduce_norm_kernel, params)); - } - else - { - one_shot_all_reduce_norm_kernel - <<>>(params); - } +template +void one_shot_all_reduce_norm_kernel_launcher(AllReduceParams& params, cudaStream_t stream) +{ + int token_num = params.elts_total / params.fusion_params.hidden_size; + if (is_lamport_supported(token_num)) + { + lamport_style_one_shot_all_reduce_norm_kernel_launcher(params, stream); } else { - if (tensorrt_llm::common::getEnvEnablePDL()) + static constexpr int kPackedSize = details::kBytesPerAccess / sizeof(T); + TLLM_CHECK(params.fusion_params.hidden_size % kPackedSize == 0); + int need_threads = params.fusion_params.hidden_size / kPackedSize; + int cta_size; + if (need_threads <= details::kMaxCtaSize) { - cudaLaunchConfig_t kernelConfig = {0}; - kernelConfig.gridDim = cta_num; - kernelConfig.blockDim = cta_size; - kernelConfig.dynamicSmemBytes = smem_size; - kernelConfig.stream = stream; - - cudaLaunchAttribute attribute[1]; - attribute[0].id = cudaLaunchAttributeProgrammaticStreamSerialization; - attribute[0].val.programmaticStreamSerializationAllowed = 1; - kernelConfig.attrs = 
attribute; - kernelConfig.numAttrs = 1; + cta_size = (need_threads + details::kWarpSize - 1) / details::kWarpSize * details::kWarpSize; + } + else + { + cta_size = details::kMaxCtaSize; + } + int norm_num = params.elts_total / params.fusion_params.hidden_size; + int cta_num = std::min(norm_num, static_cast(MAX_ALL_REDUCE_BLOCKS)); + int smem_size = 0; - TLLM_LOG_DEBUG("Enable PDL in one_shot_all_reduce_norm_kernel"); - TLLM_CUDA_CHECK(cudaLaunchKernelEx( - &kernelConfig, one_shot_all_reduce_norm_kernel, params)); + if (cta_size * kPackedSize < params.fusion_params.hidden_size) + { + smem_size = params.fusion_params.hidden_size * sizeof(T); + if (tensorrt_llm::common::getEnvEnablePDL()) + { + TLLM_LOG_DEBUG("Enable PDL in one_shot_all_reduce_norm_kernel"); + + cudaLaunchConfig_t kernelConfig = {0}; + kernelConfig.gridDim = cta_num; + kernelConfig.blockDim = cta_size; + kernelConfig.dynamicSmemBytes = smem_size; + kernelConfig.stream = stream; + + cudaLaunchAttribute attribute[1]; + attribute[0].id = cudaLaunchAttributeProgrammaticStreamSerialization; + attribute[0].val.programmaticStreamSerializationAllowed = 1; + kernelConfig.attrs = attribute; + kernelConfig.numAttrs = 1; + + TLLM_CUDA_CHECK(cudaLaunchKernelEx( + &kernelConfig, one_shot_all_reduce_norm_kernel, params)); + } + else + { + one_shot_all_reduce_norm_kernel + <<>>(params); + } } else { - one_shot_all_reduce_norm_kernel - <<>>(params); + if (tensorrt_llm::common::getEnvEnablePDL()) + { + cudaLaunchConfig_t kernelConfig = {0}; + kernelConfig.gridDim = cta_num; + kernelConfig.blockDim = cta_size; + kernelConfig.dynamicSmemBytes = smem_size; + kernelConfig.stream = stream; + + cudaLaunchAttribute attribute[1]; + attribute[0].id = cudaLaunchAttributeProgrammaticStreamSerialization; + attribute[0].val.programmaticStreamSerializationAllowed = 1; + kernelConfig.attrs = attribute; + kernelConfig.numAttrs = 1; + + TLLM_LOG_DEBUG("Enable PDL in one_shot_all_reduce_norm_kernel"); + TLLM_CUDA_CHECK(cudaLaunchKernelEx( + &kernelConfig, one_shot_all_reduce_norm_kernel, params)); + } + else + { + one_shot_all_reduce_norm_kernel + <<>>(params); + } } } } + +template +__global__ void lamport_initialize_kernel(T* buffer, size_t size) +{ + static constexpr int kPackedSize = details::kBytesPerAccess / sizeof(T); + using PackedStruct = typename PackedOn16Bytes::Type; + for (size_t offset = (blockIdx.x * blockDim.x + threadIdx.x) * kPackedSize; offset < size; + offset += gridDim.x * blockDim.x * kPackedSize) + { + set_neg_zero(reinterpret_cast(&buffer[offset])); + } +} + +template +void lamport_initialize_kernel_launcher(void* buffer, size_t size, cudaStream_t stream) +{ + static constexpr int kPackedSize = details::kBytesPerAccess / sizeof(T); + int block_size = 1024; + int grid_size = (size + 1024 * kPackedSize - 1) / (1024 * kPackedSize); + lamport_initialize_kernel<<>>(reinterpret_cast(buffer), size); +} }; // namespace reduce_fusion template @@ -1117,13 +1568,24 @@ void AllReduceDispatchType(AllReduceParams& params, AllReduceStrategyType strat, } } -AllReduceParams AllReduceParams::deserialize(int64_t* buffer, size_t tpSize, size_t tpRank) +AllReduceParams AllReduceParams::deserialize( + int64_t* buffer, size_t tpSize, size_t tpRank, nvinfer1::DataType dataType, int token_num, AllReduceFusionOp op) { void* const* buffer_ptrs = reinterpret_cast(buffer); - auto const flag_ptr = &buffer[4 * tpSize]; + int flag_offset; + if (op == AllReduceFusionOp::RESIDUAL_RMS_NORM && reduce_fusion::is_lamport_supported(dataType, token_num)) + { + flag_offset = 0; 
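+ // Lamport-capable fused ops take barrier-flag slot 0; all other cases fall through to slot 1 below, presumably so the two code paths keep independent counters (the chosen offset is applied when computing flag_ptr).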
+ } + else + { + flag_offset = 1; + } + auto const flag_ptr + = &buffer[tensorrt_llm::utils::customAllReduceUtils::NUM_POINTERS_PER_RANK * tpSize + flag_offset]; // cannot use 0 since 0 represents released state for barrier *flag_ptr += 1; - TLLM_LOG_TRACE("AllReduceParams's flag value is %d", *flag_ptr); + TLLM_LOG_TRACE("AllReduceParams's flag value is %d, flag offset %d", *flag_ptr, flag_offset); uint32_t flag_value = *flag_ptr; AllReduceParams params; // Even plugins use ping buffers, odd plugins use pong. @@ -1208,4 +1670,25 @@ void residualRmsNorm(kernels::AllReduceParams& params, nvinfer1::DataType dataTy sync_check_cuda_error(); } +void lamportInitialize(void* buffer, size_t size, nvinfer1::DataType dataType, cudaStream_t stream) +{ + sync_check_cuda_error(); + switch (dataType) + { + case nvinfer1::DataType::kFLOAT: + reduce_fusion::lamport_initialize_kernel_launcher(buffer, size, stream); + break; + case nvinfer1::DataType::kHALF: + reduce_fusion::lamport_initialize_kernel_launcher(buffer, size, stream); + break; +#ifdef ENABLE_BF16 + case nvinfer1::DataType::kBF16: + reduce_fusion::lamport_initialize_kernel_launcher<__nv_bfloat16>(buffer, size, stream); + break; +#endif + default: TLLM_THROW("Unsupported dataType for customAllReduce"); + } + sync_check_cuda_error(); +} + } // namespace tensorrt_llm::kernels diff --git a/cpp/tensorrt_llm/kernels/customAllReduceKernels.h b/cpp/tensorrt_llm/kernels/customAllReduceKernels.h index ebe6b8795..6a67ba13e 100644 --- a/cpp/tensorrt_llm/kernels/customAllReduceKernels.h +++ b/cpp/tensorrt_llm/kernels/customAllReduceKernels.h @@ -31,6 +31,15 @@ constexpr size_t MAX_ALL_REDUCE_BLOCKS = 24; constexpr size_t MAX_RANKS_PER_NODE = 8; constexpr size_t DEFAULT_BLOCK_SIZE = 512; +namespace reduce_fusion::details +{ +static constexpr int kBytesPerAccess = 16; +static constexpr int kWarpSize = 32; +static constexpr int kMaxCtaSize = 1024; +static constexpr int kClusterMaxSize = 8; +static constexpr int kLamportTokenNumThreshold = 16; +}; // namespace reduce_fusion::details + // Warning: python definition is in tensorrt_llm/functional.py // they must be kept in sync enum class AllReduceStrategyType : int8_t @@ -73,6 +82,7 @@ struct AllReduceFusionParams float eps; // new residual void* intermediate_buffer; + void* lamport_peer_comm_buffer_ptrs[MAX_RANKS_PER_NODE * 3]; }; struct AllReduceParams @@ -81,7 +91,8 @@ struct AllReduceParams size_t elts_per_rank; size_t elts_per_block; size_t rank_offset; - size_t ranks_per_node, local_rank; + size_t ranks_per_node; + size_t local_rank; uint32_t barrier_flag; uint32_t* peer_barrier_ptrs_in[MAX_RANKS_PER_NODE]; uint32_t* peer_barrier_ptrs_out[MAX_RANKS_PER_NODE]; @@ -91,7 +102,8 @@ struct AllReduceParams AllReduceFusionParams fusion_params; - static AllReduceParams deserialize(int64_t* buffer, size_t tpSize, size_t tpRank); + static AllReduceParams deserialize(int64_t* buffer, size_t tpSize, size_t tpRank, nvinfer1::DataType dataType, + int token_num, AllReduceFusionOp op); }; bool configurationSupported(AllReduceStrategyType algo, size_t msg_size, size_t n_ranks, nvinfer1::DataType type); @@ -101,4 +113,6 @@ void customAllReduce(kernels::AllReduceParams& params, nvinfer1::DataType dataTy void residualRmsNorm(kernels::AllReduceParams& params, nvinfer1::DataType dataType, cudaStream_t stream); +void lamportInitialize(void* buffer, size_t size, nvinfer1::DataType dataType, cudaStream_t stream); + } // namespace tensorrt_llm::kernels diff --git 
a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/fused_moe_gemm_launcher_sm80.inl b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/fused_moe_gemm_launcher_sm80.inl index 1a0f6bc65..126e761ec 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/fused_moe_gemm_launcher_sm80.inl +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/fused_moe_gemm_launcher_sm80.inl @@ -74,7 +74,6 @@ void sm80_generic_fused_moe_gemm_kernelLauncher(ElementType_ const* A, CutlassWe int occupancy = std::min(2, fused_moe::fused_gemm_maximum_active_blocks()); int const threadblock_count = multi_processor_count * occupancy; TLLM_CHECK_WITH_INFO(occupancy > 0, "GPU lacks the shared memory resources to run fused_moe kernel"); - GemmType gemm; using Arguments = typename GemmType::Arguments; Arguments args{{const_cast(A), const_cast(B), const_cast(biases), reinterpret_cast(C), total_tokens_including_expert, static_cast(gemm_n), diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_template.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_template.h index 3a1b83c8c..5670b61ba 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_template.h +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_template.h @@ -559,8 +559,9 @@ template ::supportsFusedGatedActivation( bool is_gated_activation, int gemm_n, int gemm_k) const { + constexpr bool ENABLE_FUSED_GATED_ACTIVATION = false; // TODO There is a bug that causes non-determinism return is_gated_activation && std::is_same_v && !std::is_same_v && !use_fp8 - && (this->getSM() >= 80) && (gemm_k % 64 == 0) && (gemm_n % 64 == 0); + && (this->getSM() >= 80) && (gemm_k % 64 == 0) && (gemm_n % 64 == 0) && ENABLE_FUSED_GATED_ACTIVATION; } template diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/aarch64-linux-gnu/libtensorrt_llm_nvrtc_wrapper.so b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/aarch64-linux-gnu/libtensorrt_llm_nvrtc_wrapper.so index 9149cf698..cdc18a4c7 100755 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/aarch64-linux-gnu/libtensorrt_llm_nvrtc_wrapper.so +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/aarch64-linux-gnu/libtensorrt_llm_nvrtc_wrapper.so @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:38b241619f08ee636eee1d00a91d2fa2fc8a70f4afe1e12d01b180e6adeef7aa -size 81578928 +oid sha256:e73dd3a8859cd67c62ab89a98381028bd20ac9e756f0346bbbaab0fb6c566eb7 +size 81578760 diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/aarch64-linux-gnu/version.txt b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/aarch64-linux-gnu/version.txt index d8f31a5f9..11bd98565 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/aarch64-linux-gnu/version.txt +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/aarch64-linux-gnu/version.txt @@ -1,2 +1,2 @@ -4a08f099886e0595057a20115658be51 libtensorrt_llm_nvrtc_wrapper.so -867c9da34e774ba1b0cdcd0a7d153a687b8e8dc6 commit \ No newline at end of file +88c30973b9b3452baa3f063d34d08169 libtensorrt_llm_nvrtc_wrapper.so +cef1070ccdde579844de64f2a2bb8099bc3e5f02 commit \ No newline at end of file diff --git 
a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-linux-gnu/libtensorrt_llm_nvrtc_wrapper.so b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-linux-gnu/libtensorrt_llm_nvrtc_wrapper.so index 8977a08e5..d957c0b85 100755 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-linux-gnu/libtensorrt_llm_nvrtc_wrapper.so +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-linux-gnu/libtensorrt_llm_nvrtc_wrapper.so @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:28e39c448442c950d41012ad964057d84c8afc51aa116bbee17ccacd76b43e9f +oid sha256:c11e0550552f4cc3568ac11de47079d5c6bd88aeb34ebbd52b39f4f732afbd7d size 84839528 diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-linux-gnu/version.txt b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-linux-gnu/version.txt index 14159937e..b1b08b1f3 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-linux-gnu/version.txt +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-linux-gnu/version.txt @@ -1,2 +1,2 @@ -4a873e8722270fed4a2e6a60c59aec27 libtensorrt_llm_nvrtc_wrapper.so -867c9da34e774ba1b0cdcd0a7d153a687b8e8dc6 commit \ No newline at end of file +95e9f87610383348e444d2d0b8396f2d libtensorrt_llm_nvrtc_wrapper.so +cef1070ccdde579844de64f2a2bb8099bc3e5f02 commit \ No newline at end of file diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-windows-msvc/tensorrt_llm_nvrtc_wrapper.dll b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-windows-msvc/tensorrt_llm_nvrtc_wrapper.dll index 346481e68..a68f72234 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-windows-msvc/tensorrt_llm_nvrtc_wrapper.dll +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-windows-msvc/tensorrt_llm_nvrtc_wrapper.dll @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:534c4295300d7928a7949884c2791784a33fa46d1d93be567d81736f6d5dfb03 +oid sha256:1de32a25a27c7f5205f2e95452a48e65cae3311f2e5c087881e7fd2278c3bd77 size 1128448 diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-windows-msvc/tensorrt_llm_nvrtc_wrapper.lib b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-windows-msvc/tensorrt_llm_nvrtc_wrapper.lib index cfe4399d6..eb4782449 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-windows-msvc/tensorrt_llm_nvrtc_wrapper.lib +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-windows-msvc/tensorrt_llm_nvrtc_wrapper.lib @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e207a8f57b944529163c7ed2ab30639a5f2779c5118602c6ebd50a623d16f845 +oid sha256:1a6c03470aaa69378d4989971ab9dd00ee427f7e14a85ba5e114ea0594c4de5e size 3488 diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-windows-msvc/version.txt 
b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-windows-msvc/version.txt index bc3bc70c5..2e3885d57 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-windows-msvc/version.txt +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-windows-msvc/version.txt @@ -1,3 +1,3 @@ -b7e624ba775e9f5090ef4b67bcdbd7a2 tensorrt_llm_nvrtc_wrapper.lib -ae3e4d6dd528f376dc29840ca316ab08 tensorrt_llm_nvrtc_wrapper.dll -867c9da34e774ba1b0cdcd0a7d153a687b8e8dc6 commit \ No newline at end of file +c5f36e093e875c8ea84523fb1566d986 tensorrt_llm_nvrtc_wrapper.lib +e6af3699a00052b3f151052e7fb6c9a4 tensorrt_llm_nvrtc_wrapper.dll +cef1070ccdde579844de64f2a2bb8099bc3e5f02 commit \ No newline at end of file diff --git a/cpp/tensorrt_llm/kernels/decodingKernels.cu b/cpp/tensorrt_llm/kernels/decodingKernels.cu index d3e2cbce6..1de9b04be 100644 --- a/cpp/tensorrt_llm/kernels/decodingKernels.cu +++ b/cpp/tensorrt_llm/kernels/decodingKernels.cu @@ -711,4 +711,95 @@ void invokeTransposeLogProbs(float* outputLogProbs, float* outputLogProbsTiled, } } // namespace kernels + +namespace runtime::kernels +{ +// Must be similar to [cpp/tensorrt_llm/thop/gatherTreeOp.cpp] gatherTree +void gatherTree(DecodingOutput const& decodingOutput, DecodingInput const& decodingInput, BufferManager const& manager, + SamplingConfig const& samplingConfig) +{ + TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); + auto& finalOutputIds = *decodingOutput.gatheredIds; + auto const& finalOutputIdsShape = finalOutputIds.getShape(); + auto const& decodingOutputIdsShape = decodingOutput.ids->getShape(); + auto const batchSize = finalOutputIdsShape.d[0]; + auto const beamWidth = finalOutputIdsShape.d[1]; + auto const maxSeqLength = finalOutputIdsShape.d[2]; + + TLLM_CHECK_WITH_INFO(beamWidth > 1, "gatherTree is only needed for beam search."); + + TLLM_CHECK_WITH_INFO(decodingOutputIdsShape.d[0] == batchSize, + common::fmtstr("Decoder batch size (" FMT_DIM ") does not match final batch size (" FMT_DIM ")", + decodingOutputIdsShape.d[0], batchSize)); + TLLM_CHECK_WITH_INFO(decodingOutputIdsShape.d[1] == beamWidth, + common::fmtstr("Decoder beam width (" FMT_DIM ") does not match final beam width (" FMT_DIM ")", + decodingOutputIdsShape.d[1], beamWidth)); + TLLM_CHECK_WITH_INFO(decodingOutputIdsShape.d[2] <= maxSeqLength, + common::fmtstr("Decoder seq length size (" FMT_DIM ") is too large for final seq length (" FMT_DIM ")", + decodingOutputIdsShape.d[2], maxSeqLength)); + + auto const& stream = manager.getStream().get(); + + // prefill finalOutputIds with the EOS tokens from decodingInput.endIds + tensorrt_llm::kernels::invokeInitializeOutput(bufferCast(finalOutputIds), + bufferCast(*decodingInput.endIds), batchSize * beamWidth, maxSeqLength, stream); + sync_check_cuda_error(); + + std::vector lengthPenaltyVec; + auto lengthPenaltyPtr = std::shared_ptr(manager.gpu(ITensor::makeShape({batchSize}), TRTDataType::value)); + if (!samplingConfig.lengthPenalty.has_value() || samplingConfig.lengthPenalty.value().size() == 0) + { + lengthPenaltyVec = std::vector(batchSize, 1.0f); + } + else if (long int const size = samplingConfig.lengthPenalty.value().size(); size == 1) + { + lengthPenaltyVec = std::vector(batchSize, samplingConfig.lengthPenalty.value()[0]); + } + else + { + TLLM_CHECK_WITH_INFO(size == batchSize, + common::fmtstr("Size of lengthPenalty in SamplingConfig (" FMT_DIM ") is different from batchSize (" 
FMT_DIM + ")", size, batchSize)); + lengthPenaltyVec = samplingConfig.lengthPenalty.value(); + } + + lengthPenaltyPtr = manager.copyFrom(lengthPenaltyVec, ITensor::makeShape({batchSize}), runtime::MemoryType::kGPU); + + tensorrt_llm::kernels::BeamHypotheses bh; + bh.nMaxBatchSize = batchSize; + bh.nBatchSize = batchSize; + bh.nBeamWidth = beamWidth; + bh.nMaxSeqLen = maxSeqLength; + bh.lengthPenalties = bufferCast(*lengthPenaltyPtr); + bh.inputLengths = bufferCast(*decodingInput.lengths); + bh.outputIds = bufferCast(finalOutputIds); + bh.logProbs = bufferCastOrNull(decodingOutput.logProbs); + bh.logProbsTiled = bufferCast(*decodingOutput.logProbsTiled); + bh.sequenceLengths = bufferCast(*decodingOutput.lengths); + bh.cumLogProbs = bufferCast(*decodingOutput.cumLogProbs); + bh.outputIdsCBA = bufferCast(*decodingOutput.beamHypotheses.outputIdsCBA); + bh.logProbsCBA = bufferCast(*decodingOutput.beamHypotheses.logProbsCBA); + bh.sequenceLengthsCBA = bufferCast(*decodingOutput.beamHypotheses.sequenceLengthsCBA); + bh.cumLogProbsCBA = bufferCast(*decodingOutput.beamHypotheses.cumLogProbsCBA); + bh.normedScoresCBA = bufferCast(*decodingOutput.beamHypotheses.normedScoresCBA); + bh.numBeamsCBA = bufferCast(*decodingOutput.beamHypotheses.numBeamsCBA); + bh.minNormedScoresCBA = bufferCast(*decodingOutput.beamHypotheses.minNormedScoresCBA); + bh.batchDones = bufferCast(*decodingOutput.beamHypotheses.batchDones); + bh.finished = bufferCast(*decodingOutput.finishReasons); + bh.outputIdsUnfinish = bufferCast(*decodingOutput.ids); + bh.parentIdsUnfinish = bufferCast(*decodingOutput.parentIds); + + // This is where transpose is done + tensorrt_llm::kernels::invokeInsertUnfinishedPath(bh, stream); + sync_check_cuda_error(); + + tensorrt_llm::kernels::invokeFinalize(bh, stream); + sync_check_cuda_error(); + + TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); +} + +} // namespace runtime::kernels + } // namespace tensorrt_llm diff --git a/cpp/tensorrt_llm/kernels/decodingKernels.h b/cpp/tensorrt_llm/kernels/decodingKernels.h index 6fdcc9056..73e97930d 100644 --- a/cpp/tensorrt_llm/kernels/decodingKernels.h +++ b/cpp/tensorrt_llm/kernels/decodingKernels.h @@ -20,7 +20,9 @@ #include "tensorrt_llm/kernels/beamSearchKernels.h" #include "tensorrt_llm/kernels/decodingCommon.h" #include "tensorrt_llm/runtime/common.h" +#include "tensorrt_llm/runtime/decodingInput.h" #include "tensorrt_llm/runtime/decodingOutput.h" +#include "tensorrt_llm/runtime/samplingConfig.h" #include #include #include @@ -116,4 +118,22 @@ void invokeTransposeLogProbs(float* output_log_probs, float* output_log_probs_ti cudaStream_t stream); } // namespace kernels + +namespace runtime::kernels +{ +//! \brief Inserts the running beams into the finished beams stored in the CBA buffers. (beams where the most likely +//! continuation is the end token get stored separately, and another candidate next token is stored). Then sorts the +//! beams according to their cumulative log probs. Note: the kernels in gatherTree modify the buffers inplace. When +//! streaming, we use tmp buffers since beam search kernels expect ungathered data. +//! +//! \param decodingOutput contains a slice of the output buffers to gather. Also contains the +//! DecodingOutput::BeamHypotheses object with the finished beams. +//! \param decodingInput used for endIds and input lengths. +//! \param manager the usual buffer manager. +//! \param samplingConfig the usual sampling config.
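+//! Example (sketch only; assumes decodingOutput, decodingInput, manager and samplingConfig have already been set up by the decoder): +//! \code +//! tensorrt_llm::runtime::kernels::gatherTree(decodingOutput, decodingInput, manager, samplingConfig); +//! \endcode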
+ +void gatherTree(DecodingOutput const& decodingOutput, DecodingInput const& decodingInput, BufferManager const& manager, + SamplingConfig const& samplingConfig); +} // namespace runtime::kernels + } // namespace tensorrt_llm diff --git a/cpp/tensorrt_llm/kernels/gptKernels.cu b/cpp/tensorrt_llm/kernels/gptKernels.cu index 30ce90e0a..ae4c9d895 100644 --- a/cpp/tensorrt_llm/kernels/gptKernels.cu +++ b/cpp/tensorrt_llm/kernels/gptKernels.cu @@ -228,7 +228,7 @@ __global__ __launch_bounds__(THREADS_PER_BLOCK) void computeSeqAndPaddingOffsets } } - // Perpare values for fmha. + // Prepare values for fmha. if (threadIdx.x == 0 && blockIdx.x == 0) { // Reset fmha tile counter to 0 before launching fmha kernels. diff --git a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/aarch64-linux-gnu/libtensorrt_llm_internal_cutlass_kernels_static.a b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/aarch64-linux-gnu/libtensorrt_llm_internal_cutlass_kernels_static.a index da5714a59..75170e0be 100644 --- a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/aarch64-linux-gnu/libtensorrt_llm_internal_cutlass_kernels_static.a +++ b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/aarch64-linux-gnu/libtensorrt_llm_internal_cutlass_kernels_static.a @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a66f773268e75b4cdda1de30e52a178d0d76ff4e0cce460272fae6da81a53715 +oid sha256:b441dc3c4773e25088812fd91cfb4974f0966291b4801c1f2f364a31669711cd size 25364090 diff --git a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/aarch64-linux-gnu/libtensorrt_llm_internal_cutlass_kernels_static.pre_cxx11.a b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/aarch64-linux-gnu/libtensorrt_llm_internal_cutlass_kernels_static.pre_cxx11.a index 2a59e4e6c..7f19a0ec7 100644 --- a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/aarch64-linux-gnu/libtensorrt_llm_internal_cutlass_kernels_static.pre_cxx11.a +++ b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/aarch64-linux-gnu/libtensorrt_llm_internal_cutlass_kernels_static.pre_cxx11.a @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a04d66310a3c702bd2def83e26739164a72a22d9bffaef8749870e3c7c4b7be6 +oid sha256:8615ba1c4f8cf243e638765094ed399e694b87475bb3eb2e30b07fd304ede5f4 size 25768990 diff --git a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/aarch64-linux-gnu/version.txt b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/aarch64-linux-gnu/version.txt index b81f3cf8f..bc334fbaa 100644 --- a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/aarch64-linux-gnu/version.txt +++ b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/aarch64-linux-gnu/version.txt @@ -1,3 +1,3 @@ -952c054f0ae95852960c6dcd19a6a727 libtensorrt_llm_internal_cutlass_kernels_static.a -629c1305cc5fdf8fdd61bb5d983b0ce1 libtensorrt_llm_internal_cutlass_kernels_static.pre_cxx11.a -867c9da34e774ba1b0cdcd0a7d153a687b8e8dc6 commit \ No newline at end of file +4a9af041741dacb8179f6ea7f429b1e9 libtensorrt_llm_internal_cutlass_kernels_static.a +f2a0bbcb400bd9ce12958ff6b418ca0f libtensorrt_llm_internal_cutlass_kernels_static.pre_cxx11.a +cef1070ccdde579844de64f2a2bb8099bc3e5f02 commit \ No newline at end of file diff --git a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-linux-gnu/libtensorrt_llm_internal_cutlass_kernels_static.a b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-linux-gnu/libtensorrt_llm_internal_cutlass_kernels_static.a index 0a07e42d6..a470bc574 100644 --- 
a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-linux-gnu/libtensorrt_llm_internal_cutlass_kernels_static.a +++ b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-linux-gnu/libtensorrt_llm_internal_cutlass_kernels_static.a @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a7b984e24b4e8f2204cd8f9b0b96a9c7fe12a890cee9a5c9a43163fed9a397f2 +oid sha256:1ca9cb8ec2f0f7ddb3604d624526ff01ddb3371379f28e4ad94c278fe14b7ebe size 44173728 diff --git a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-linux-gnu/libtensorrt_llm_internal_cutlass_kernels_static.pre_cxx11.a b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-linux-gnu/libtensorrt_llm_internal_cutlass_kernels_static.pre_cxx11.a index f1e9971d3..ec0ba15f1 100644 --- a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-linux-gnu/libtensorrt_llm_internal_cutlass_kernels_static.pre_cxx11.a +++ b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-linux-gnu/libtensorrt_llm_internal_cutlass_kernels_static.pre_cxx11.a @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:89799c5ec2c00651f46aa2673be8d19523e5d35bd810dba48a67650419a98ec8 +oid sha256:39cb4a742fcdc2005d847359870fab18a2b7bd1a22cd05c12a4959eb531a55d8 size 43561142 diff --git a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-linux-gnu/version.txt b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-linux-gnu/version.txt index 4e98dd55b..cd208d12f 100644 --- a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-linux-gnu/version.txt +++ b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-linux-gnu/version.txt @@ -1,3 +1,3 @@ -274d4e4f87e1bce533ebc6e8298ca28c libtensorrt_llm_internal_cutlass_kernels_static.a -866bd2e84276b3ccef0459f53879d74a libtensorrt_llm_internal_cutlass_kernels_static.pre_cxx11.a -867c9da34e774ba1b0cdcd0a7d153a687b8e8dc6 commit \ No newline at end of file +f18fb02389d90717633943adf485d89c libtensorrt_llm_internal_cutlass_kernels_static.a +ab42a2079f7314a5a6f46ad7ffa454cc libtensorrt_llm_internal_cutlass_kernels_static.pre_cxx11.a +cef1070ccdde579844de64f2a2bb8099bc3e5f02 commit \ No newline at end of file diff --git a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-windows-msvc/tensorrt_llm_internal_cutlass_kernels_static.lib b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-windows-msvc/tensorrt_llm_internal_cutlass_kernels_static.lib index f3d5dc691..772d87c3c 100644 --- a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-windows-msvc/tensorrt_llm_internal_cutlass_kernels_static.lib +++ b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-windows-msvc/tensorrt_llm_internal_cutlass_kernels_static.lib @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6683b5ae21f2577f61b35d3d7e7d02623d8436279efdd12aa2251f5526d096e9 +oid sha256:597418556c9efeba5dff0b4f86cc4f41454d6cb6457c0cbc57232d47c9dc9ec4 size 88140804 diff --git a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-windows-msvc/version.txt b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-windows-msvc/version.txt index 5e799c7cb..f26dce6b4 100644 --- a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-windows-msvc/version.txt +++ b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-windows-msvc/version.txt @@ -1,2 +1,2 @@ -f207648d1d9f1c1a4c6c6753a1032c9c tensorrt_llm_internal_cutlass_kernels_static.lib -867c9da34e774ba1b0cdcd0a7d153a687b8e8dc6 commit \ No newline at end of file +5d0b1c092159da211bac1c7335897487 
tensorrt_llm_internal_cutlass_kernels_static.lib +cef1070ccdde579844de64f2a2bb8099bc3e5f02 commit \ No newline at end of file diff --git a/cpp/tensorrt_llm/kernels/mixtureOfExperts/moe_kernels.cu b/cpp/tensorrt_llm/kernels/mixtureOfExperts/moe_kernels.cu index bfde5bae3..05cccf03d 100644 --- a/cpp/tensorrt_llm/kernels/mixtureOfExperts/moe_kernels.cu +++ b/cpp/tensorrt_llm/kernels/mixtureOfExperts/moe_kernels.cu @@ -1458,7 +1458,7 @@ std::vector CutlassMoeFCRunner cons kernel = airTopPSampling; } - kernel<<>>(counters, histograms, countHistograms, params.outputIds, + kernel<<>>(counters, histograms, countHistograms, params.outputIdsPtrs, params.sequenceLength, params.finishedInput, params.finishedOutput, params.cumLogProbs, params.outputLogProbs, params.endIds, params.maxBatchSize, params.skipDecode, pass, buf1, idxBuf1, buf2, idxBuf2, params.batchSlots); diff --git a/cpp/tensorrt_llm/kernels/samplingTopKKernels.cu b/cpp/tensorrt_llm/kernels/samplingTopKKernels.cu index 04edc841a..c766e6da2 100644 --- a/cpp/tensorrt_llm/kernels/samplingTopKKernels.cu +++ b/cpp/tensorrt_llm/kernels/samplingTopKKernels.cu @@ -77,7 +77,7 @@ __global__ void topKStage1(T const* __restrict logProbs, T const* const* __restr if (finished != nullptr && finishState.isFinished()) { - if (tid < k) + if (tid < k && endIds != nullptr) // if returnAllSelectedTokens, endIds would not be an input { auto const index = tmpTopKBufIndex + tid; if (blockLane == 0 && tid == 0) @@ -134,7 +134,7 @@ __global__ void topKStage2Sampling(SizeType32 const* __restrict topKTmpIdBuf, T* float const* topPs, curandState_t* curandState, TokenIdType const* endIds, SizeType32 vocabSize, bool const* skipDecode, SizeType32 const* batchSlots, SizeType32 maxBatchSize, bool normalizeLogProbs, bool logitHasProbs, SizeType32 const* tokensPerStep, SizeType32 maxTokensPerStep, SizeType32 maxSeqLen, - bool returnAllTopK) + bool returnAllSelectedTokens) { bool const IS_FP16 = std::is_same::value; T const MAX_T_VAL = (IS_FP16) ? HALF_FLT_MAX : FLT_MAX; @@ -215,13 +215,16 @@ __global__ void topKStage2Sampling(SizeType32 const* __restrict topKTmpIdBuf, T* if (tid == 0) { - auto randNum = static_cast(curand_uniform(curandState + batchSlot) * probThreshold * sSum); + // if we want to return all top k indices, we should not do random sampling for probThreshold + auto randNum = returnAllSelectedTokens + ? static_cast(probThreshold * sSum) + : static_cast(curand_uniform(curandState + batchSlot) * probThreshold * sSum); auto* outputIdsRequestPtr = idsPtrs == nullptr ? ids + batchSlot * maxSeqLen : idsPtrs[batchSlot]; for (SizeType32 ki = 0; ki < k; ki++) { auto expLogit = sVal2[ki]; randNum = randNum - expLogit; - if (randNum <= 0.0f || ki == k - 1 || returnAllTopK) + if (randNum <= 0.0f || ki == k - 1 || returnAllSelectedTokens) { auto idx = sId[ki]; // If sId is -1 here we force output token to the last from vocabulary to get vivid indicator of smth @@ -230,10 +233,10 @@ __global__ void topKStage2Sampling(SizeType32 const* __restrict topKTmpIdBuf, T* ? topKTmpIdBuf[(batchIdx * maxTokensPerStep + tokenIdx) * stride + idx] % vocabSize : vocabSize - 1; auto const curSeqLen = sequenceLengths == nullptr ? 0 : sequenceLengths[batchSlot]; - auto const outIdx = returnAllTopK ? tokenIdx * maxTopK + ki : curSeqLen + tokenIdx; + auto const outIdx = returnAllSelectedTokens ?
tokenIdx * maxTopK + ki : curSeqLen + tokenIdx; outputIdsRequestPtr[outIdx] = outputId; - // cum log prob is not supported with returnAllTopK - if (!returnAllTopK) + // cum log prob is not supported with returnAllSelectedTokens + if (!returnAllSelectedTokens) { if (cumLogProbs != nullptr || outputLogProbs != nullptr) { @@ -255,9 +258,17 @@ __global__ void topKStage2Sampling(SizeType32 const* __restrict topKTmpIdBuf, T* } break; } + if (returnAllSelectedTokens && randNum <= 0.0f) + { + if (ki < k - 1) + { // not the last k, write a -1 to mark the top-p tokens boundary for external draft token masking + outputIdsRequestPtr[outIdx + 1] = -1; + } + break; + } } } - if (maxTokensPerStep == 1 && !returnAllTopK && sequenceLengths != nullptr && finishedOutput != nullptr + if (maxTokensPerStep == 1 && !returnAllSelectedTokens && sequenceLengths != nullptr && finishedOutput != nullptr && endIds != nullptr) { auto const seqLen = sequenceLengths[batchSlot]; @@ -297,7 +308,7 @@ __global__ void topKStage2Sampling(SizeType32 const* __restrict topKTmpIdBuf, T* params.maxTopK, params.topKs, params.maxTopP, params.topPs, params.curandState, params.endIds, \ params.vocabSizePadded, params.skipDecode, params.batchSlots, params.maxBatchSize, \ params.normalizeLogProbs, params.logitsHasProbs, params.tokensPerStep, params.maxTokensPerStep, \ - params.maxSeqLen, params.returnAllTopK); \ + params.maxSeqLen, params.returnAllSelectedTokens); \ } \ } while (0) diff --git a/cpp/tensorrt_llm/kernels/samplingTopKKernels.h b/cpp/tensorrt_llm/kernels/samplingTopKKernels.h index 0330cad31..dbf8cda0b 100644 --- a/cpp/tensorrt_llm/kernels/samplingTopKKernels.h +++ b/cpp/tensorrt_llm/kernels/samplingTopKKernels.h @@ -106,8 +106,8 @@ struct TopKSamplingKernelParams bool normalizeLogProbs{false}; //! flag to highlight that logProbs contains probabilities bool logitsHasProbs{false}; - //! flag to return all selectedTopK results - bool returnAllTopK{false}; + //!
flag to return all selected TopK results + bool returnAllSelectedTokens{false}; void checkParams() const { @@ -133,11 +133,11 @@ struct TopKSamplingKernelParams TLLM_CHECK(workspace); TLLM_CHECK(curandState); - TLLM_CHECK(maxTokensPerStep != 1 || returnAllTopK || sequenceLengths); - TLLM_CHECK(maxTokensPerStep != 1 || returnAllTopK || endIds); + TLLM_CHECK(maxTokensPerStep != 1 || returnAllSelectedTokens || sequenceLengths); + TLLM_CHECK(maxTokensPerStep != 1 || returnAllSelectedTokens || endIds); if (cumLogProbs != nullptr || outputLogProbs != nullptr) { - TLLM_CHECK(maxTokensPerStep == 1 && !returnAllTopK); + TLLM_CHECK(maxTokensPerStep == 1 && !returnAllSelectedTokens); } TLLM_CHECK(((finishedOutput == nullptr) ^ (endIds == nullptr)) == 0); diff --git a/cpp/tensorrt_llm/kernels/samplingTopPKernels.cu b/cpp/tensorrt_llm/kernels/samplingTopPKernels.cu index 13da77bdf..472115b64 100644 --- a/cpp/tensorrt_llm/kernels/samplingTopPKernels.cu +++ b/cpp/tensorrt_llm/kernels/samplingTopPKernels.cu @@ -196,11 +196,11 @@ __device__ void epilogue(SizeType32 batchId, SizeType32 currentStep, SizeType32 } template -__global__ void topPSsampling(T* sortedProbs, TokenIdType* sortedIdVals, TokenIdType** ids, SizeType32* sequenceLength, - FinishedState const* finishedInput, FinishedState* finishedOutput, float* cumLogProbs, float* outputLogProbs, - SizeType32 const* beginOffsetBuf, SizeType32 const* offsetBuf, SizeType32 vocabSize, curandState_t* curandState, - float const* topPs, TokenIdType const* endIds, SizeType32 maxBatchSize, bool const* skipDecode, - SizeType32 const* batchSlots) +__global__ void topPSsampling(T* sortedProbs, TokenIdType* sortedIdVals, TokenIdType* ids, TokenIdType** idsPtrs, + SizeType32* sequenceLength, FinishedState const* finishedInput, FinishedState* finishedOutput, float* cumLogProbs, + float* outputLogProbs, SizeType32 const* beginOffsetBuf, SizeType32 const* offsetBuf, SizeType32 vocabSize, + curandState_t* curandState, float const* topPs, TokenIdType const* endIds, SizeType32 maxBatchSize, + bool const* skipDecode, SizeType32 const* batchSlots, bool returnAllSelectedTokens, SizeType32 maxSeqLen) { /** * Each block processes one request row sorted in descending order by probabilities. @@ -235,14 +235,16 @@ __global__ void topPSsampling(T* sortedProbs, TokenIdType* sortedIdVals, TokenId } auto const probThreshold = topPs[batchSlot]; - auto const currentStep = sequenceLength[batchSlot]; + auto const currentStep = sequenceLength == nullptr ? 0 : sequenceLength[batchSlot]; + auto* outputIdsRequestPtr = idsPtrs == nullptr ? ids + batchSlot * maxSeqLen : idsPtrs[batchSlot]; // With P in (0.0; 1.0] we draw a random number P' in range (0.0; P] // We will sum all probs moving from the largest probability to the smallest and // will choose the token which probability makes cumulative probability sum to exceed P' if (threadIdx.x == 0) { - randNumS = curand_uniform(curandState + blockIdx.x) * probThreshold; + // if we want to return all top p indices, we should not do random sampling for probThreshold + randNumS = returnAllSelectedTokens ? 
probThreshold : curand_uniform(curandState + blockIdx.x) * probThreshold; } // if beginOffsetBuf and offsetBuf of sorting have same value, @@ -253,8 +255,15 @@ __global__ void topPSsampling(T* sortedProbs, TokenIdType* sortedIdVals, TokenId if (tid == 0) { auto offset = batchId * vocabSize; - epilogue(batchSlot, currentStep, offset, ids, sortedIdVals, sortedProbs, cumLogProbs, outputLogProbs, - endIds, sequenceLength, finishedOutput, maxBatchSize); + if (returnAllSelectedTokens) + { + outputIdsRequestPtr[currentStep] = sortedIdVals[offset]; + } + else + { + epilogue(batchSlot, currentStep, offset, idsPtrs, sortedIdVals, sortedProbs, cumLogProbs, + outputLogProbs, endIds, sequenceLength, finishedOutput, maxBatchSize); + } } return; } @@ -267,7 +276,7 @@ __global__ void topPSsampling(T* sortedProbs, TokenIdType* sortedIdVals, TokenId __syncthreads(); auto offset = batchId * vocabSize; - ids[batchSlot][currentStep] = sortedIdVals[offset]; + outputIdsRequestPtr[currentStep] = sortedIdVals[offset]; auto end = ((vocabSize + blockSize - 1) / blockSize) * blockSize; SizeType32 selectedTokenId = 0; // Cumulative sum @@ -285,11 +294,31 @@ __global__ void topPSsampling(T* sortedProbs, TokenIdType* sortedIdVals, TokenId } } - // select first thread exceeded the prob threshold or the last thread in case of P=1.0f - if (threadIdx.x == min(blockDim.x - count, blockDim.x - 1)) + if (returnAllSelectedTokens) + { + __shared__ SizeType32 sharedSelectedTokenId; + if (threadIdx.x == min(blockDim.x - count, blockDim.x - 1)) + { + sharedSelectedTokenId = selectedTokenId; + } + __syncthreads(); + for (int vi = tid; vi <= sharedSelectedTokenId; vi += blockSize) + { + outputIdsRequestPtr[vi] = sortedIdVals[offset + vi]; + } + if (tid == 0 && sharedSelectedTokenId != end - 1) + { + outputIdsRequestPtr[sharedSelectedTokenId + 1] = -1; // a boundary to record the end of all selected top Ps. 
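+ // Note: the -1 sentinel is only written when the selection stops before the last position, so a consumer scanning outputIdsRequestPtr for the boundary also needs an upper bound (e.g. vocabSize).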
+ } + } + else { - epilogue(batchSlot, currentStep, offset + selectedTokenId, ids, sortedIdVals, sortedProbs, cumLogProbs, - outputLogProbs, endIds, sequenceLength, finishedOutput, maxBatchSize); + // select first thread exceeded the prob threshold or the last thread in case of P=1.0f + if (threadIdx.x == min(blockDim.x - count, blockDim.x - 1)) + { + epilogue(batchSlot, currentStep, offset + selectedTokenId, idsPtrs, sortedIdVals, sortedProbs, cumLogProbs, + outputLogProbs, endIds, sequenceLength, finishedOutput, maxBatchSize); + } } } @@ -371,9 +400,10 @@ void invokeBatchTopPSampling(TopPSamplingKernelParams const& params, cudaStre dim3 grid(params.batchSize); // Sample with Top P given sorted tokens topPSsampling<<>>(sortedProbs, sortedIdVals, - params.outputIds, params.sequenceLength, params.finishedInput, params.finishedOutput, params.cumLogProbs, - params.outputLogProbs, beginOffsetBuf, offsetBuf + 1, params.vocabSizePadded, params.curandState, params.topPs, - params.endIds, params.maxBatchSize, params.skipDecode, params.batchSlots); + params.outputIds, params.outputIdsPtrs, params.sequenceLength, params.finishedInput, params.finishedOutput, + params.cumLogProbs, params.outputLogProbs, beginOffsetBuf, offsetBuf + 1, params.vocabSizePadded, + params.curandState, params.topPs, params.endIds, params.maxBatchSize, params.skipDecode, params.batchSlots, + params.returnAllSelectedTokens, params.maxSeqLen); sync_check_cuda_error(); TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); @@ -385,9 +415,13 @@ template void invokeBatchTopPSampling(TopPSamplingKernelParams const& para __global__ void computeToppDecay(float* runtimeTopP, float const* runtimeInitialTopP, TokenIdType const** outputIds, float const* topPDecay, float const* topPMin, TokenIdType const* topPResetIds, SizeType32 const* sequenceLengths, - SizeType32 const* batchSlots) + SizeType32 const* batchSlots, SizeType32 localBatchSize) { auto const idx = static_cast(blockDim.x * blockIdx.x + threadIdx.x); + if (idx >= localBatchSize) + { + return; + } auto const batchSlot = batchSlots[idx]; auto const currentStep{sequenceLengths[batchSlot]}; if (outputIds[batchSlot][currentStep] == topPResetIds[batchSlot]) @@ -406,8 +440,8 @@ void invokeComputeToppDecay(float* runtimeTopP, float const* runtimeInitialTopP, { dim3 block(std::min(localBatchSize, 512)); dim3 grid((localBatchSize + block.x - 1) / block.x); - computeToppDecay<<>>( - runtimeTopP, runtimeInitialTopP, outputIds, topPDecay, topPMin, topPResetIds, sequenceLengths, batchSlots); + computeToppDecay<<>>(runtimeTopP, runtimeInitialTopP, outputIds, topPDecay, topPMin, + topPResetIds, sequenceLengths, batchSlots, localBatchSize); } __global__ void setTopPRuntimeArgs(SizeType32 batchSize, SizeType32 topK, SizeType32* topKs, SizeType32 topKsSize, diff --git a/cpp/tensorrt_llm/kernels/samplingTopPKernels.h b/cpp/tensorrt_llm/kernels/samplingTopPKernels.h index 1cda8bc56..639d7d4d6 100644 --- a/cpp/tensorrt_llm/kernels/samplingTopPKernels.h +++ b/cpp/tensorrt_llm/kernels/samplingTopPKernels.h @@ -28,8 +28,13 @@ struct TopPSamplingKernelParams //! input buffer [batchSize, vocabSizePadded], required. Probabilities of each token in the vocab. T const* probs{nullptr}; - //! output buffer [maxBatchSize][maxSeqLen], required. Contains pointers to rows with output tokens per request. - runtime::TokenIdType** outputIds{nullptr}; + //! output buffer [maxBatchSize][maxSeqLen]. Contains pointers to rows with output tokens per request. + //! If nullptr, outputIds must be provided. 
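+ //! With returnAllSelectedTokens, topPSsampling writes all selected token ids into the row in descending
+ //! probability order and marks the end of the selection with a -1 terminator.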
+ runtime::TokenIdType** outputIdsPtrs{nullptr}; + + //! output buffer [maxBatchSize, maxSeqLen], optional. Tensor to store output tokens. + //! Not used if outputIdsPtrs != nullptr + runtime::TokenIdType* outputIds{nullptr}; //! pointer to the workspace. Has to be pre-allocated by caller. //! Function does not take ownership of the buffer. @@ -73,6 +78,9 @@ struct TopPSamplingKernelParams runtime::SizeType32 batchSize{-1}; runtime::SizeType32 maxBatchSize{-1}; runtime::SizeType32 vocabSizePadded{-1}; + runtime::SizeType32 maxSeqLen{-1}; + + bool returnAllSelectedTokens{false}; void checkParams() const { @@ -81,12 +89,17 @@ struct TopPSamplingKernelParams TLLM_CHECK(maxBatchSize >= batchSize); TLLM_CHECK(vocabSizePadded > 0); TLLM_CHECK(probs); - TLLM_CHECK(outputIds); + TLLM_CHECK(outputIds || outputIdsPtrs); TLLM_CHECK(workspace); - TLLM_CHECK(sequenceLength); + TLLM_CHECK((sequenceLength != nullptr) || returnAllSelectedTokens); TLLM_CHECK(curandState); TLLM_CHECK(topPs); + if (outputIds) + { + TLLM_CHECK(maxSeqLen > 0); + } + TLLM_CHECK(((finishedOutput == nullptr) ^ (endIds == nullptr)) == 0); } }; diff --git a/cpp/tensorrt_llm/kernels/speculativeDecoding/externalDraftTokensKernels.cu b/cpp/tensorrt_llm/kernels/speculativeDecoding/externalDraftTokensKernels.cu index 6036695cd..427f1bb6b 100644 --- a/cpp/tensorrt_llm/kernels/speculativeDecoding/externalDraftTokensKernels.cu +++ b/cpp/tensorrt_llm/kernels/speculativeDecoding/externalDraftTokensKernels.cu @@ -35,230 +35,291 @@ namespace tensorrt_llm::kernels::speculative_decoding { namespace { -__global__ void acceptDraftTokensByIds(TokenIdType const* draftIds, TokenIdType const* targetIds, - SizeType32 const* contextLengths, SizeType32 const* numsDraftTokens, SizeType32* sequenceLengths, - FinishedState const* finished, FinishedState* finishedFinal, SizeType32* finishedSum, SizeType32 const* batchSlots, - SizeType32 batchSize, SizeType32 maxBatchSize, SizeType32 maxSeqLen, SizeType32 maxDraftTokens) -{ - for (auto batchIdx = static_cast(threadIdx.x); batchIdx < batchSize; batchIdx += blockDim.x) - { - auto const batchSlot = batchSlots[batchIdx]; - auto const numDraftTokens = numsDraftTokens[batchSlot]; - - auto const contextLength = contextLengths[batchSlot]; - auto& sequenceLength = sequenceLengths[batchSlot]; - SizeType32 finishedDraftIdx = 0; - for (auto ti = contextLength; ti < min(sequenceLength, contextLength + numDraftTokens); - ++ti, ++finishedDraftIdx) - { - auto const draftIdx = ti - contextLength; - auto const targetTokenIdx = batchSlot * maxSeqLen + ti; - auto const draftTokenIdx = batchSlot * maxDraftTokens + draftIdx; - // Check if draft tokens are the same as target tokens - bool const accepted = draftIds[draftTokenIdx] == targetIds[targetTokenIdx]; - if (!accepted) - { - // Set sequence length to the numAcceptedTokens + 1 - sequenceLength = min(ti + 1, maxSeqLen); - // FIXME(nkorobov): do we need to set endIds here? 
- break; - } - } - FinishedState finishState = finished[finishedDraftIdx * maxBatchSize + batchSlot]; - finishedFinal[batchSlot] = finishState; - - if (finishedSum) - { - finishedSum[batchSlot] = static_cast(finishState.isFinished()); - } - } -} -} // namespace - -void invokeAcceptDraftTokensByIds(TokenIdType const* draftIds, TokenIdType const* targetIds, - SizeType32 const* contextLengths, SizeType32 const* numsDraftTokens, SizeType32* sequenceLengths, - FinishedState const* finished, FinishedState* finishedFinal, SizeType32* finishedSum, SizeType32 const* batchSlots, - SizeType32 batchSize, SizeType32 maxBatchSize, SizeType32 beamWidth, SizeType32 maxSeqLen, - SizeType32 maxDraftTokens, cudaStream_t stream) -{ - TLLM_CHECK(beamWidth == 1); - dim3 block(min(1024, batchSize)); - dim3 grid(1); - acceptDraftTokensByIds<<>>(draftIds, targetIds, contextLengths, numsDraftTokens, - sequenceLengths, finished, finishedFinal, finishedSum, batchSlots, batchSize, maxBatchSize, maxSeqLen, - maxDraftTokens); -} -namespace -{ template -__global__ void acceptDraftTokensByLogitsKernel(T const* draftProbs, T* targetProbs, SizeType32 const* numsDraftTokens, - FinishedState* finished, curandState_t* curandState, SizeType32 const* batchSlots, SizeType32 batchSize, - SizeType32 maxBatchSize, SizeType32 maxDraftTokens, SizeType32 beamWidth, SizeType32 vocabSize, - bool randomThreshold, float constantThreshold) +__global__ void maskTargetLogitsKernel(T* targetLogits, SizeType32 const* batchSlots, SizeType32 beamWidth, + SizeType32 vocabSize, FinishedState const* finishedInput, SizeType32 maxBatchSize, bool const* batchUseDraftLogits, + SizeType32* outputIdsAfterSampling, SizeType32* targetOutputIds, SizeType32* runtimeTopKDevicePtr, bool* maskBuffer) { + /** + * @brief Masking the selected token to -inf as was done in Huggingface TopK/TopP Logits Warper + * https://github.com/huggingface/transformers/blob/2e24ee4dfa39cc0bc264b89edbccc373c8337086/src/transformers/generation/logits_process.py#L533 + */ + auto const bid = blockIdx.x; - auto const draftTokenIdx = blockIdx.y; auto const batchIdx = bid / beamWidth; - auto const beamIdx = bid % beamWidth; + auto const tid = static_cast(threadIdx.x); auto const batchSlot = batchSlots[batchIdx]; - auto const batchSlotBeamWidth = batchSlot * beamWidth + beamIdx; - auto const numDraftTokens = numsDraftTokens[batchSlotBeamWidth]; + constexpr bool IS_HALF = std::is_same::value; + T const MAX_T_VAL = (IS_HALF) ? HALF_FLT_MAX : FLT_MAX; + + auto targetLogitsBatch = targetLogits + batchIdx * vocabSize; + auto& finishedState = finishedInput[batchSlot]; - if (draftTokenIdx >= numDraftTokens) + auto* outputIdsAfterSamplingPtr = outputIdsAfterSampling + batchSlot * vocabSize; + auto const useDraftLogits = batchUseDraftLogits[batchSlot]; + + if (finishedState.isSkipDecoding() || finishedState.isFinished()) { return; } - auto const logitsOffset = (batchSlot * maxDraftTokens + draftTokenIdx) * beamWidth * vocabSize; - auto const draftProbsBatch = draftProbs + logitsOffset; - auto const targetProbsBatch = targetProbs + logitsOffset; - auto const vocabSizePadded = static_cast((vocabSize + blockDim.x - 1) / blockDim.x) * blockDim.x; + __shared__ SizeType32 tokensToMask; - struct Candidate candidate; - __shared__ float threshold; - if (threadIdx.x == 0) + if (tid == 0) { - threshold = randomThreshold ? 
curand_uniform(curandState + batchSlot) : constantThreshold; + tokensToMask = runtimeTopKDevicePtr[batchSlot]; } __syncthreads(); - for (auto vIdx = static_cast(threadIdx.x); vIdx < vocabSizePadded; - vIdx += static_cast(blockDim.x)) + for (SizeType32 vIdx = tid; vIdx < vocabSize; vIdx += static_cast(blockDim.x)) { - bool const pred = vIdx < vocabSize; - auto const targetProb = pred ? static_cast(targetProbsBatch[vIdx]) : 1.f; - auto const draftProb = pred ? static_cast(draftProbsBatch[vIdx]) : 0.f; - - if (draftProb > candidate.maxProb) - { - candidate.maxProb = draftProb; - candidate.rateQP = pred ? targetProb / draftProb : 0.f; + if (outputIdsAfterSamplingPtr[vIdx] == -1) + { // we need to find the -1 boundary from returnAllTopP outputIds if topK == 0 or number of topP indices < topK + tokensToMask = vIdx; } + maskBuffer[vIdx] = false; } + __syncthreads(); - typedef cub::BlockReduce BlockReduce; - __shared__ typename BlockReduce::TempStorage reduce_buffer; - Candidate candidate_global = BlockReduce(reduce_buffer).Reduce(candidate, reduce_op); + if (!useDraftLogits && tid == 0) + { + targetOutputIds[batchSlot] = outputIdsAfterSamplingPtr[tokensToMask - 1]; + } + + for (SizeType32 vIdx = tid; vIdx < tokensToMask; vIdx += static_cast(blockDim.x)) + { + auto tokenToMask = outputIdsAfterSamplingPtr[vIdx]; + maskBuffer[tokenToMask] = true; + } + __syncthreads(); - if (threadIdx.x == 0) + for (SizeType32 vIdx = tid; vIdx < vocabSize; vIdx += static_cast(blockDim.x)) { - finished[draftTokenIdx * maxBatchSize * beamWidth + batchSlotBeamWidth] - = candidate_global.rateQP < threshold ? FinishedState::skipDecoding() : FinishedState::empty(); + if (!maskBuffer[vIdx]) + { + targetLogitsBatch[vIdx] = -MAX_T_VAL; + } } } template -__global__ void correctAcceptedStatesAndLogits(T const* draftProbs, T* targetProbs, T** targetLogits, - SizeType32 const* numsDraftTokens, FinishedState* finished, SizeType32 const* batchSlots, SizeType32 batchSize, - SizeType32 maxBatchSize, SizeType32 maxDraftTokens, SizeType32 beamWidth, SizeType32 vocabSize) +__global__ void acceptDraftTokensKernel(T const* draftProbs, T* targetProbs, SizeType32 const* numsDraftTokens, + bool const* batchUseDraftLogits, TokenIdType const* draftIds, FinishedState const* finishedInput, + FinishedState* finishedOutput, curandState_t* curandState, SizeType32 const* batchSlots, SizeType32 maxDraftTokens, + SizeType32 beamWidth, SizeType32 vocabSize, bool randomThreshold, float constantThreshold, SizeType32 step, + bool* batchIsAccepted, SizeType32* targetOutputIds) { auto const bid = blockIdx.x; + auto const draftTokenIdx = step; auto const batchIdx = bid / beamWidth; auto const beamIdx = bid % beamWidth; auto const batchSlot = batchSlots[batchIdx]; auto const batchSlotBeamWidth = batchSlot * beamWidth + beamIdx; + auto const tid = static_cast(threadIdx.x); + auto const numDraftTokens = numsDraftTokens[batchSlotBeamWidth]; + auto const useDraftLogits = batchUseDraftLogits[batchSlotBeamWidth]; - __shared__ SizeType32 numAcceptedTokens; - if (threadIdx.x == 0) + if (draftTokenIdx > numDraftTokens || finishedInput[batchSlot].isSkipDecoding() + || finishedInput[batchSlot].isFinished()) { - numAcceptedTokens = numDraftTokens; - bool cummulativeSkipDecoding = false; - for (SizeType32 ti = 0; ti < numDraftTokens + 1; ++ti) + if (tid == 0) { - auto& finishedState = finished[ti * maxBatchSize * beamWidth + batchSlotBeamWidth]; - bool localSkipDecoding = finishedState.isSkipDecoding(); - if (cummulativeSkipDecoding == false && localSkipDecoding == true) 
+ batchIsAccepted[batchSlot] = true; + + // either finished or skip decode in previous step, this step don't need decoding + finishedOutput[batchSlot].setSkipDecoding(); + + // if previous step is finished, write the state to next step too + if (finishedInput[batchSlot].isFinished()) { - numAcceptedTokens = ti; + finishedOutput[batchSlot] = finishedInput[batchSlot]; } + } + return; + } + + auto const logitsOffset = (batchSlot * maxDraftTokens + draftTokenIdx) * beamWidth * vocabSize; + auto const draftProbsBatch = draftProbs + logitsOffset; + auto const targetProbsBatch = targetProbs + (batchIdx * beamWidth * vocabSize); - finishedState = cummulativeSkipDecoding ? FinishedState::skipDecoding() : FinishedState::empty(); - cummulativeSkipDecoding |= localSkipDecoding; + __shared__ bool isAccepted; + __shared__ T sSumVal; + if (tid == 0) + { + if (draftTokenIdx < numDraftTokens) + { + auto const draftOutputTokenId = draftIds[batchSlot * maxDraftTokens + draftTokenIdx]; + if (useDraftLogits) + { + float threshold = randomThreshold ? curand_uniform(curandState + batchSlot) : constantThreshold; + auto const targetProb = static_cast(targetProbsBatch[draftOutputTokenId]); + auto const draftProb = static_cast(draftProbsBatch[draftOutputTokenId]); + auto rateQP = targetProb / draftProb; + if (rateQP < threshold) + { + isAccepted = false; + finishedOutput[batchSlot].setSkipDecoding(); + } + else + { + isAccepted = true; + } + } + else + { + // Check if draft tokens are the same as target tokens + isAccepted = targetOutputIds[batchSlot] == draftOutputTokenId; + if (!isAccepted) + { + finishedOutput[batchSlot].setSkipDecoding(); + } + } } + else + { + isAccepted = false; + finishedOutput[batchSlot].setSkipDecoding(); + } + batchIsAccepted[batchSlot] = isAccepted; } + __syncthreads(); - if (numAcceptedTokens < numDraftTokens) + if (!isAccepted) { - auto const logitsIdx = (batchSlot * maxDraftTokens + numAcceptedTokens) * beamWidth * vocabSize; - auto const draftProbBatch = draftProbs + logitsIdx; - auto targetProbBatch = targetProbs + logitsIdx; - auto targetLogitsBatch = targetLogits[bid] + numAcceptedTokens * beamWidth * vocabSize; - - float sumProbs = 0.f; - for (SizeType32 vIdx = static_cast(threadIdx.x); vIdx < vocabSize; - vIdx += static_cast(blockDim.x)) + T const zeroVal = static_cast(0.0f); + T sumVal = zeroVal; + for (SizeType32 vIdx = tid; vIdx < vocabSize; vIdx += static_cast(blockDim.x)) { - auto const correctedProb = max(static_cast(targetProbBatch[vIdx] - draftProbBatch[vIdx]), 0.f); - sumProbs += correctedProb; - targetProbBatch[vIdx] = correctedProb; + targetProbsBatch[vIdx] + -= (draftTokenIdx < numDraftTokens && useDraftLogits) ? draftProbsBatch[vIdx] : zeroVal; + targetProbsBatch[vIdx] = targetProbsBatch[vIdx] >= zeroVal ? 
targetProbsBatch[vIdx] : zeroVal; + sumVal += targetProbsBatch[vIdx]; } - - __shared__ float sumProbsShared; - sumProbs = blockReduceSum((float) sumProbs); - if (threadIdx.x == 0) + sumVal = blockReduceSum(sumVal); + if (tid == 0) { - sumProbsShared = max(sumProbs, 1e-6f); + sSumVal = sumVal; } __syncthreads(); - for (SizeType32 vIdx = static_cast(threadIdx.x); vIdx < vocabSize; - vIdx += static_cast(blockDim.x)) + for (SizeType32 vIdx = tid; vIdx < vocabSize; vIdx += static_cast(blockDim.x)) { - auto const correctedNormProb = static_cast(targetProbBatch[vIdx]) / sumProbsShared; - targetLogitsBatch[vIdx] = __logf(correctedNormProb / (1.f - correctedNormProb)); + targetProbsBatch[vIdx] /= sSumVal; } } } + +__global__ void forwardAcceptedTokensKernel(SizeType32 batchSize, SizeType32 const* batchSlots, bool* batchIsAccepted, + SizeType32* sequenceLengths, TokenIdType const* draftIds, TokenIdType** idsPtrs, SizeType32 step, + SizeType32 maxDraftTokens, TokenIdType const* endIds, FinishedState* finishedOutput) +{ + auto index = static_cast(blockIdx.x * blockDim.x + threadIdx.x); + for (SizeType32 bi = index; bi < batchSize; bi += static_cast(gridDim.x * blockDim.x)) + { + auto const batchSlot = batchSlots[bi]; + if (batchIsAccepted[batchSlot] && !finishedOutput[batchSlot].isSkipDecoding() + && !finishedOutput[batchSlot].isFinished()) + { + auto const curSeqLen = sequenceLengths[batchSlot]; + auto const draftTokenIdx = step; + auto const draftOutputTokenId = draftIds[batchSlot * maxDraftTokens + draftTokenIdx]; + auto* outputIdsRequestPtr = idsPtrs[batchSlot]; + auto const outIdx = curSeqLen; + outputIdsRequestPtr[outIdx] = draftOutputTokenId; + if (outputIdsRequestPtr[outIdx] == endIds[batchSlot]) + { + finishedOutput[batchSlot].setFinishedEOS(); + // Do not increase seq len when EOS is generated. 
Seq len should always contain only tokens to be + // outputted + } + else + { + // We don't need to set output finished state as it is assumed to be in non finished state + sequenceLengths[batchSlot] += 1; + } + } + } +} // namespace + } // namespace template -void acceptDraftTokensByLogits(T* draftLogits, T** targetLogits, T* draftProbs, T* targetProbs, - SizeType32 const* numsDraftTokens, FinishedState* finished, curandState_t* curandState, - SizeType32 const* batchSlots, SizeType32 batchSize, SizeType32 maxBatchSize, SizeType32 beamWidth, - SizeType32 vocabSize, SizeType32 vocabSizePadded, SizeType32 maxDraftTokens, bool randomThreshold, - float constantThreshold, cudaStream_t stream) +void invokeMaskTargetLogits(SizeType32 batchSize, T* targetLogits, SizeType32 const* batchSlots, SizeType32 beamWidth, + SizeType32 vocabSizePadded, FinishedState const* finishedInput, SizeType32 maxBatchSize, + bool const* batchUseDraftLogits, SizeType32* outputIdsAfterSampling, SizeType32* targetOutputIds, + SizeType32* runtimeTopKDevicePtr, bool* maskBuffer, cudaStream_t stream) { + TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); TLLM_CHECK(beamWidth == 1); - { - invokeAddBiasSoftMax(draftLogits, static_cast(nullptr), draftProbs, static_cast(nullptr), nullptr, - finished, batchSlots, batchSize, maxBatchSize, beamWidth * maxDraftTokens, vocabSize, vocabSizePadded, - /* skip softmax */ false, - /* batchSlotLogits */ true, stream); - invokeAddBiasSoftMax(static_cast(nullptr), targetLogits, targetProbs, static_cast(nullptr), nullptr, - finished, batchSlots, batchSize, maxBatchSize, beamWidth * maxDraftTokens, vocabSize, vocabSizePadded, - /* skip softmax */ false, - /* batchSlotLogits */ true, stream); - } { dim3 block(1024); - dim3 grid(batchSize * beamWidth, maxDraftTokens); - acceptDraftTokensByLogitsKernel<<>>(draftProbs, targetProbs, numsDraftTokens, finished, - curandState, batchSlots, batchSize, maxBatchSize, maxDraftTokens, beamWidth, vocabSizePadded, - randomThreshold, constantThreshold); + dim3 grid(batchSize * beamWidth); + maskTargetLogitsKernel<<>>(targetLogits, batchSlots, beamWidth, vocabSizePadded, + finishedInput, maxBatchSize, batchUseDraftLogits, outputIdsAfterSampling, targetOutputIds, + runtimeTopKDevicePtr, maskBuffer); } + sync_check_cuda_error(); + TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); +} + +template +void invokeAcceptDraftTokens(SizeType32 batchSize, T* draftProbs, T* targetProbs, SizeType32 const* numsDraftTokens, + bool const* batchUseDraftLogits, TokenIdType const* draftIds, FinishedState const* finishedInput, + FinishedState* finishedOutput, curandState_t* curandState, SizeType32 const* batchSlots, SizeType32 maxDraftTokens, + SizeType32 beamWidth, SizeType32 vocabSizePadded, bool randomThreshold, float constantThreshold, SizeType32 step, + bool* batchIsAccepted, SizeType32* targetOutputIds, cudaStream_t stream) +{ + TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); + TLLM_CHECK(beamWidth == 1); { dim3 block(1024); dim3 grid(batchSize * beamWidth); - correctAcceptedStatesAndLogits<<>>(draftProbs, targetProbs, targetLogits, - numsDraftTokens, finished, batchSlots, batchSize, maxBatchSize, maxDraftTokens, beamWidth, vocabSizePadded); + acceptDraftTokensKernel<<>>(draftProbs, targetProbs, numsDraftTokens, + batchUseDraftLogits, draftIds, finishedInput, finishedOutput, curandState, batchSlots, maxDraftTokens, + beamWidth, vocabSizePadded, randomThreshold, constantThreshold, step, batchIsAccepted, targetOutputIds); } + sync_check_cuda_error(); + TLLM_LOG_TRACE("%s stop", 
__PRETTY_FUNCTION__); } -template void acceptDraftTokensByLogits(float* draftLogits, float** targetLogits, float* draftProbs, float* targetProbs, - SizeType32 const* numsDraftTokens, FinishedState* finished, curandState_t* curandState, - SizeType32 const* batchSlots, SizeType32 batchSize, SizeType32 maxBatchSize, SizeType32 beamWidth, - SizeType32 vocabSize, SizeType32 vocabSizePadded, SizeType32 maxDraftTokens, bool randomThreshold, - float constantThreshold, cudaStream_t stream); -template void acceptDraftTokensByLogits(half* draftLogits, half** targetLogits, half* draftProbs, half* targetProbs, - SizeType32 const* numsDraftTokens, FinishedState* finished, curandState_t* curandState, - SizeType32 const* batchSlots, SizeType32 batchSize, SizeType32 maxBatchSize, SizeType32 beamWidth, - SizeType32 vocabSize, SizeType32 vocabSizePadded, SizeType32 maxDraftTokens, bool randomThreshold, - float constantThreshold, cudaStream_t stream); +template void invokeMaskTargetLogits(SizeType32 batchSize, float* targetLogits, SizeType32 const* batchSlots, + SizeType32 beamWidth, SizeType32 vocabSizePadded, FinishedState const* finishedInput, SizeType32 maxBatchSize, + bool const* batchUseDraftLogits, SizeType32* outputIdsAfterSampling, SizeType32* targetOutputIds, + SizeType32* runtimeTopKDevicePtr, bool* maskBuffer, cudaStream_t stream); +template void invokeMaskTargetLogits(SizeType32 batchSize, half* targetLogits, SizeType32 const* batchSlots, + SizeType32 beamWidth, SizeType32 vocabSizePadded, FinishedState const* finishedInput, SizeType32 maxBatchSize, + bool const* batchUseDraftLogits, SizeType32* outputIdsAfterSampling, SizeType32* targetOutputIds, + SizeType32* runtimeTopKDevicePtr, bool* maskBuffer, cudaStream_t stream); + +template void invokeAcceptDraftTokens(SizeType32 batchSize, float* draftProbs, float* targetProbs, + SizeType32 const* numsDraftTokens, bool const* batchUseDraftLogits, TokenIdType const* draftIds, + FinishedState const* finishedInput, FinishedState* finishedOutput, curandState_t* curandState, + SizeType32 const* batchSlots, SizeType32 maxDraftTokens, SizeType32 beamWidth, SizeType32 vocabSizePadded, + bool randomThreshold, float constantThreshold, SizeType32 step, bool* batchIsAccepted, SizeType32* targetOutputIds, + cudaStream_t stream); +template void invokeAcceptDraftTokens(SizeType32 batchSize, half* draftProbs, half* targetProbs, + SizeType32 const* numsDraftTokens, bool const* batchUseDraftLogits, TokenIdType const* draftIds, + FinishedState const* finishedInput, FinishedState* finishedOutput, curandState_t* curandState, + SizeType32 const* batchSlots, SizeType32 maxDraftTokens, SizeType32 beamWidth, SizeType32 vocabSizePadded, + bool randomThreshold, float constantThreshold, SizeType32 step, bool* batchIsAccepted, SizeType32* targetOutputIds, + cudaStream_t stream); +void invokeForwardAcceptedTokens(SizeType32 batchSize, SizeType32 const* batchSlots, bool* batchIsAccepted, + SizeType32* outputSequenceLengths, TokenIdType const* draftIds, TokenIdType** idsPtrs, SizeType32 step, + SizeType32 maxDraftTokens, TokenIdType const* endIds, FinishedState* finishedOutput, cudaStream_t stream) +{ + TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); + dim3 block(std::min(static_cast(batchSize), 256u)); + dim3 grid(divUp(static_cast(batchSize), block.x)); + forwardAcceptedTokensKernel<<>>(batchSize, batchSlots, batchIsAccepted, + outputSequenceLengths, draftIds, idsPtrs, step, maxDraftTokens, endIds, finishedOutput); + sync_check_cuda_error(); + TLLM_LOG_TRACE("%s stop", 
__PRETTY_FUNCTION__); +} } // namespace tensorrt_llm::kernels::speculative_decoding diff --git a/cpp/tensorrt_llm/kernels/speculativeDecoding/externalDraftTokensKernels.h b/cpp/tensorrt_llm/kernels/speculativeDecoding/externalDraftTokensKernels.h index 4921e1390..69ee81e40 100644 --- a/cpp/tensorrt_llm/kernels/speculativeDecoding/externalDraftTokensKernels.h +++ b/cpp/tensorrt_llm/kernels/speculativeDecoding/externalDraftTokensKernels.h @@ -26,84 +26,77 @@ namespace tensorrt_llm::kernels::speculative_decoding { -//! \brief Accepts or rejects draft tokens based on the equality of draft and target tokens -//! for speculative decoding. Target token is accepted if targetToken == draftToken. -//! If number of accepted tokens N < maxDraftTokens, then function accepts N + 1 tokens of target model. -//! sequenceLengths, finishedSum and finishedFinal are modified accordingly. -//! -//! \param draftIds input buffer [batchSize, maxDraftTokens]. -//! Indices of the draft tokens. -//! \param targetIds input buffer [batchSize, maxSeqLen]. Indices of the tokens decoded by the target model -//! \param contextLengths input buffer [batchSize]. Context lengths of the requests without draft tokens -//! \param numsDraftTokens input buffer [batchSize]. Number of draft tokens per request -//! \param sequenceLengths input/output buffer [batchSize] sequence lengths of the requests in batch -//! Modified in-place according to the accepted/rejected tokens -//! \param finished input buffer [maxDraftTokens + 1, batchSize] finished states at each decoding iteration -//! \param finishedFinal output buffer [batchSize] finished states after accepting/rejecting tokens -//! \param finishedSum output buffer [1] total number of requests in batch that finished the execution -//! \param batchSlots input buffer [batchSize], address map from local index -//! to global index [0, batchSize] -> [0, maxBatchSize] -//! \param batchSize current batch size -//! \param maxBatchSize maximum batch size -//! \param beamWidth beam width -//! \param maxSeqLen maximum sequence length -//! \param maxDraftTokens maximum number of draft tokens -//! \param stream stream -void invokeAcceptDraftTokensByIds(runtime::TokenIdType const* draftIds, runtime::TokenIdType const* targetIds, - runtime::SizeType32 const* contextLengths, runtime::SizeType32 const* numsDraftTokens, - runtime::SizeType32* sequenceLengths, FinishedState const* finished, FinishedState* finishedFinal, - runtime::SizeType32* finishedSum, runtime::SizeType32 const* batchSlots, runtime::SizeType32 batchSize, - runtime::SizeType32 maxBatchSize, runtime::SizeType32 beamWidth, runtime::SizeType32 maxSeqLen, - runtime::SizeType32 maxDraftTokens, cudaStream_t stream); - -//! \brief Performs probabilistic acceptance of draft tokens based on their probability distributions. -//! Corrects targetLogits for the next to the last accepted token +//! \brief Accepts or rejects draft tokens based on their probability distributions or the equality of draft and target +//! tokens. Corrects targetLogits for the last accepted token //! according to https://openreview.net/pdf?id=C9NEblP8vS //! -//! \param draftLogits input/output buffer [draftTokens, batchSize, beamWidth, vocabSize]. -//! Initially contains token logits of the draft model. -//! \param targetLogits input/output buffer [batchSize][draftTokens+1, beamWidth, vocabSize]. -//! Vector of pointers to the logits. -//! Initially contains token logits of the target model. -//! It is modified in-place for next to the last accepted token such as -//! 
P'(x) = norm(max(0, P_{n+1}(x) - Q_{n+1}(x))), where N < maxDraftTokens is number of accepted tokens. +//! \param batchSize current batch size //! \param draftProbs output buffer [maxDraftTokens, batchSize, beamWidth, vocabSize]. //! Workspace buffer for token probabilities of the draft model. //! \param targetProbs output buffer [maxDraftTokens+1, batchSize, beamWidth, vocabSize]. //! Workspace buffer for token probabilities of the target model. //! \param numsDraftTokens input buffer [batchSize]. Number of draft tokens per request -//! \param finished output buffer [draftTokens, batchSize, beamWidth]. -//! At each step sets to NOT_FINISHED if token is accepted or SKIP_DECODING if token is not accepted -//! \param curandState input buffer [batchSize]. Curand states properly -//! initialized using invokeCurandInitialize per request. -//! \param batchSlots input buffer [batchSize], address map from local index -//! to global index [0, batchSize] -> [0, maxBatchSize] -//! \param batchSize current batch size -//! \param maxBatchSize maximum batch size -//! \param beamWidth beam width -//! \param vocabSize unpadded vocab size -//! \param vocabSizePadded padded vocab size +//! \param batchUseDraftLogits input buffer [batchSize]. Acceptance logic using draft logits or not, per request +//! \param draftIds input buffer [batchSize, draftTokens]. Pointer to draft token ids. +//! \param finishedInput input buffer [batchSize, beamWidth]. +//! \param finishedOutput output buffer [batchSize, beamWidth]. At each step sets SKIP_DECODING if token is not +//! accepted. +//! \param curandState input buffer [batchSize]. Curand states properly initialized using invokeCurandInitialize +//! per request. +//! \param batchSlots input buffer [batchSize], address map from local index to global index [0, batchSize] -> +//! [0, maxBatchSize]. //! \param maxDraftTokens maximum number of draft tokens +//! \param beamWidth beam width (only beamWidth == 1 supported) +//! \param vocabSizePadded padded vocab size //! \param randomThreshold True if use uniformly sampled threshold for token acceptance //! \param constantThreshold threshold used to accept tokens if randomThreshold is false +//! \param step The current step of decoding (draft token id index) +//! \param batchIsAccepted output buffer [batchSize]. Stores acceptance result for multinomial sampling later or +//! forwarding next step. +//! \param targetOutputIds input/output buffer [batchSize]. Stores target sampling output ids for acceptById +//! logics. //! 
\param stream stream template -void acceptDraftTokensByLogits(T* draftLogits, T** targetLogits, T* draftProbs, T* targetProbs, - runtime::SizeType32 const* numsDraftTokens, FinishedState* finished, curandState_t* curandState, - runtime::SizeType32 const* batchSlots, runtime::SizeType32 batchSize, runtime::SizeType32 maxBatchSize, - runtime::SizeType32 beamWidth, runtime::SizeType32 vocabSize, runtime::SizeType32 vocabSizePadded, - runtime::SizeType32 maxDraftTokens, bool randomThreshold, float constantThreshold, cudaStream_t stream); +void invokeAcceptDraftTokens(runtime::SizeType32 batchSize, T* draftProbs, T* targetProbs, + runtime::SizeType32 const* numsDraftTokens, bool const* batchUseDraftLogits, runtime::TokenIdType const* draftIds, + FinishedState const* finishedInput, FinishedState* finishedOutput, curandState_t* curandState, + runtime::SizeType32 const* batchSlots, runtime::SizeType32 maxDraftTokens, runtime::SizeType32 beamWidth, + runtime::SizeType32 vocabSizePadded, bool randomThreshold, float constantThreshold, runtime::SizeType32 step, + bool* batchIsAccepted, runtime::SizeType32* targetOutputIds, cudaStream_t stream); -struct Candidate // Hold probability maximum and rate of target / dfraft, used in `acceptDraftTokensByLogits` -{ - float maxProb{0.f}; - float rateQP{0.f}; -}; +//! \brief Mask the target logits with -inf for unselected topK/topP token ids. +//! according to +//! https://github.com/huggingface/transformers/blob/2e24ee4dfa39cc0bc264b89edbccc373c8337086/src/transformers/generation/utils.py#L4064 +//! +//! \param batchSize current batch size +//! \param targetLogits input/output buffer [batchSize][draftTokens+1, beamWidth, vocabSize]. +//! Vector of pointers to the logits. (beamWidth == 1) +//! Initially contains token logits of the target model. +//! \param batchSlots input buffer [batchSize], address map from local index to global index [0, batchSize] -> +//! [0, maxBatchSize]. +//! \param beamWidth beam width (only beamWidth == 1 supported) +//! \param vocabSizePadded padded vocab size +//! \param finishedInput input buffer [batchSize, beamWidth]. +//! \param maxBatchSize maximum batch size +//! \param batchUseDraftLogits input buffer [batchSize]. Acceptance logic using draft logits or not, per request +//! \param outputIdsAfterSampling input buffer [batchSize, vocabSize]. Stores all selected IDs from sampling for +//! masking. +//! \param targetOutputIds input/output buffer [batchSize]. Stores target sampling output ids for acceptById +//! logics. +//! \param numsDraftTokens input buffer [batchSize]. Number of draft tokens per request +//! \param runtimeTopKDevicePtr input buffer [batchSize] the topks in sampling step, for porting topK ids out. +//! \param maskBuffer input buffer [batchSize, vocabSize] for masking calculation (index value to position). +//! \param stream stream +template +void invokeMaskTargetLogits(runtime::SizeType32 batchSize, T* targetLogits, runtime::SizeType32 const* batchSlots, + runtime::SizeType32 beamWidth, runtime::SizeType32 vocabSizePadded, FinishedState const* finishedInput, + runtime::SizeType32 maxBatchSize, bool const* batchUseDraftLogits, runtime::SizeType32* outputIdsAfterSampling, + runtime::SizeType32* targetOutputIds, runtime::SizeType32* runtimeTopKDevicePtr, bool* maskBuffer, + cudaStream_t stream); -__device__ __forceinline__ Candidate reduce_op(Candidate const& a, Candidate const& b) -{ - // Max-reduce operator of Candidate - return (a.maxProb > b.maxProb) ? 
a : b; -} +void invokeForwardAcceptedTokens(runtime::SizeType32 batchSize, runtime::SizeType32 const* batchSlots, + bool* batchIsAccepted, runtime::SizeType32* outputSequenceLengths, runtime::TokenIdType const* draftIds, + runtime::TokenIdType** idsPtrs, runtime::SizeType32 step, runtime::SizeType32 maxDraftTokens, + runtime::TokenIdType const* endIds, FinishedState* finishedOutput, cudaStream_t stream); } // namespace tensorrt_llm::kernels::speculative_decoding diff --git a/cpp/tensorrt_llm/kernels/stopCriteriaKernels.h b/cpp/tensorrt_llm/kernels/stopCriteriaKernels.h index 0ba522222..f60ac784e 100644 --- a/cpp/tensorrt_llm/kernels/stopCriteriaKernels.h +++ b/cpp/tensorrt_llm/kernels/stopCriteriaKernels.h @@ -73,7 +73,7 @@ void invokeLengthCriterion(FinishedState* finished, runtime::SizeType32* finishe runtime::SizeType32* numNewTokens, runtime::SizeType32 const* batchSlots, runtime::SizeType32 batchSize, runtime::SizeType32 beamWidth, cudaStream_t stream); -//! \brief Sets finished states based on the endIds and ajusts sequence length to length before the first EOS token. +//! \brief Sets finished states based on the endIds and adjusts sequence length to length before the first EOS token. //! Does not support beamWidth > 1 for now. //! //! \param outputIds input buffer [maxBatchSize][beamWidth, maxSeqLen]. diff --git a/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_template.h b/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_template.h index c53510d3e..f6ecd5b72 100644 --- a/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_template.h +++ b/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_template.h @@ -507,15 +507,12 @@ __global__ void applyBiasRopeUpdateKVCache(QKVPreprocessingParams(global_token_idx) * params.q_hidden_size + hidden_idx; - QuantizedEltType* quantized_q_ptr = STORE_QKV - ? reinterpret_cast(params.QuantizedQKV) + src_q_idx - : reinterpret_cast(params.Q) + dst_q_idx; VecType* q_ptr = STORE_QKV ? reinterpret_ptr(params.QKV, src_q_idx) : reinterpret_ptr(params.Q, dst_q_idx); // Cast float scale to dst data type. using TScale = typename mmha::kv_cache_scale_type_t::Type; - TScale scaleOrigQuant; + [[maybe_unused]] TScale scaleOrigQuant; if constexpr (FP8_OUTPUT || ENABLE_8BITS_CACHE) { mmha::convert_from_float( @@ -525,6 +522,9 @@ __global__ void applyBiasRopeUpdateKVCache(QKVPreprocessingParams(params.QuantizedQKV) + src_q_idx + : reinterpret_cast(params.Q) + dst_q_idx; mmha::store_8bits_vec(quantized_q_ptr, q, 0, scaleOrigQuant); } else @@ -813,15 +813,12 @@ __global__ void applyBiasRopeUpdateKVCacheV2(QKVPreprocessingParams(global_token_idx) * params.q_hidden_size + hidden_idx; - QuantizedEltType* quantized_q_ptr = STORE_QKV - ? reinterpret_cast(params.QuantizedQKV) + src_q_idx - : reinterpret_cast(params.Q) + dst_q_idx; VecT* q_ptr = STORE_QKV ? reinterpret_ptr(params.QKV, src_q_idx) : reinterpret_ptr(params.Q, dst_q_idx); // Cast float scale to dst data type. using TScale = typename mmha::kv_cache_scale_type_t::Type; - TScale scaleOrigQuant; + [[maybe_unused]] TScale scaleOrigQuant; if constexpr (FP8_OUTPUT || ENABLE_8BITS_CACHE) { mmha::convert_from_float(&scaleOrigQuant, params.kvScaleOrigQuant ? 
params.kvScaleOrigQuant[0] : 1.0f); @@ -830,6 +827,9 @@ __global__ void applyBiasRopeUpdateKVCacheV2(QKVPreprocessingParams(params.QuantizedQKV) + src_q_idx + : reinterpret_cast(params.Q) + dst_q_idx; mmha::store_8bits_vec(quantized_q_ptr, q, 0, scaleOrigQuant); } else diff --git a/cpp/tensorrt_llm/layers/decodingLayer.cpp b/cpp/tensorrt_llm/layers/decodingLayer.cpp index 1d91a626b..7e5c75964 100644 --- a/cpp/tensorrt_llm/layers/decodingLayer.cpp +++ b/cpp/tensorrt_llm/layers/decodingLayer.cpp @@ -19,6 +19,7 @@ #include "tensorrt_llm/layers/beamSearchLayer.h" #include "tensorrt_llm/layers/decodingParams.h" #include "tensorrt_llm/layers/explicitDraftTokensLayer.h" +#include "tensorrt_llm/layers/externalDraftTokensLayer.h" #include "tensorrt_llm/layers/layerUtils.h" #include "tensorrt_llm/layers/lookaheadDecodingLayer.h" #include "tensorrt_llm/layers/medusaDecodingLayer.h" @@ -96,6 +97,10 @@ DecodingLayer::DecodingLayer(executor::DecodingMode const& mode, DecoderDomai { mDecodingLayer = std::make_unique>(decoderDomain, mBufferManager); } + else if (mDecodingMode.isExternalDraftTokens()) + { + mDecodingLayer = std::make_unique>(mDecodingMode, decoderDomain, mBufferManager); + } else { TLLM_CHECK_WITH_INFO(false, @@ -144,6 +149,12 @@ void DecodingLayer::setup(SizeType32 batchSize, SizeType32 beamWidth, TensorC beamWidth == 1, "Decoding mode is ExplicitDraftTokens, but beamWidth != 1 (%d != 1)", beamWidth); mDecodingLayer->setup(batchSize, beamWidth, batchSlots, setupParams->decodingParams, workspace); } + else if (mDecodingMode.isExternalDraftTokens()) + { + TLLM_CHECK_WITH_INFO( + beamWidth == 1, "Decoding mode is external draft tokens, but beamWidth != 1 (%d != 1)", beamWidth); + mDecodingLayer->setup(batchSize, beamWidth, batchSlots, setupParams->decodingParams, workspace); + } else { TLLM_CHECK_WITH_INFO(false, @@ -249,6 +260,45 @@ std::tuple, std::shared_ptr(baseInputs); + auto const ite = externalDraftTokenParams->ite; + auto const step = externalDraftTokenParams->step; + auto const localBatchSize = static_cast(externalDraftTokenParams->localBatchSize); + + TLLM_CHECK_WITH_INFO(localDecoderDomain.getBeamWidth() == 1, + "Decoding mode is TopK and/or TopP, but beamWidth != 1 (%d != 1)", localDecoderDomain.getBeamWidth()); + + // In sampling, we have supported batch sampling. So, we always compute all + // sentences once. 
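+ // Slice logits and endIds down to localBatchSize, then hand the draft tensors and acceptance
+ // thresholds over to ExternalDraftTokensInputs for the external draft tokens layer.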
+ TensorConstPtr logitsSlice = ITensor::slice(*externalDraftTokenParams->logits, 0, localBatchSize); + TensorConstPtr endIdSlice = ITensor::slice(endIds, 0, localBatchSize); + auto decodeInputs = std::make_shared( + endIdSlice, externalDraftTokenParams->batchSlots, step, ite, localBatchSize); + + decodeInputs->finished = externalDraftTokenParams->finished; + + decodeInputs->logits = logitsSlice; + + if (externalDraftTokenParams->inputLengths) + { + auto& inputLengths = externalDraftTokenParams->inputLengths.value(); + decodeInputs->inputLengths = ITensor::slice(inputLengths, 0, localBatchSize); + } + decodeInputs->draftLogits = externalDraftTokenParams->draftLogits; + decodeInputs->draftProbs = externalDraftTokenParams->draftProbs; + decodeInputs->targetProbs = externalDraftTokenParams->targetProbs; + decodeInputs->numDraftTokens = externalDraftTokenParams->numDraftTokens; + decodeInputs->draftTokenIds = externalDraftTokenParams->draftTokenIds; + decodeInputs->constantThreshold = externalDraftTokenParams->constantThreshold; + decodeInputs->useRandomAcceptanceThreshold = externalDraftTokenParams->useRandomAcceptanceThreshold; + decodeInputs->step = externalDraftTokenParams->step; + decodeInputs->useDraftLogits = externalDraftTokenParams->useDraftLogits; + + preparedInputs = decodeInputs; + preparedOutputs = baseOutputs; + } else { TLLM_CHECK_WITH_INFO(false, diff --git a/cpp/tensorrt_llm/layers/decodingLayer.h b/cpp/tensorrt_llm/layers/decodingLayer.h index 78cd6b1b5..60780851f 100644 --- a/cpp/tensorrt_llm/layers/decodingLayer.h +++ b/cpp/tensorrt_llm/layers/decodingLayer.h @@ -45,7 +45,7 @@ class DecodingLayer : public BaseLayer std::shared_ptr const& inputs, std::shared_ptr const& workspace) override; - //! \brief Calls forwardSync of configired decoding layer. + //! \brief Calls forwardSync of configured decoding layer. void forwardSync(std::shared_ptr const& outputs, std::shared_ptr const& inputs, std::shared_ptr const& workspace) override; diff --git a/cpp/tensorrt_llm/layers/decodingParams.h b/cpp/tensorrt_llm/layers/decodingParams.h index 0179add1d..40dbbba1f 100644 --- a/cpp/tensorrt_llm/layers/decodingParams.h +++ b/cpp/tensorrt_llm/layers/decodingParams.h @@ -210,8 +210,13 @@ struct LookaheadSetupParams : public DecodingSetupParams TensorPtr positionOffsets; //! see LookaheadDecodingOutputs::attentionPackedMasks TensorPtr attentionPackedMasks; - //! see LookaheadDecodingOutputs::actualGenerationLengths - TensorPtr actualGenerationLengths; +}; + +class ExternalDraftTokensSetupParams : public DecodingSetupParams +{ +public: + std::optional> runtimeTopK; // [1] or [setupBatchSize] on cpu + std::optional> runtimeTopP; // [1] or [setupBatchSize] on cpu }; class BaseDecodingInputs @@ -333,6 +338,33 @@ class SamplingInputs : public DecodingInputs bool probsComputed{}; }; +class ExternalDraftTokensInputs : public DecodingInputs +{ +public: + explicit ExternalDraftTokensInputs(TensorConstPtr endIds, TensorConstPtr batchSlots, runtime::SizeType32 step, + runtime::SizeType32 ite, runtime::SizeType32 localBatchSize) + : DecodingInputs{std::move(endIds), std::move(batchSlots), step, ite, localBatchSize} + { + } + + TensorPtr draftLogits; + TensorPtr draftProbs; + TensorPtr targetProbs; + TensorPtr numDraftTokens; + TensorPtr draftTokenIds; + TensorPtr useDraftLogits; + runtime::SizeType32 step; + float constantThreshold; + bool useRandomAcceptanceThreshold; + + //! optional parameters + //! [localBatchSize] + curandState_t* curandStates{}; + + //! 
Flag to mark that logits tensor contains probabilities + bool probsComputed{}; +}; + // Medusa inputs class MedusaDecodingInputs : public DecodingInputs { @@ -479,7 +511,7 @@ class BeamSearchOutputs : public BaseDecodingOutputs //! {c'} is always accepted and {x', z'} is supposed to be accepted. //! The accepted tokens [c', x', z'] is saved in `outputIds` in-place, starting from `sequenceLength`. //! The `acceptedLength` is 3, and the accepted draft tokens length is 2. -//! `sequenceLength` is also increaded by `acceptedLength` in-place. +//! `sequenceLength` is also increased by `acceptedLength` in-place. //! The pathsOffset is {0, 1, 3} for {c', x', z'}. //! [] for accepted, <> for draft, {} for input/output. //! @@ -551,8 +583,6 @@ class LookaheadDecodingOutputs : public SpeculativeDecodingOutputs TensorPtr positionOffsets; //! [maxBatchSize, maxDecodingTokens] TensorPtr positionIds; - //! The actual decoding tokens length, for debug and for future. - TensorPtr actualGenerationLengths; }; class ExplicitDraftTokensOutputs : public SpeculativeDecodingOutputs diff --git a/cpp/tensorrt_llm/layers/externalDraftTokensLayer.cpp b/cpp/tensorrt_llm/layers/externalDraftTokensLayer.cpp new file mode 100644 index 000000000..097fe116e --- /dev/null +++ b/cpp/tensorrt_llm/layers/externalDraftTokensLayer.cpp @@ -0,0 +1,514 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "externalDraftTokensLayer.h" +#include "tensorrt_llm/common/cudaUtils.h" +#include "tensorrt_llm/kernels/decodingCommon.h" +#include "tensorrt_llm/kernels/samplingTopKKernels.h" +#include "tensorrt_llm/kernels/samplingTopPKernels.h" +#include "tensorrt_llm/kernels/speculativeDecoding/externalDraftTokensKernels.h" +#include "tensorrt_llm/layers/defaultDecodingParams.h" +#include "tensorrt_llm/layers/layerUtils.h" +#include "tensorrt_llm/runtime/runtimeKernels.h" + +#include + +namespace tksd = tensorrt_llm::kernels::speculative_decoding; + +using namespace tensorrt_llm::common; +using namespace tensorrt_llm::kernels; +using namespace tensorrt_llm::runtime; + +namespace tensorrt_llm::layers +{ + +template +ExternalDraftTokensLayer::ExternalDraftTokensLayer(executor::DecodingMode const& mode, + DecoderDomain const& decoderDomain, std::shared_ptr bufferManager) + : BaseLayer(decoderDomain, bufferManager) + , mDecodingMode(mode) +{ + TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); + + TLLM_CHECK_WITH_INFO(!mDecodingMode.isBeamSearch(), "ExternalDraftTokensLayer does not support Beam search mode"); + + allocateBuffer(decoderDomain.getBatchSize()); + + TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); +} + +template +void ExternalDraftTokensLayer::allocateBuffer(SizeType32 batchSize) +{ + TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); + + // top k workspace size + auto workspaceSize = getTopKWorkspaceSize(batchSize, 1, TOP_K_MAX, mDecoderDomain.getVocabSizePadded()); + mWorkspaceSize = std::max(workspaceSize, mWorkspaceSize); + // top p workspace size + workspaceSize = getTopPWorkspaceSize(batchSize, mDecoderDomain.getVocabSizePadded()); + mWorkspaceSize = std::max(workspaceSize, mWorkspaceSize); + // multinomial (top p == 1) workspace size + workspaceSize = getTopPWorkspaceSize(batchSize, mDecoderDomain.getVocabSizePadded()); + mWorkspaceSize = std::max(workspaceSize, mWorkspaceSize); + + // batchsize here is maxBatchSize + auto const batchSizeShape = ITensor::makeShape({batchSize}); + + mCurandStatesDevice + = mBufferManager->gpu(ITensor::makeShape({batchSize, sizeof(curandState_t)}), TRTDataType::value); + mBatchIsAccepted = mBufferManager->gpu(batchSizeShape, TRTDataType::value); + mRuntimeMultinomialDevice = mBufferManager->gpu(batchSizeShape, TRTDataType::value); + + // host buffers. 
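+ // (skip-decode flags are kept both on device and as pinned host copies; the sampling and masking
+ // buffers allocated below are device-only)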
+ mSkipTopKDecodeDevice = mBufferManager->gpu(batchSizeShape, TRTDataType::value); + mSkipTopKDecodeHost = mBufferManager->pinnedPool(batchSizeShape, TRTDataType::value); + mSkipTopPDecodeDevice = mBufferManager->gpu(batchSizeShape, TRTDataType::value); + mSkipTopPDecodeHost = mBufferManager->pinnedPool(batchSizeShape, TRTDataType::value); + auto skipTopPDecodeHostRange = BufferRange(*mSkipTopPDecodeHost); + std::fill(skipTopPDecodeHostRange.begin(), skipTopPDecodeHostRange.end(), true); + + mOutputIdsAfterSampling = mBufferManager->gpu( + ITensor::makeShape({batchSize, mDecoderDomain.getVocabSizePadded()}), TRTDataType::value); + mTargetOutputIds = mBufferManager->gpu(ITensor::makeShape({batchSize}), TRTDataType::value); + + mRuntimeTopKDevice = mBufferManager->gpu(batchSizeShape, TRTDataType::value); + + mRuntimeTopPForTopKDevice = mBufferManager->gpu(batchSizeShape, TRTDataType::value); + + mRuntimeTopPDevice = mBufferManager->gpu(batchSizeShape, TRTDataType::value); + mInitialTopPDevice = mBufferManager->gpu(batchSizeShape, TRTDataType::value); + + mMaskBuffer = mBufferManager->gpu( + ITensor::makeShape({batchSize, mDecoderDomain.getVocabSizePadded()}), TRTDataType::value); + + mSetupWorkspaceSize = std::max({mBatchIsAccepted->getSizeInBytes(), mRuntimeMultinomialDevice->getSizeInBytes(), + mSkipTopKDecodeDevice->getSizeInBytes(), mSkipTopPDecodeDevice->getSizeInBytes(), + mOutputIdsAfterSampling->getSizeInBytes(), mTargetOutputIds->getSizeInBytes(), + mRuntimeTopKDevice->getSizeInBytes(), mRuntimeTopPForTopKDevice->getSizeInBytes(), + mRuntimeTopPDevice->getSizeInBytes(), mInitialTopPDevice->getSizeInBytes(), mMaskBuffer->getSizeInBytes()}); + + mTargetLogits = mBufferManager->gpu( + ITensor::makeShape({batchSize, mDecoderDomain.getVocabSizePadded()}), TRTDataType::value); + + TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); +} + +template +void ExternalDraftTokensLayer::setup(SizeType32 batchSize, SizeType32 beamWidth, TensorConstPtr batchSlots, + std::shared_ptr const& baseSetupParams, + std::shared_ptr const& workspace) +{ + TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); + + auto setupParams = std::dynamic_pointer_cast(baseSetupParams); + + workspace->initializeDeviceCurandStates( + setupParams->randomSeed, batchSize, workspace->getDeviceBatchSlots(), mCurandStatesDevice); + + auto const* batchSlotsDevicePtr = workspace->getDeviceBatchSlotsPtr(); + auto& runtimeMultinomialDeviceTensor = const_cast(*mRuntimeMultinomialDevice); + tensorrt_llm::runtime::kernels::invokeFill(runtimeMultinomialDeviceTensor, 1.0f, mBufferManager->getStream()); + + auto* runtimeTopKDevicePtr = bufferCastOrNull(mRuntimeTopKDevice); + + // Prepare runtime top K + auto constexpr defaultTopK = 1u; + auto runtimeTopK = setupParams->runtimeTopK.value_or(std::vector(batchSize, defaultTopK)); + auto const runtimeTopKSize = runtimeTopK.size(); + for (auto& topK : runtimeTopK) + { + if (topK < 0 || topK > TOP_K_MAX) + { + TLLM_LOG_WARNING( + "TopK (%d) is larger than max supported number (%d). 
Clip to max supported number.", topK, TOP_K_MAX); + topK = std::clamp(topK, 0, static_cast(TOP_K_MAX)); + } + } + + if (runtimeTopKSize > 1) + { + TLLM_CHECK_WITH_INFO(runtimeTopK.size() == batchSize, + fmtstr("runtimeTopK.size() (%lu) == batchSize (%d) is not satisfied!", runtimeTopK.size(), batchSize)); + DecodingLayerWorkspace::copyToWorkspace( + *this->mBufferManager, runtimeTopK, workspace->getWorkspaceDeviceBuffer()); + auto* setupWorkspaceDevicePtr = workspace->getWorkspaceDevicePtrAs(); + // fill top ks into runtimeTopKDevice + invokeScatterDecodingParams( + setupWorkspaceDevicePtr, runtimeTopKDevicePtr, batchSlotsDevicePtr, batchSize, getStream()); + } + + // FIXME(nkorobov): monotonically growing + auto const curMaxTopK = *std::max_element(std::begin(runtimeTopK), std::end(runtimeTopK)); + mRuntimeMaxTopK = std::max(mRuntimeMaxTopK, curMaxTopK); + + auto runtimeTopP = setupParams->runtimeTopP.value_or(std::vector{}); + auto const runtimeTopPSize = runtimeTopP.size(); + auto* runtimeTopPForTopKDevicePtr = bufferCastOrNull(mRuntimeTopPForTopKDevice); + auto* runtimeTopPDevicePtr = bufferCastOrNull(mRuntimeTopPDevice); + auto* skipTopPDecodeHostPtr = bufferCastOrNull(mSkipTopPDecodeHost); + + // if no top P, fill topP skip decode to true + if (runtimeTopPSize == 0) + { + auto const* batchSlotsPtr = bufferCast(*batchSlots); + for (SizeType32 bi = 0; bi < batchSize; ++bi) + { + auto const bid = batchSlotsPtr[bi]; + skipTopPDecodeHostPtr[bid] = true; + } + auto skipTopPDecodeHostSlice = IBuffer::slice(mSkipTopPDecodeHost, 0, mDecoderDomain.getBatchSize()); + mBufferManager->copy(*skipTopPDecodeHostSlice, *mSkipTopPDecodeDevice); + } + else + { + for (auto& topP : runtimeTopP) + { + if (topP < 0.f || topP > 1.0f) + { + TLLM_LOG_WARNING("TopP (%f) is out of range ([0.0, 1.0f]). Clip to closest number.", topP); + topP = std::clamp(topP, 0.f, 1.f); + } + } + if (runtimeTopPSize > 1) + { + TLLM_CHECK_WITH_INFO(runtimeTopP.size() == batchSize, + fmtstr("runtimeTopP.size() (%lu) == batchSize (%d) is not satisfied!", runtimeTopP.size(), batchSize)); + DecodingLayerWorkspace::copyToWorkspace( + *this->mBufferManager, runtimeTopP, workspace->getWorkspaceDeviceBuffer()); + auto* setupWorkspaceDevicePtr = workspace->getWorkspaceDevicePtrAs(); + // fill runtime top p device for top k kernel + invokeScatterDecodingParams( + setupWorkspaceDevicePtr, runtimeTopPForTopKDevicePtr, batchSlotsDevicePtr, batchSize, getStream()); + // fill runtime top p device for top p kernel + invokeScatterDecodingParams( + setupWorkspaceDevicePtr, runtimeTopPDevicePtr, batchSlotsDevicePtr, batchSize, getStream()); + } + } + // if no topP, default topP is 0.0f, but in invokeSetupTopKRuntimeArgs, it gets set to 1.0f if k > 0 + auto const topP = (runtimeTopPSize == 0) ? DefaultDecodingParams::getTopP() : runtimeTopP.front(); + + auto* skipTopKDecodeDevicePtr = bufferCastOrNull(mSkipTopKDecodeDevice); + { + dim3 block(std::min(static_cast(batchSize), 256u)); + dim3 grid(divUp(static_cast(batchSize), block.x)); + // support topK up to TOP_K_MAX. 
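The code above scatters the per-slot TopK and TopP values; the calls that follow derive the skip-decode flags, and which sampling kernel ends up serving a request follows the rule restated in forwardAsync below: the TopK kernel covers k > 0 as well as k == 0 with p == 0, while the TopP kernel runs only for k == 0 with p > 0. A minimal sketch of that routing rule; SamplingPath and routeRequest are hypothetical names, the real dispatch is encoded in the skip-decode flags filled in by the invoke*RuntimeArgs calls:

    // Illustration of the per-request dispatch rule described in forwardAsync.
    enum class SamplingPath
    {
        TopK,
        TopP
    };

    SamplingPath routeRequest(runtime::SizeType32 k, float p)
    {
        // Only k == 0 combined with p > 0 is served by the TopP kernel;
        // k > 0, or k == 0 with p == 0, goes through the TopK kernel.
        return (k == 0 && p > 0.f) ? SamplingPath::TopP : SamplingPath::TopK;
    }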
+ invokeSetupTopKRuntimeArgs(batchSize, curMaxTopK, runtimeTopKDevicePtr, runtimeTopKSize, topP, + runtimeTopPForTopKDevicePtr, runtimeTopPSize, skipTopKDecodeDevicePtr, batchSlotsDevicePtr, getStream()); + } + auto const skipTopKHostDecodeDeviceSlice = ITensor::slice(mSkipTopKDecodeDevice, 0, mDecoderDomain.getBatchSize()); + auto skipTopKDecodeHostSlice = ITensor::slice(mSkipTopKDecodeHost, 0, mDecoderDomain.getBatchSize()); + mBufferManager->copy(*skipTopKHostDecodeDeviceSlice, *skipTopKDecodeHostSlice); + + auto* skipTopPDecodeDevicePtr = bufferCast(*mSkipTopPDecodeDevice); + { + auto* initialTopPDevicePtr = bufferCast(*mInitialTopPDevice); + invokeSetTopPRuntimeArgs(batchSize, curMaxTopK, runtimeTopKDevicePtr, runtimeTopKSize, topP, + runtimeTopPDevicePtr, runtimeTopPSize, skipTopPDecodeDevicePtr, batchSlotsDevicePtr, initialTopPDevicePtr, + getStream()); + } + auto const skipTopPHostDecodeDeviceSlice = ITensor::slice(mSkipTopPDecodeDevice, 0, mDecoderDomain.getBatchSize()); + auto skipTopPDecodeHostSlice = ITensor::slice(mSkipTopPDecodeHost, 0, mDecoderDomain.getBatchSize()); + mBufferManager->copy(*skipTopPHostDecodeDeviceSlice, *skipTopPDecodeHostSlice); + + TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); +} + +template +void ExternalDraftTokensLayer::forwardAsync(std::shared_ptr const& outputs, + std::shared_ptr const& baseInputs, + std::shared_ptr const& workspace) +{ + TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); + + auto inputs = std::dynamic_pointer_cast(baseInputs); + + auto const batchSize = inputs->logits.value()->getDimension<0>(); + + auto const* endIds = bufferCast(*inputs->endIds); + + FinishedState const* finishedInput = (inputs->finished) + ? reinterpret_cast(bufferCast(*inputs->finished.value())) + : nullptr; + + inputs->curandStates = reinterpret_cast(bufferCast(*mCurandStatesDevice)); + inputs->probsComputed = true; + + auto runtimeLogitsPtr = bufferCast(*workspace->getDeviceRuntimeLogits()); + auto logitsPtrsPtr = static_cast(nullptr); + auto biasPtr = static_cast(nullptr); + auto const* batchSlotsPtr = workspace->getDeviceBatchSlotsPtr(); + mBufferManager->copy(runtimeLogitsPtr, *mTargetLogits); + invokeAddBiasSoftMax(runtimeLogitsPtr, logitsPtrsPtr, runtimeLogitsPtr, biasPtr, endIds, finishedInput, + batchSlotsPtr, batchSize, mDecoderDomain.getBatchSize(), /* bw */ 1, mDecoderDomain.getVocabSize(), + mDecoderDomain.getVocabSizePadded(), /*skipSoftMax*/ false, /* batchSlotLogits */ false, getStream()); + + auto const targetTokenIdsShape = (*outputs->outputIds).getShape(); + + // Fill the buffer for selected ids from sampling with zero. -1 will be set as a boundary if topP kernel is required + auto& outputIdsAfterSamplingTensor = const_cast(*mOutputIdsAfterSampling); + tensorrt_llm::runtime::kernels::invokeFill(outputIdsAfterSamplingTensor, 0, mBufferManager->getStream()); + + // The logits from target engine should go through samplings first. + // gptDecoderBatched.cpp is calling dynamic decoder step by step, in this step, dynamic Decoder already forwarded + // PenaltyLayer, BanWordsLayer. For (TopK > 0) && (TopK == 0 && TopP == 0), we invoke TopK sampling kernel. The same + // logic is implemented in SamplingLayer.cpp + getAllTopKs(outputs, baseInputs, workspace); + + // Only for (TopK == 0 && TopP > 0), we invoke TopP sampling + getAllTopPs(outputs, baseInputs, workspace); + + // After all selected tokens are filled in mOutputIdsAfterSampling by topK, topP kernels, token acceptance logics + // starts. 
First we mask the logits of unselected token id to -inf as HF's TopK, TopP implementation. We compute the + // logit probs of draft and target and go through acceptance logics. + acceptDraftTokens(outputs, baseInputs, workspace); + + // If the token of the sequence is not accepted, a multinomial sampling is required for the bonus token. + // Multinomial sampling is achieved through TopP kernel with TopP = 1 and already weighted-sum target logits. + // The acceptance result of each batch is used as skipDecode in topP kernel. If is accepted, no sampling is needed + // (early exit). Forwarding for the next step is also set in this kernel. + multinomialSampling(outputs, baseInputs, workspace); + + // For the sequence with accepted tokens, we simply forward a step. + forwardAcceptedTokens(outputs, baseInputs, workspace); + + TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); +} + +template +size_t ExternalDraftTokensLayer::getWorkspaceSize() const noexcept +{ + return std::max(mWorkspaceSize, mSetupWorkspaceSize); +} + +template +void ExternalDraftTokensLayer::acceptDraftTokens(std::shared_ptr const& outputs, + std::shared_ptr const& baseInputs, + std::shared_ptr const& workspace) +{ + TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); + auto inputs = std::dynamic_pointer_cast(baseInputs); + + auto const draftLogitsShape = (*inputs->draftLogits).getShape(); + auto const maxBatchSize = mDecoderDomain.getBatchSize(); + auto const maxTokensPerStep = draftLogitsShape.d[1]; // 1 + auto const batchSize = inputs->logits.value()->getDimension<0>(); + auto constexpr beamWidth = 1; + + FinishedState const* finishedInput = (inputs->finished) + ? reinterpret_cast(bufferCastOrNull(inputs->finished)) + : nullptr; + + FinishedState* finishedOutput = (outputs->finished) + ? 
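A host-side sketch of the masking step mentioned here, assuming the selected candidate ids are available as a flat list; the kernel works on mOutputIdsAfterSampling and a reusable mask buffer instead:

    #include <cstddef>
    #include <limits>
    #include <vector>

    // Token ids that survived top-K/top-P selection keep their logits; every
    // other id is pushed to -inf so the subsequent softmax gives it zero mass.
    void maskUnselectedLogits(std::vector<float>& logits, std::vector<int> const& selectedIds)
    {
        std::vector<bool> keep(logits.size(), false);
        for (int id : selectedIds)
        {
            keep[id] = true;
        }
        for (std::size_t v = 0; v < logits.size(); ++v)
        {
            if (!keep[v])
            {
                logits[v] = -std::numeric_limits<float>::infinity();
            }
        }
    }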
reinterpret_cast(bufferCastOrNull(outputs->finished)) + : nullptr; + + tksd::invokeMaskTargetLogits(batchSize, bufferCast(*mTargetLogits), workspace->getDeviceBatchSlotsPtr(), + beamWidth, mDecoderDomain.getVocabSizePadded(), finishedInput, maxBatchSize, + bufferCast(*inputs->useDraftLogits), bufferCast(*mOutputIdsAfterSampling), + bufferCast(*mTargetOutputIds), bufferCastOrNull(mRuntimeTopKDevice), + bufferCast(*mMaskBuffer), getStream()); + + if (inputs->step == 0) + { + invokeAddBiasSoftMax(bufferCast(*inputs->draftLogits), static_cast(nullptr), + bufferCast(*inputs->draftProbs), static_cast(nullptr), nullptr, finishedInput, + workspace->getDeviceBatchSlotsPtr(), batchSize, maxBatchSize, beamWidth * maxTokensPerStep, + mDecoderDomain.getVocabSize(), mDecoderDomain.getVocabSizePadded(), + /* skip softmax */ false, + /* batchSlotLogits */ true, getStream()); + } + + invokeAddBiasSoftMax(bufferCast(*mTargetLogits), static_cast(nullptr), bufferCast(*inputs->targetProbs), + static_cast(nullptr), nullptr, finishedInput, workspace->getDeviceBatchSlotsPtr(), batchSize, maxBatchSize, + beamWidth /* 1 */, mDecoderDomain.getVocabSize(), mDecoderDomain.getVocabSizePadded(), + /* skip softmax */ false, + /* batchSlotLogits */ false, getStream()); + + sync_check_cuda_error(); + + tksd::invokeAcceptDraftTokens(batchSize, bufferCast(*inputs->draftProbs), bufferCast(*inputs->targetProbs), + bufferCast(*inputs->numDraftTokens), bufferCast(*inputs->useDraftLogits), + bufferCast(*inputs->draftTokenIds), finishedInput, finishedOutput, inputs->curandStates, + workspace->getDeviceBatchSlotsPtr(), maxTokensPerStep, beamWidth, mDecoderDomain.getVocabSizePadded(), + inputs->useRandomAcceptanceThreshold, inputs->constantThreshold, inputs->step, + bufferCast(*mBatchIsAccepted), bufferCast(*mTargetOutputIds), getStream()); + + sync_check_cuda_error(); + + TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); +} + +template +void ExternalDraftTokensLayer::multinomialSampling(std::shared_ptr const& outputs, + std::shared_ptr const& baseInputs, + std::shared_ptr const& workspace) +{ + TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); + auto inputs = std::dynamic_pointer_cast(baseInputs); + + auto const batchSize = inputs->logits.value()->getDimension<0>(); + auto probs = bufferCastOrNull(inputs->targetProbs); + auto* sequenceLength = bufferCastOrNull(outputs->sequenceLength); + auto const* endIds = bufferCastOrNull(inputs->endIds); + + FinishedState* finishedOutput = (outputs->finished) + ? 
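For context, a scalar sketch of the acceptance test and the multinomial fallback described in the comments, reduced to one request and one draft token. The threshold may be a uniform random draw or the configured constant; how the kernels normalize the fallback distribution is not reproduced here, so treat this as an illustration of the control flow rather than the kernel math:

    #include <algorithm>
    #include <random>
    #include <vector>

    // pDraft and pTarget are full-vocabulary probability vectors for one step.
    bool acceptDraftToken(std::vector<float> const& pDraft, std::vector<float> const& pTarget,
        int draftTokenId, float threshold)
    {
        float const ratio = pTarget[draftTokenId] / std::max(pDraft[draftTokenId], 1e-9f);
        return ratio > threshold; // accepted: keep the draft token and advance
    }

    // On rejection a bonus token is drawn from the target distribution; the layer
    // implements this with a top-P kernel at P = 1, i.e. a plain multinomial draw.
    int sampleMultinomial(std::vector<float> const& pTarget, std::mt19937& gen)
    {
        std::discrete_distribution<int> dist(pTarget.begin(), pTarget.end());
        return dist(gen);
    }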
reinterpret_cast(bufferCastOrNull(outputs->finished)) + : nullptr; + TopPSamplingKernelParams params{}; + + params.probs = probs; + params.outputIdsPtrs = bufferCastOrNull(outputs->outputIdsPtr); + params.workspace = workspace->getRawWorkspaceDevicePtr(); + params.topPs = bufferCastOrNull(mRuntimeMultinomialDevice); + params.sequenceLength = sequenceLength; + params.endIds = endIds; + params.batchSlots = workspace->getDeviceBatchSlotsPtr(); + params.finishedInput = nullptr; + params.finishedOutput = finishedOutput; + params.skipDecode = bufferCastOrNull(mBatchIsAccepted); + params.cumLogProbs = nullptr; + params.outputLogProbs = nullptr; + params.curandState = inputs->curandStates; + params.batchSize = batchSize; + params.maxBatchSize = mDecoderDomain.getBatchSize(); + params.vocabSizePadded = mDecoderDomain.getVocabSizePadded(); + + invokeBatchTopPSampling(params, getStream()); + TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); +} + +template +void ExternalDraftTokensLayer::getAllTopKs(std::shared_ptr const& outputs, + std::shared_ptr const& baseInputs, + std::shared_ptr const& workspace) +{ + TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); + auto inputs = std::dynamic_pointer_cast(baseInputs); + + auto logits = bufferCastOrNull(inputs->logits); + + auto const batchSize = inputs->logits.value()->getDimension<0>(); + + auto const* batchSlotsHost = bufferCast(*inputs->batchSlots); + auto* skipDecodeHostPtr = bufferCastOrNull(mSkipTopKDecodeHost); + auto const skip = allOfBatchSlots(batchSlotsHost, skipDecodeHostPtr, batchSize, true); + if (skip) + { + return; + } + + FinishedState const* finishedInput = (inputs->finished) + ? reinterpret_cast(bufferCastOrNull(inputs->finished)) + : nullptr; + + TopKSamplingKernelParams params{}; + params.logProbs = logits; + params.outputIds = bufferCastOrNull(mOutputIdsAfterSampling); + params.workspace = workspace->getRawWorkspaceDevicePtr(); + params.maxTopP = 1.0f; + params.topPs = bufferCastOrNull(mRuntimeTopPForTopKDevice); + params.maxTopK = mRuntimeMaxTopK; + params.topKs = bufferCastOrNull(mRuntimeTopKDevice); + params.batchSlots = workspace->getDeviceBatchSlotsPtr(); + params.finishedInput = finishedInput; + params.skipDecode = bufferCastOrNull(mSkipTopKDecodeDevice); + params.curandState = inputs->curandStates; + params.batchSize = batchSize; + params.maxBatchSize = mDecoderDomain.getBatchSize(); + params.maxTokensPerStep = 1; + params.vocabSizePadded = mDecoderDomain.getVocabSizePadded(); + params.returnAllSelectedTokens = true; + params.maxSeqLen = mDecoderDomain.getVocabSizePadded(); // workaround for returning all topKs with outputIds + params.logitsHasProbs = inputs->probsComputed; + + invokeBatchTopKSampling(params, getStream()); + TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); +} + +template +void ExternalDraftTokensLayer::getAllTopPs(std::shared_ptr const& outputs, + std::shared_ptr const& baseInputs, + std::shared_ptr const& workspace) +{ + TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); + auto inputs = std::dynamic_pointer_cast(baseInputs); + + auto logits = bufferCastOrNull(inputs->logits); + + auto const batchSize = inputs->logits.value()->getDimension<0>(); + + auto const* batchSlotsHost = bufferCast(*inputs->batchSlots); + auto* skipDecodeHostPtr = bufferCastOrNull(mSkipTopPDecodeHost); + auto const skip = allOfBatchSlots(batchSlotsHost, skipDecodeHostPtr, batchSize, true); + if (skip) + { + return; + } + + FinishedState const* finishedInput = (inputs->finished) + ? 
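getAllTopKs runs the top-K kernel in "return all selected tokens" mode, so it produces the whole candidate set rather than a single sample. A host-side equivalent of that candidate selection, using std::partial_sort:

    #include <algorithm>
    #include <numeric>
    #include <vector>

    // Return the ids of the K highest-probability tokens so a later step can
    // build an acceptance mask from them.
    std::vector<int> topKCandidates(std::vector<float> const& probs, int k)
    {
        std::vector<int> ids(probs.size());
        std::iota(ids.begin(), ids.end(), 0);
        k = std::clamp<int>(k, 0, static_cast<int>(ids.size()));
        std::partial_sort(ids.begin(), ids.begin() + k, ids.end(),
            [&](int a, int b) { return probs[a] > probs[b]; });
        ids.resize(k);
        return ids;
    }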
reinterpret_cast(bufferCastOrNull(inputs->finished)) + : nullptr; + + TopPSamplingKernelParams params{}; + params.probs = logits; + params.outputIds = bufferCastOrNull(mOutputIdsAfterSampling); + params.workspace = workspace->getRawWorkspaceDevicePtr(); + params.topPs = bufferCastOrNull(mRuntimeTopPDevice); + params.batchSlots = workspace->getDeviceBatchSlotsPtr(); + params.finishedInput = finishedInput; + params.skipDecode = bufferCastOrNull(mSkipTopPDecodeDevice); + params.curandState = inputs->curandStates; + params.batchSize = batchSize; + params.maxBatchSize = mDecoderDomain.getBatchSize(); + params.vocabSizePadded = mDecoderDomain.getVocabSizePadded(); + params.returnAllSelectedTokens = true; + params.maxSeqLen = mDecoderDomain.getVocabSizePadded(); + + invokeBatchTopPSampling(params, getStream()); + TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); +} + +template +void ExternalDraftTokensLayer::forwardAcceptedTokens(std::shared_ptr const& outputs, + std::shared_ptr const& baseInputs, + std::shared_ptr const& workspace) +{ + TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); + auto inputs = std::dynamic_pointer_cast(baseInputs); + auto const batchSize = inputs->logits.value()->getDimension<0>(); + + auto const draftLogitsShape = (*inputs->draftLogits).getShape(); + auto const maxTokensPerStep = draftLogitsShape.d[1]; // 1 + + FinishedState* finishedOutput = (outputs->finished) + ? reinterpret_cast(bufferCastOrNull(outputs->finished)) + : nullptr; + + tksd::invokeForwardAcceptedTokens(batchSize, workspace->getDeviceBatchSlotsPtr(), + bufferCast(*mBatchIsAccepted), bufferCastOrNull(outputs->sequenceLength), + bufferCast(*inputs->draftTokenIds), bufferCastOrNull(outputs->outputIdsPtr), + inputs->step, maxTokensPerStep, bufferCastOrNull(inputs->endIds), finishedOutput, getStream()); + + sync_check_cuda_error(); + + TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); +} + +template class ExternalDraftTokensLayer; +template class ExternalDraftTokensLayer; + +} // namespace tensorrt_llm::layers diff --git a/cpp/tensorrt_llm/layers/externalDraftTokensLayer.h b/cpp/tensorrt_llm/layers/externalDraftTokensLayer.h new file mode 100644 index 000000000..4122c7c35 --- /dev/null +++ b/cpp/tensorrt_llm/layers/externalDraftTokensLayer.h @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "tensorrt_llm/executor/types.h" +#include "tensorrt_llm/layers/baseLayer.h" +#include "tensorrt_llm/layers/decodingParams.h" +#include "tensorrt_llm/runtime/common.h" + +#include + +namespace tensorrt_llm::layers +{ + +//! \brief Top class for sampling layers. +//! 
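getAllTopPs is the nucleus-sampling counterpart: it keeps the smallest probability-sorted prefix of the vocabulary whose cumulative mass reaches P. A minimal host sketch of that selection:

    #include <algorithm>
    #include <numeric>
    #include <vector>

    std::vector<int> topPCandidates(std::vector<float> const& probs, float p)
    {
        std::vector<int> ids(probs.size());
        std::iota(ids.begin(), ids.end(), 0);
        std::sort(ids.begin(), ids.end(), [&](int a, int b) { return probs[a] > probs[b]; });
        std::vector<int> selected;
        float cum = 0.f;
        for (int id : ids)
        {
            selected.push_back(id); // always keep at least the most likely token
            cum += probs[id];
            if (cum >= p)
            {
                break;
            }
        }
        return selected;
    }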
It sets up and executes TopKSamplingLayer and TopPSamplingLayer samplings +template +class ExternalDraftTokensLayer : public BaseLayer +{ +public: + using Base = BaseLayer; + + ExternalDraftTokensLayer(executor::DecodingMode const& mode, DecoderDomain const& decoderDomain, + std::shared_ptr bufferManager); + + void setup(runtime::SizeType32 batchSize, runtime::SizeType32 beamWidth, TensorConstPtr batchSlots, + std::shared_ptr const& setupParams, + std::shared_ptr const& workspace) override; + + void forwardAsync(std::shared_ptr const& outputs, + std::shared_ptr const& inputs, + std::shared_ptr const& workspace) override; + + //! @returns workspace needed for this layer in bytes + [[nodiscard]] size_t getWorkspaceSize() const noexcept override; + +protected: + runtime::SizeType32 mRuntimeMaxTopK{0}; + +private: + using Base::mDecoderDomain; + + executor::DecodingMode mDecodingMode; + + size_t mWorkspaceSize{0}; + size_t mSetupWorkspaceSize{0}; + + TensorPtr mCurandStatesDevice; + TensorPtr mSkipTopKDecodeDevice; + TensorPtr mSkipTopKDecodeHost; + TensorPtr mSkipTopPDecodeDevice; + TensorPtr mSkipTopPDecodeHost; + + TensorPtr mBatchIsAccepted; + TensorPtr mRuntimeMultinomialDevice; + + TensorPtr mOutputIdsAfterSampling; + TensorPtr mTargetOutputIds; + TensorPtr mRuntimeTopKDevice; + TensorPtr mRuntimeTopPForTopKDevice; + TensorPtr mRuntimeTopPDevice; + TensorPtr mInitialTopPDevice; + TensorPtr mMaskBuffer; + + TensorPtr mTargetLogits; + +private: + void allocateBuffer(runtime::SizeType32 batchSize); + void acceptDraftTokens(std::shared_ptr const& outputs, + std::shared_ptr const& baseInputs, + std::shared_ptr const& workspace); + void multinomialSampling(std::shared_ptr const& outputs, + std::shared_ptr const& baseInputs, + std::shared_ptr const& workspace); + void getAllTopKs(std::shared_ptr const& outputs, + std::shared_ptr const& baseInputs, + std::shared_ptr const& workspace); + void getAllTopPs(std::shared_ptr const& outputs, + std::shared_ptr const& baseInputs, + std::shared_ptr const& workspace); + void forwardAcceptedTokens(std::shared_ptr const& outputs, + std::shared_ptr const& baseInputs, + std::shared_ptr const& workspace); +}; + +} // namespace tensorrt_llm::layers diff --git a/cpp/tensorrt_llm/layers/lookaheadAlgorithm.cpp b/cpp/tensorrt_llm/layers/lookaheadAlgorithm.cpp index 5b3062be0..db78160b9 100644 --- a/cpp/tensorrt_llm/layers/lookaheadAlgorithm.cpp +++ b/cpp/tensorrt_llm/layers/lookaheadAlgorithm.cpp @@ -18,8 +18,12 @@ #include "tensorrt_llm/common/assert.h" #include "tensorrt_llm/common/logger.h" #include "tensorrt_llm/executor/executor.h" +#include "tensorrt_llm/layers/decodingParams.h" #include "tensorrt_llm/layers/lookaheadDecodingUtils.h" +#include "tensorrt_llm/runtime/common.h" +#include "tensorrt_llm/runtime/iTensor.h" #include "tensorrt_llm/runtime/lookaheadModule.h" +#include #include namespace tensorrt_llm::layers @@ -27,6 +31,36 @@ namespace tensorrt_llm::layers using namespace tensorrt_llm::runtime; +LookaheadAlgorithm::LookaheadAlgorithm( + runtime::SizeType32 maxW, runtime::SizeType32 maxN, runtime::SizeType32 maxG, runtime::SizeType32 id) + : mMaxW(maxW) + , mMaxN(maxN) + , mMaxG(maxG) + , mFilling(0) + , mPoolManager(maxG) + , mId(id) + , mGoldenTokensMax( + runtime::BufferManager::cpu(runtime::ITensor::makeShape({maxN * 2 - 1}), nvinfer1::DataType::kINT32)) + , mPrefillsMax(runtime::BufferManager::cpu( + runtime::ITensor::makeShape({(maxN <= 1 ? 
0 : maxN - 2)}), nvinfer1::DataType::kINT32)) + , mKeyTokensMax(runtime::BufferManager::cpu(runtime::ITensor::makeShape({maxW}), nvinfer1::DataType::kINT32)) + , mPastTokensMax( + runtime::BufferManager::cpu(runtime::ITensor::makeShape({maxW * (maxN - 1)}), nvinfer1::DataType::kINT32)) + , mGuessTokensMax( + runtime::BufferManager::cpu(runtime::ITensor::makeShape({maxG * (maxN - 1)}), nvinfer1::DataType::kINT32)) +{ + runtime::SizeType32 maxGeneratedLen, maxDraftLen; + std::tie(maxGeneratedLen, std::ignore, maxDraftLen, std::ignore) + = executor::LookaheadDecodingConfig(maxW, maxN, maxG).calculateSpeculativeResource(); + mAttentionMask = runtime::BufferManager::cpu( + runtime::ITensor::makeShape({maxDraftLen, maxDraftLen}), nvinfer1::DataType::kBOOL); + mDraftTokensMax + = runtime::BufferManager::cpu(runtime::ITensor::makeShape({maxDraftLen}), nvinfer1::DataType::kINT32); + mSampledTokensMax + = runtime::BufferManager::cpu(runtime::ITensor::makeShape({maxGeneratedLen}), nvinfer1::DataType::kINT32); + mEncodeMapMax = runtime::BufferManager::cpu(runtime::ITensor::makeShape({maxDraftLen}), nvinfer1::DataType::kINT32); +} + void LookaheadAlgorithm::setup(TensorConstPtr const& prompt, SizeType32 w, SizeType32 n, SizeType32 g) { TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); @@ -36,7 +70,7 @@ void LookaheadAlgorithm::setup(TensorConstPtr const& prompt, SizeType32 w, SizeT mW = w; mN = n; mG = g; - std::tie(std::ignore, std::ignore, mRuntimeMaxDraftLen, std::ignore) + std::tie(std::ignore, std::ignore, mRuntimeMaxDraftLen, mRuntimeMaxDraftPathLen) = executor::LookaheadDecodingConfig(mW, mN, mG).calculateSpeculativeResource(); mPoolManager.setup(mG); @@ -81,8 +115,8 @@ void LookaheadAlgorithm::accept(TensorConstPtr const& generatedTokens) } //! lookahead has two phase, prefill the past tokens matrix and maintain past tokens matrix. 
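The lookahead branch lays out position ids so the prefill run is consecutive and each of the W windows starts one position later, mirroring the fillPosition calls in the prefill case of lookahead(). A sketch of that layout with illustrative sizes:

    #include <vector>

    // startPosId: position id of the first draft token; prefill: number of
    // prefill tokens; windowW: Jacobi window count; filling: tokens per window.
    std::vector<int> lookaheadPositionIds(int startPosId, int prefill, int windowW, int filling)
    {
        std::vector<int> posIds;
        for (int i = 0; i < prefill; ++i)
        {
            posIds.push_back(startPosId + i);
        }
        for (int w = 0; w < windowW; ++w)
        {
            for (int i = 0; i < filling; ++i)
            {
                posIds.push_back(startPosId + prefill + w + i);
            }
        }
        return posIds;
    }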
-runtime::SizeType32 LookaheadAlgorithm::lookahead(TensorPtr const& draftTokens, TensorPtr const& positionIds, - TensorPtr const& samplingMask, runtime::SizeType32 offset) +runtime::SizeType32 LookaheadAlgorithm::lookahead( + TensorPtr const& draftTokens, TensorPtr const& positionIds, runtime::SizeType32 startPosId) { TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); @@ -90,7 +124,6 @@ runtime::SizeType32 LookaheadAlgorithm::lookahead(TensorPtr const& draftTokens, SizeType32 len = prefill + mFilling * mW; TLLM_CHECK(len <= ITensor::volume(draftTokens->getShape())); TLLM_CHECK(len <= ITensor::volume(positionIds->getShape())); - TLLM_CHECK(len <= ITensor::volume(samplingMask->getShape())); BufferRange prefillRange(*mPrefills); BufferRange pastRange(*mPastTokens); BufferRange draftRange(*draftTokens); @@ -112,11 +145,6 @@ runtime::SizeType32 LookaheadAlgorithm::lookahead(TensorPtr const& draftTokens, } BufferRange positionIdsRange(*positionIds); - BufferRange samplingMaskRange(*samplingMask); - for (auto& v : samplingMaskRange) - { - v = 0; - } SizeType32 idx = 0, wj = 0; auto fillPosition = [&positionIdsRange, &idx](SizeType32 start, SizeType32 len) { @@ -127,20 +155,18 @@ runtime::SizeType32 LookaheadAlgorithm::lookahead(TensorPtr const& draftTokens, }; if (prefill >= 0) { - fillPosition(offset, prefill); + fillPosition(startPosId, prefill); for (wj = 0; wj < mW; wj++) { - fillPosition(offset + prefill + wj, mFilling); - samplingMaskRange[prefill + wj * mFilling + mFilling - 1] = true; + fillPosition(startPosId + prefill + wj, mFilling); } } else { - fillPosition(offset, mFilling - 1); + fillPosition(startPosId, mFilling - 1); for (wj = 1; wj < mW; wj++) { - fillPosition(offset - 1 + wj, mFilling); - samplingMaskRange[wj * mFilling + mFilling - 1 - 1] = true; + fillPosition(startPosId - 1 + wj, mFilling); } } PRINT_VALUES(positionIds); @@ -150,7 +176,7 @@ runtime::SizeType32 LookaheadAlgorithm::lookahead(TensorPtr const& draftTokens, } runtime::SizeType32 LookaheadAlgorithm::guess(TensorPtr const& guessTokens, TensorPtr const& guessIds, - TensorPtr const& samplingMask, runtime::SizeType32 offset, runtime::TokenIdType lastToken) + runtime::SizeType32 startPosId, runtime::TokenIdType lastToken) { auto guesses = mPoolManager.guess(lastToken, mW); @@ -158,67 +184,227 @@ runtime::SizeType32 LookaheadAlgorithm::guess(TensorPtr const& guessTokens, Tens std::for_each(guesses.begin(), guesses.end(), [&len](auto& a) { len += ITensor::volume(a->getShape()); }); TLLM_CHECK(len <= ITensor::volume(guessTokens->getShape())); TLLM_CHECK(len <= ITensor::volume(guessIds->getShape())); - TLLM_CHECK(len <= ITensor::volume(samplingMask->getShape())); BufferRange guessTokensRange(*guessTokens); BufferRange guessIdsRange(*guessIds); - BufferRange samplingMaskRange(*samplingMask); SizeType32 cur = 0; for (auto guess : guesses) { BufferRange guessRange(*guess); std::copy(guessRange.begin(), guessRange.end(), guessTokensRange.begin() + cur); - SizeType32 tmp = offset; + SizeType32 tmp = startPosId; std::for_each( guessIdsRange.begin() + cur, guessIdsRange.begin() + cur + mN - 1, [&tmp](auto& v) { v = tmp++; }); cur += ITensor::volume(guess->getShape()); } - std::for_each(samplingMaskRange.begin(), samplingMaskRange.begin() + len, [](auto& a) { a = true; }); - return len; } +void LookaheadAlgorithm::posIdsToMask(TensorPtr const& mask, TensorConstPtr const& posIds) +{ + auto len = ITensor::volume(posIds->getShape()); + TLLM_CHECK(mask->getDimension<0>() >= len); + TLLM_CHECK(mask->getDimension<1>() >= len); + auto 
posIdsRange = BufferRange(*posIds); + auto maskLocation = BufferLocation(*mask); + + for (auto& item : maskLocation) + { + item = false; + } + + if (len > 0) + { + std::vector> stack; + for (auto i = 0; i < len; i++) + { + auto cur = posIdsRange[i]; + while (stack.size() > 0 && cur <= stack.back().second) + { + stack.pop_back(); + } + TLLM_CHECK(stack.size() > 0 ? cur == stack.back().second + 1 : true); + stack.push_back(std::make_pair(i, cur)); + for (auto prev : stack) + { + maskLocation.at(i, prev.first) = true; + } + } + } +} + +struct TreeValue; +using TreeMap = std::unordered_map; + +struct TreeValue +{ + TreeValue() + : nexts(std::make_shared()) + { + } + + using Nexts = std::shared_ptr; + Nexts nexts{nullptr}; + std::list sources; +}; + +using TreeNode = TreeMap::value_type; + +template +void treeDFS(TreeNode& node, BF const& visitBefore, AF const& visitAfter) +{ + visitBefore(node); + for (auto& next : *(node.second.nexts)) + { + treeDFS(next, visitBefore, visitAfter); + } + visitAfter(node); +} + +SizeType32 LookaheadAlgorithm::treeEncode( + TensorPtr const& tokens, TensorPtr const& posIds, TensorPtr const& mask, TensorPtr const& encodeMap) +{ + TLLM_CHECK(ITensor::volume(tokens->getShape()) == ITensor::volume(posIds->getShape())); + auto len = ITensor::volume(tokens->getShape()); + + BufferRange tokensRange(*tokens); + BufferRange posIdsRange(*posIds); + BufferLocation maskLocation(*mask); + BufferRange mapRange(*encodeMap); + + auto branches = std::make_shared(); + + for (auto i = 0; i < len; i++) + { + auto nexts = branches; + for (auto j = 0; j <= i; j++) + { + if (maskLocation.at(i, j)) + { + auto pos = posIdsRange[j]; + auto tok = tokensRange[j]; + auto found = nexts->find(tok); + if (found != nexts->end()) + { + found->second.sources.push_back(j); + nexts = found->second.nexts; + } + else + { + auto [inserted, ok] = nexts->insert({tok, TreeValue()}); + inserted->second.sources.push_back(j); + nexts = inserted->second.nexts; + } + } + } + } + + for (auto& item : maskLocation) + { + item = 0; + } + std::vector> stack; + SizeType32 offset = 0; + SizeType32 posId = posIdsRange.size() ? posIdsRange[0] : 0; + + auto visitBefore + = [&stack, &maskLocation, &tokensRange, &posIdsRange, &posId, &offset, &mapRange](TreeNode const& node) + { + stack.push_back(std::make_pair(offset, node.first)); + for (auto const& source : node.second.sources) + { + mapRange[source] = offset; + } + for (auto const& prev : stack) + { + maskLocation.at(offset, prev.first) = true; + } + tokensRange[offset] = node.first; + posIdsRange[offset] = posId; + offset++; + posId++; + }; + auto visitAfter = [&stack, &posId](TreeNode const& node) + { + stack.pop_back(); + posId--; + }; + + for (auto& next : *branches) + { + treeDFS(next, visitBefore, visitAfter); + } + + for (SizeType32 i = offset; i < len; i++) + { + tokensRange[i] = 0; + posIdsRange[i] = 0; + } + for (SizeType32 i = 0; i < len; i++) + { + for (SizeType32 j = i < offset ? 
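posIdsToMask above reconstructs a tree-shaped causal mask from flat position ids using a monotonic stack of ancestors: each token attends to itself and to the chain of tokens whose position ids increase by one up to it. The same algorithm on plain std containers:

    #include <cassert>
    #include <cstddef>
    #include <utility>
    #include <vector>

    std::vector<std::vector<bool>> posIdsToMask(std::vector<int> const& posIds)
    {
        auto const len = posIds.size();
        std::vector<std::vector<bool>> mask(len, std::vector<bool>(len, false));
        std::vector<std::pair<int, int>> stack; // (token index, position id)
        for (std::size_t i = 0; i < len; ++i)
        {
            int const cur = posIds[i];
            while (!stack.empty() && cur <= stack.back().second)
            {
                stack.pop_back(); // not an ancestor of the current token
            }
            assert(stack.empty() || cur == stack.back().second + 1);
            stack.emplace_back(static_cast<int>(i), cur);
            for (auto const& ancestor : stack)
            {
                mask[i][ancestor.first] = true; // attend to every ancestor and self
            }
        }
        return mask;
    }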
offset : 0; j < len; j++) + { + maskLocation.at(i, j) = false; + } + } + + return offset; +} + void LookaheadAlgorithm::prepare(TensorPtr const& draftTokens, TensorPtr const& positionIds, - TensorPtr const& samplingMask, TensorPtr const& length, TensorConstPtr const& offsetPtr, - TensorConstPtr const& lastTokenPtr) + TensorPtr const& draftLengthPtr, TensorPtr const& attentionMask, SizeType32 attentionMaskOffset, + TensorConstPtr const& lastPositionIdPtr, TensorConstPtr const& lastTokenPtr) { TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); if (mRuntimeMaxDraftLen == 0) { - (BufferRange(*length))[0] = 0; + mDraftTokens = ITensor::slice(mDraftTokensMax, 0, 0); + mEncodeMap = ITensor::slice(mEncodeMapMax, 0, 0); + (BufferRange(*draftLengthPtr))[0] = 0; return; } auto lastToken = BufferRange(*lastTokenPtr)[0]; - auto offset = BufferRange(*offsetPtr)[0]; + auto offset = BufferRange(*lastPositionIdPtr)[0]; SizeType32 inputLen = ITensor::volume(draftTokens->getShape()); TLLM_CHECK(inputLen >= mRuntimeMaxDraftLen); BufferRange draftRange(*draftTokens); BufferRange positionRange(*positionIds); - BufferRange samplingRange(*samplingMask); SizeType32 filledLen = 0; filledLen += lookahead(ITensor::slice(draftTokens, filledLen, mRuntimeMaxDraftLen - filledLen), - ITensor::slice(positionIds, filledLen, mRuntimeMaxDraftLen - filledLen), - ITensor::slice(samplingMask, filledLen, mRuntimeMaxDraftLen - filledLen), offset); + ITensor::slice(positionIds, filledLen, mRuntimeMaxDraftLen - filledLen), offset); auto guessStart = filledLen; filledLen += guess(ITensor::slice(draftTokens, filledLen, mRuntimeMaxDraftLen - filledLen), - ITensor::slice(positionIds, filledLen, mRuntimeMaxDraftLen - filledLen), - ITensor::slice(samplingMask, filledLen, mRuntimeMaxDraftLen - filledLen), offset, lastToken); + ITensor::slice(positionIds, filledLen, mRuntimeMaxDraftLen - filledLen), offset, lastToken); auto guessEnd = filledLen; + std::copy(draftRange.begin() + guessStart, draftRange.begin() + guessEnd, + BufferRange(*mGuessTokensMax).begin()); mGuessTokens = ITensor::slice(mGuessTokensMax, 0, guessEnd - guessStart); - std::copy(draftRange.begin() + guessStart, draftRange.begin() + guessEnd, - BufferRange(*mGuessTokens).begin()); + posIdsToMask(mAttentionMask, ITensor::slice(positionIds, 0, filledLen)); - (BufferRange(*length))[0] = filledLen; + auto draftLen = treeEncode(ITensor::slice(draftTokens, 0, filledLen), ITensor::slice(positionIds, 0, filledLen), + mAttentionMask, mEncodeMapMax); + + for (SizeType32 i = 0; i < draftLen; i++) + { + BufferRange srcRange(*ITensor::at(mAttentionMask, {i})); + BufferRange dstRange(*ITensor::slice(attentionMask, {i + attentionMaskOffset, attentionMaskOffset})); + std::copy(srcRange.begin(), srcRange.end(), dstRange.begin()); + } + + std::copy(draftRange.begin(), draftRange.begin() + draftLen, BufferRange(*mDraftTokensMax).begin()); + mDraftTokens = ITensor::slice(mDraftTokensMax, 0, draftLen); + (BufferRange(*draftLengthPtr))[0] = draftLen; + mEncodeMap = ITensor::slice(mEncodeMapMax, 0, filledLen); TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); } @@ -229,29 +415,31 @@ void LookaheadAlgorithm::verify(TensorPtr const& accepted, TensorPtr const& acce { TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); - TLLM_CHECK(ITensor::volume(goldenTokens->getShape()) == ITensor::volume(mGuessTokens->getShape())); + TLLM_CHECK(ITensor::volume(goldenTokens->getShape()) == ITensor::volume(mDraftTokens->getShape())); BufferRange goldRange(*goldenTokens); - BufferRange guessTokensRange(*mGuessTokens); - 
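treeEncode compresses draft branches that share a prefix so each distinct token is evaluated only once. A simplified illustration of the idea using an explicit trie over token paths; the real routine works in place on the token, position-id, and mask buffers and records an encode map so sampled tokens can be scattered back later:

    #include <map>
    #include <vector>

    // Flat trie pool: node 0 is the root; children maps token id -> node index.
    struct TriePool
    {
        std::vector<std::map<int, int>> children{1};

        int childOf(int node, int token)
        {
            auto it = children[node].find(token);
            if (it != children[node].end())
            {
                return it->second; // shared prefixes collapse onto existing nodes
            }
            int const idx = static_cast<int>(children.size());
            children.emplace_back();            // may reallocate the pool,
            children[node].emplace(token, idx); // so insert into the map afterwards
            return idx;
        }
    };

    // Each input path is the ancestor chain of one draft token; the trie size
    // (minus the root) is the number of tokens left after tree encoding.
    int countEncodedTokens(std::vector<std::vector<int>> const& paths)
    {
        TriePool trie;
        for (auto const& path : paths)
        {
            int node = 0;
            for (int token : path)
            {
                node = trie.childOf(node, token);
            }
        }
        return static_cast<int>(trie.children.size()) - 1;
    }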
auto guessSize = ITensor::volume(mGuessTokens->getShape()); + BufferRange draftRange(*mDraftTokens); + BufferLocation maskLocation(*mAttentionMask); + auto draftSize = ITensor::volume(mDraftTokens->getShape()); + auto end = *BufferRange(*endToken).begin(); - SizeType32 guesses = (mN - 1 > 0) ? (guessSize / (mN - 1)) : 0; - SizeType32 hit = 0, maxHit = 0, hitIdx = 0; - for (SizeType32 i = 0; i < guesses; i++) + SizeType32 maxHit = 0, hitIdx = 0; + for (SizeType32 i = 0; i < draftSize; i++) { SizeType32 hit = 0; - for (SizeType32 j = 0; j < mN - 1; j++) + TokenIdType cur = newLastToken; + for (SizeType32 j = 0; j < draftSize; j++) { - auto idx = i * (mN - 1) + j; - bool ok - = (j == 0) ? (newLastToken == guessTokensRange[idx]) : (goldRange[idx - 1] == guessTokensRange[idx]); - bool finish = guessTokensRange[idx] == *BufferRange(*endToken).begin(); - if (ok && !finish) - { - hit++; - } - else + if (maskLocation.at(i, j)) { - break; + if (draftRange[j] == cur && draftRange[j] != end) + { + hit++; + cur = goldRange[j]; + } + else + { + break; + } } } if (hit > maxHit) @@ -261,17 +449,19 @@ void LookaheadAlgorithm::verify(TensorPtr const& accepted, TensorPtr const& acce } } - BufferRange acceptedRange(*accepted); - acceptedRange[0] = newLastToken; - std::copy(goldRange.begin() + hitIdx * (mN - 1), goldRange.begin() + hitIdx * (mN - 1) + maxHit, - acceptedRange.begin() + 1); + maxHit = maxHit > mRuntimeMaxDraftPathLen ? mRuntimeMaxDraftPathLen : maxHit; + SizeType32 acceptedIdx = 0; + BufferRange acceptedRange(*accepted); BufferRange acceptedOffsetsRange(*acceptedOffsets); - auto lookSize = 1 + mN - 2 - mFilling + mFilling * mW; - // acceptedOffsetsRange[0] = 0; - for (SizeType32 i = 0; i < maxHit; i++) + acceptedRange[acceptedIdx] = newLastToken; + for (SizeType32 j = 0; j < draftSize; j++) { - acceptedOffsetsRange[i] = lookSize + hitIdx * (mN - 1) + i - 1; + if (maskLocation.at(hitIdx, j) && acceptedIdx < maxHit) + { + acceptedOffsetsRange[acceptedIdx++] = j; + acceptedRange[acceptedIdx] = goldRange[j]; + } } *BufferRange(*acceptedLength).begin() = maxHit + 1; @@ -325,7 +515,19 @@ void LookaheadAlgorithm::update(TensorPtr const& acceptedTokens, TensorPtr const TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); TLLM_CHECK(ITensor::volume(acceptedTokens->getShape()) >= mN); - BufferRange sampledRange(*sampledTokens); + BufferRange zippedTokensRange(*sampledTokens); + BufferRange sampledRange(*mSampledTokensMax); + + BufferRange mapRange(*mEncodeMap); + BufferRange unzipRange(*mSampledTokensMax); + mSampledTokens = ITensor::slice(mSampledTokensMax, 0, mEncodeMap->getShape().d[0] + 1); + + unzipRange[0] = zippedTokensRange[0]; + for (SizeType32 i = 0; i < mapRange.size(); i++) + { + unzipRange[i + 1] = zippedTokensRange[mapRange[i] + 1]; + } + BufferRange keyRange(*mKeyTokens); BufferRange pastRange(*mPastTokens); @@ -359,13 +561,15 @@ void LookaheadAlgorithm::update(TensorPtr const& acceptedTokens, TensorPtr const } auto guessSize = ITensor::volume(mGuessTokens->getShape()); - auto outputSize = ITensor::volume(sampledTokens->getShape()); + auto outputSize = ITensor::volume(mSampledTokens->getShape()); auto lookSize = 1 + (mN > 1 ? 
mN - 2 : 0) - mFilling + mFilling * mW; TLLM_CHECK(guessSize + lookSize == outputSize); - TensorConstPtr goldenTokens = ITensor::slice(sampledTokens, lookSize, guessSize); + TensorConstPtr goldenTokens = ITensor::slice(mSampledTokens, lookSize, guessSize); + + auto& acptLen = *BufferRange(*acceptedLength).begin(); - verify(acceptedTokens, acceptedOffsets, acceptedLength, newLastToken, goldenTokens, endToken); + verify(acceptedTokens, acceptedOffsets, acceptedLength, newLastToken, ITensor::slice(sampledTokens, 1), endToken); accept(ITensor::slice(acceptedTokens, 0, *BufferRange(*acceptedLength).begin())); diff --git a/cpp/tensorrt_llm/layers/lookaheadAlgorithm.h b/cpp/tensorrt_llm/layers/lookaheadAlgorithm.h index 99df44128..485734c5a 100644 --- a/cpp/tensorrt_llm/layers/lookaheadAlgorithm.h +++ b/cpp/tensorrt_llm/layers/lookaheadAlgorithm.h @@ -21,6 +21,7 @@ #include "tensorrt_llm/layers/decodingParams.h" #include "tensorrt_llm/runtime/common.h" #include +#include namespace tensorrt_llm::layers { @@ -35,24 +36,7 @@ class LookaheadAlgorithm //! @brief Currently the resource management is to be aligned with batch manager. //! @param w, n, g is the Jacobi window, n-gram level and guess set size respectively. LookaheadAlgorithm( - runtime::SizeType32 maxW, runtime::SizeType32 maxN, runtime::SizeType32 maxG, runtime::SizeType32 id = 0) - : mMaxW(maxW) - , mMaxN(maxN) - , mMaxG(maxG) - , mFilling(0) - , mPoolManager(maxG) - , mId(id) - , mGoldenTokensMax( - runtime::BufferManager::cpu(runtime::ITensor::makeShape({maxN * 2 - 1}), nvinfer1::DataType::kINT32)) - , mPrefillsMax(runtime::BufferManager::cpu( - runtime::ITensor::makeShape({(maxN <= 1 ? 0 : maxN - 2)}), nvinfer1::DataType::kINT32)) - , mKeyTokensMax(runtime::BufferManager::cpu(runtime::ITensor::makeShape({maxW}), nvinfer1::DataType::kINT32)) - , mPastTokensMax( - runtime::BufferManager::cpu(runtime::ITensor::makeShape({maxW * (maxN - 1)}), nvinfer1::DataType::kINT32)) - , mGuessTokensMax( - runtime::BufferManager::cpu(runtime::ITensor::makeShape({maxG * (maxN - 1)}), nvinfer1::DataType::kINT32)) - { - } + runtime::SizeType32 maxW, runtime::SizeType32 maxN, runtime::SizeType32 maxG, runtime::SizeType32 id = 0); //! @brief setup per request, fill internal states from @param prompt. void setup(TensorConstPtr const& prompt, runtime::SizeType32 w, runtime::SizeType32 n, runtime::SizeType32 g); @@ -62,43 +46,55 @@ class LookaheadAlgorithm void accept(TensorConstPtr const& generatedTokens); //! @brief combine lookahead and guess to prepare the tensors. - //! input @param offsetPtr is position id of the last golden token, in a TensorPtr. + //! input @param lastPositionIdPtr is position id of the last golden token, in a TensorPtr. //! input @param lastTokenPtr the last golden token for searching in the pool, in a TensorPtr. - //! output @param draftTokens, positionIds, samplingMask; including the golden token, the lookahead - //! and the verification branch information. @param length holds the draft tokens length. - void prepare(TensorPtr const& draftTokens, TensorPtr const& positionIds, TensorPtr const& samplingMask, - TensorPtr const& length, TensorConstPtr const& offsetPtr, TensorConstPtr const& lastTokenPtr); + //! output @param draftTokens, positionIds includes the lookahead and the verification branch information. + //! output @param draftLengthPtr holds the draft tokens length. + //! output @param attentionMask holds the draft tokens dependency mask, and attentionMaskOffset is the index offset + //! in attentionMask. 
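The reworked verify scans each draft position's ancestor row in the dependency mask and keeps the longest run of draft tokens that matches the target model's own continuation, stopping at the end token. A host sketch of that rule, returning only the length of the best run:

    #include <algorithm>
    #include <cstddef>
    #include <vector>

    int longestAcceptedRun(std::vector<std::vector<bool>> const& mask,
        std::vector<int> const& draft, std::vector<int> const& golden,
        int newLastToken, int endToken)
    {
        int best = 0;
        for (std::size_t i = 0; i < draft.size(); ++i)
        {
            int hit = 0;
            int expected = newLastToken;
            for (std::size_t j = 0; j < draft.size(); ++j)
            {
                if (!mask[i][j])
                {
                    continue; // token j is not on path i
                }
                if (draft[j] == expected && draft[j] != endToken)
                {
                    ++hit;
                    expected = golden[j]; // next draft token must match the model
                }
                else
                {
                    break;
                }
            }
            best = std::max(best, hit);
        }
        return best;
    }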
+ void prepare(TensorPtr const& draftTokens, TensorPtr const& positionIds, TensorPtr const& draftLengthPtr, + TensorPtr const& attentionMask, runtime::SizeType32 attentionMaskOffset, + TensorConstPtr const& lastPositionIdPtr, TensorConstPtr const& lastTokenPtr); //! @brief update the internal states and generate accepted tokens from @param outputTokens. - //! input @param sampledTokens is the all the tokens from the language model. The position at samplingMask=1 is - //! valid. input @param endToken is the end token for `verify` early quit. - //! output @param acceptedTokens, acceptedOffsets ind @param acceptedLength. + //! input @param sampledTokens is the all the tokens from the language model. + //! input @param endToken is the end token for `verify` early quit. + //! output @param acceptedTokens, acceptedOffsets in @param acceptedLength. void update(TensorPtr const& acceptedTokens, TensorPtr const& acceptedOffsets, TensorPtr const& acceptedLength, TensorConstPtr const& sampledTokens, TensorConstPtr const& endToken); + //! generate attention @param mask from @param posIds. + static void posIdsToMask(TensorPtr const& mask, TensorConstPtr const& posIds); + + //! inplace encode the @param tokens and @param posIds according to attention @param masks, and record the offsets + //! in @param encodeMap. + static runtime::SizeType32 treeEncode( + TensorPtr const& tokens, TensorPtr const& posIds, TensorPtr const& masks, TensorPtr const& encodeMap); + private: //! @brief generate lookahead branch information. - //! input @param offset the position id of the last golden token. - //! output @param draftTokens, positionIds, samplingMask of the lookahead branch. + //! input @param startPosId is the first position id of the draftTokens. + //! output @param draftTokens, positionIds of the lookahead branch. //! @return the actual filled lookahead length. - runtime::SizeType32 lookahead(TensorPtr const& draftTokens, TensorPtr const& positionIds, - TensorPtr const& samplingMask, runtime::SizeType32 offset); + runtime::SizeType32 lookahead( + TensorPtr const& draftTokens, TensorPtr const& positionIds, runtime::SizeType32 startPosId); //! @brief generate verification branch information. Also save the guessed tokens for future verification. - //! input @param offset the position id of the last golden token. + //! input @param startPosId the first position id. //! input @param lastToken the last golden token for searching in the pool. - //! output @param guessTokens, guessIds, samplingMask of the verification branch. + //! output @param guessTokens, guessIds of the verification branch. //! @return the actual filled guess length. - runtime::SizeType32 guess(TensorPtr const& guessTokens, TensorPtr const& guessIds, TensorPtr const& samplingMask, - runtime::SizeType32 offset, runtime::TokenIdType lastToken); + runtime::SizeType32 guess(TensorPtr const& guessTokens, TensorPtr const& guessIds, runtime::SizeType32 startPosId, + runtime::TokenIdType lastToken); //! @brief verify the guessed tokens results and generate the longest accepted tokens. //! input @param newLastToken is the new-generated last golden token. - //! input @param goldenTokens is the guessed token results from the language model. + //! input @param sampledTokens is the generated token results from the language model. //! input @param endToken is the end token for early quit detection. - //! output @param accepted, acceptedOffsets in @param acceptedLength, . + //! output @param accepted in @param acceptedLength, including the first golden one. + //! 
output @param acceptedOffsets is the offsets of draft tokens, excluding the first golden one. void verify(TensorPtr const& accepted, TensorPtr const& acceptedOffsets, TensorPtr const& acceptedLength, - runtime::TokenIdType newLastToken, TensorConstPtr const& goldenTokens, TensorConstPtr const& endToken); + runtime::TokenIdType newLastToken, TensorConstPtr const& sampledTokens, TensorConstPtr const& endToken); private: LookaheadPoolManager mPoolManager; @@ -117,6 +113,13 @@ class LookaheadAlgorithm //! the same guess tokens from `guess` and used in `verify` TensorPtr mGuessTokensMax; // shape [mMaxG*(mMaxN-1)] TensorPtr mGuessTokens; // shape [mG*(mN-1)] + TensorPtr mDraftTokensMax; + TensorPtr mDraftTokens; + TensorPtr mAttentionMask; + TensorPtr mEncodeMapMax; + TensorPtr mEncodeMap; + TensorPtr mSampledTokensMax; + TensorPtr mSampledTokens; //! look ahead algorithm parameters, Window size, Level and Guess set size. //! max for reserving resources and current for current request. @@ -127,6 +130,7 @@ class LookaheadAlgorithm runtime::SizeType32 mN{0}; runtime::SizeType32 mG{0}; runtime::SizeType32 mRuntimeMaxDraftLen{0}; + runtime::SizeType32 mRuntimeMaxDraftPathLen{0}; //! in prefilling mode when mFilling < mN-1. runtime::SizeType32 mFilling; diff --git a/cpp/tensorrt_llm/layers/lookaheadDecodingLayer.cpp b/cpp/tensorrt_llm/layers/lookaheadDecodingLayer.cpp index 8214abfb4..414572322 100644 --- a/cpp/tensorrt_llm/layers/lookaheadDecodingLayer.cpp +++ b/cpp/tensorrt_llm/layers/lookaheadDecodingLayer.cpp @@ -24,6 +24,7 @@ #include "tensorrt_llm/layers/lookaheadAlgorithm.h" #include "tensorrt_llm/layers/lookaheadDecodingUtils.h" #include "tensorrt_llm/runtime/bufferManager.h" +#include "tensorrt_llm/runtime/common.h" #include "tensorrt_llm/runtime/iBuffer.h" #include "tensorrt_llm/runtime/iTensor.h" #include "tensorrt_llm/runtime/lookaheadModule.h" @@ -75,19 +76,21 @@ LookaheadDecodingLayer::CpuAlgorithmResources::CpuAlgorithmResources(DecoderD ITensor::makeShape({maxTokensPerStep, maxBatchSize, beamWidth}), nvinfer1::DataType::kINT32); mPathsOffsets = BufferManager::cpu(ITensor::makeShape({maxBatchSize, maxAcceptedDraftLen}), nvinfer1::DataType::kINT32); + mPathsOffsetsBatch + = BufferManager::cpu(ITensor::makeShape({maxBatchSize, maxAcceptedDraftLen}), nvinfer1::DataType::kINT32); mNumNewTokens = BufferManager::cpu(maxBatchShape1D, nvinfer1::DataType::kINT32); mNumNewTokensCumSum = BufferManager::cpu(ITensor::makeShape({maxBatchSize + 1}), nvinfer1::DataType::kINT32); mNextDraftTokens = BufferManager::cpu(ITensor::makeShape({maxBatchSize, maxDraftLen}), nvinfer1::DataType::kINT32); mNextDraftPosIds = BufferManager::cpu(ITensor::makeShape({maxBatchSize, maxDraftLen}), nvinfer1::DataType::kINT32); mGenerationLengths = BufferManager::cpu(maxBatchShape1D, nvinfer1::DataType::kINT32); - mGenerationLengthsMax = BufferManager::cpu(maxBatchShape1D, nvinfer1::DataType::kINT32); mPositionOffsets = BufferManager::cpu(ITensor::makeShape({maxBatchSize, maxTokensPerStep}), nvinfer1::DataType::kINT32); mPositionIds = BufferManager::cpu(ITensor::makeShape({maxBatchSize, maxTokensPerStep}), nvinfer1::DataType::kINT32); + mAttentionMask + = BufferManager::cpu(ITensor::makeShape({maxTokensPerStep, maxTokensPerStep}), nvinfer1::DataType::kBOOL); mPackedMask = BufferManager::cpu(ITensor::makeShape({maxBatchSize, maxTokensPerStep, static_cast(divUp(maxTokensPerStep, 32))}), nvinfer1::DataType::kINT32); - mSamplingMask = BufferManager::cpu(ITensor::makeShape({maxBatchSize, maxDraftLen}), 
nvinfer1::DataType::kBOOL); mNextDraftLengths = BufferManager::cpu(maxBatchShape1D, nvinfer1::DataType::kINT32); mSequenceLengths = BufferManager::cpu(maxBatchShape1D, nvinfer1::DataType::kINT32); } @@ -113,7 +116,6 @@ LookaheadDecodingLayer::LookaheadDecodingLayer( mWorkspaceSize = getTopKWorkspaceSize(maxBatchSize, maxTokensPerStep, maxTopK, vocabSizePadded); mTargetTokensDevice = mBufferManager->gpu(maxBatchShape2D, nvinfer1::DataType::kINT32); - mSamplingMaskDevice = mBufferManager->gpu(maxBatchShape2D, nvinfer1::DataType::kBOOL); mCurandStatesDevice = mBufferManager->gpu(ITensor::makeShape({maxBatchSize, sizeof(curandState_t)}), nvinfer1::DataType::kINT8); @@ -168,6 +170,7 @@ void LookaheadDecodingLayer::setup(SizeType32 batchSize, SizeType32 beamWidth { SizeType32 gbi = batchSlotsRange[bi]; (BufferRange(*mCpuAlgo->mGenerationLengths))[gbi] = 1; + (BufferRange(*mCpuAlgo->mNextDraftLengths))[gbi] = 0; BufferLocation(*mCpuAlgo->mPositionOffsets).at(gbi, 0) = 0; BufferRange packedMaskRange(*ITensor::at(mCpuAlgo->mPackedMask, {gbi})); for (auto& mask : packedMaskRange) @@ -184,11 +187,6 @@ void LookaheadDecodingLayer::setup(SizeType32 batchSize, SizeType32 beamWidth PRINT_SHAPE(setupParams->attentionPackedMasks); mBufferManager->copy( *ITensor::at(mCpuAlgo->mGenerationLengths, {gbi}), *ITensor::at(setupParams->generationLengths, {gbi})); - if (setupParams->actualGenerationLengths) - { - mBufferManager->copy(*ITensor::at(mCpuAlgo->mGenerationLengths, {gbi}), - *ITensor::at(setupParams->actualGenerationLengths, {gbi})); - } mBufferManager->copy( *ITensor::at(mCpuAlgo->mPositionOffsets, {gbi}), *ITensor::at(setupParams->positionOffsets, {gbi})); mBufferManager->copy( @@ -224,7 +222,7 @@ void LookaheadDecodingLayer::forwardAsync(std::shared_ptr::getWorkspaceSize() const noexcept return std::max(mWorkspaceSize, mSetupWorkspaceSize); } -template -void LookaheadDecodingLayer::posIdsToMask(TensorPtr mask, TensorConstPtr posIds) +inline void initAttentionMask(TensorPtr const& mask, std::shared_ptr& bufferManager) { - auto len = ITensor::volume(posIds->getShape()); - TLLM_CHECK(mask->getDimension<0>() > len); - TLLM_CHECK(mask->getDimension<1>() * 32 > len); - auto posIdsRange = BufferRange(*posIds); - auto maskLocation = BufferLocation(*mask); - - for (auto i = 0; i < maskLocation.size(); i++) + bufferManager->setZero(*mask); + BufferLocation maskLocation(*mask); + auto maskShape = mask->getShape(); + for (SizeType32 i = 0; i < maskShape.d[0]; i++) { - maskLocation[i] = 0; + maskLocation.at(i, 0) = true; } - maskLocation.at(0, 0) = 1; +} - auto setBit = [](SizeType32& x, SizeType32 idx) { x |= (1 << idx); }; - if (len > 0) +inline void convertBoolToInt32(TensorPtr const& dst, TensorConstPtr const& src) +{ + auto dstShape = dst->getShape(); + auto srcShape = src->getShape(); + TLLM_CHECK(dstShape.d[0] == srcShape.d[0]); + TLLM_CHECK(dstShape.d[1] * 32 >= srcShape.d[1]); + BufferLocation dstLocation(*dst); + BufferLocation srcLocation(*src); + + auto setBit = [](SizeType32& x, SizeType32 idx, bool value) { x |= (value << idx); }; + for (auto i = 0; i < srcShape.d[0]; i++) { - std::vector> stack; - stack.emplace_back(0, posIdsRange[0] - 1); - for (auto i = 1; i < len + 1; i++) + for (auto j = 0; j < srcShape.d[1]; j++) { - auto cur = posIdsRange[i - 1]; - while (stack.size() > 0 && cur <= stack.back().second) - { - stack.pop_back(); - } - TLLM_CHECK(stack.size() > 0 ? 
cur == stack.back().second + 1 : true); - stack.emplace_back(i, cur); - for (auto prev : stack) - { - setBit(maskLocation.at(i, prev.first / 32), prev.first % 32); - } + setBit(dstLocation.at(i, j / 32), j % 32, srcLocation.at(i, j)); } } } @@ -307,12 +298,16 @@ void LookaheadDecodingLayer::forwardSyncCPU( mCpuAlgo->mBatchSlots->reshape(inputs->batchSlots->getShape()); mBufferManager->copy(*inputs->batchSlots, *mCpuAlgo->mBatchSlots); mBufferManager->copy(*inputs->curTokensPerStep.value(), *mCpuAlgo->mTokensPerStep); - mBufferManager->copy(*inputs->curTokensPerStep.value(), *mCpuAlgo->mTokensPerStep); mBufferManager->copy(*inputs->endIds, *mCpuAlgo->mEndIds); mBufferManager->copy(*outputs->sequenceLength.value(), *mCpuAlgo->mSequenceLengths); mBufferManager->copy(*mTargetTokensDevice, *mCpuAlgo->mTargetTokens); + if (outputs->prevDraftLengths) + { + mBufferManager->copy(*mCpuAlgo->mNextDraftLengths, *outputs->prevDraftLengths); + } + mBufferManager->getStream().synchronize(); auto const batchSize = inputs->localBatchSize; @@ -325,15 +320,16 @@ void LookaheadDecodingLayer::forwardSyncCPU( BufferRange numNewTokensCumSumRange(*mCpuAlgo->mNumNewTokensCumSum); BufferRange batchSlotsRange(*mCpuAlgo->mBatchSlots); BufferRange generationLengthsRange(*mCpuAlgo->mGenerationLengths); - BufferRange generationLengthsMaxRange(*mCpuAlgo->mGenerationLengthsMax); BufferRange nextDraftLengthsRange(*mCpuAlgo->mNextDraftLengths); BufferRange sequenceLengthsRange(*mCpuAlgo->mSequenceLengths); BufferLocation pathsOffsetLocation(*mCpuAlgo->mPathsOffsets); + BufferLocation pathsOffsetBatchLocation(*mCpuAlgo->mPathsOffsetsBatch); BufferLocation outputIdsLocation(*mCpuAlgo->mOutputIds); mBufferManager->setZero(*mCpuAlgo->mPathsOffsets); mBufferManager->setZero(*mCpuAlgo->mNumNewTokens); mBufferManager->setZero(*mCpuAlgo->mNumNewTokensCumSum); + mBufferManager->setZero(*mCpuAlgo->mPackedMask); for (SizeType32 bi = 0; bi < batchSize; bi++) { @@ -342,7 +338,6 @@ void LookaheadDecodingLayer::forwardSyncCPU( SizeType32 const tokensPerStep = generationLengthsRange[gbi]; TensorPtr sampledTokens = ITensor::slice(mCpuAlgo->mTargetTokens, {gbi, 0}, tokensPerStep); - PRINT_VALUES(sampledTokens); if (tokensPerStep == 1) { @@ -369,14 +364,18 @@ void LookaheadDecodingLayer::forwardSyncCPU( sequenceLengthsRange[gbi] += numNewTokensRange[gbi]; + initAttentionMask(mCpuAlgo->mAttentionMask, mBufferManager); + theAlgo.prepare( // ITensor::at(mCpuAlgo->mNextDraftTokens, {gbi}), // ITensor::at(mCpuAlgo->mNextDraftPosIds, {gbi}), // - ITensor::at(mCpuAlgo->mSamplingMask, {gbi}), // ITensor::at(mCpuAlgo->mNextDraftLengths, {gbi}), // + mCpuAlgo->mAttentionMask, 1, // ITensor::at(mCpuAlgo->mSequenceLengths, {gbi}), // ITensor::at(mCpuAlgo->mOutputIds, {gbi, numNewTokensRange[gbi] - 1})); + convertBoolToInt32(ITensor::at(mCpuAlgo->mPackedMask, {gbi}), mCpuAlgo->mAttentionMask); + BufferLocation posIdsLocation(*ITensor::at(mCpuAlgo->mPositionIds, {gbi})); for (auto& posid : posIdsLocation) { @@ -385,39 +384,35 @@ void LookaheadDecodingLayer::forwardSyncCPU( mBufferManager->copy(*ITensor::slice(mCpuAlgo->mNextDraftPosIds, {gbi, 0}, nextDraftLengthsRange[gbi]), *ITensor::slice(mCpuAlgo->mPositionIds, {gbi, 1}, nextDraftLengthsRange[gbi])); - posIdsToMask( // - ITensor::at(mCpuAlgo->mPackedMask, {gbi}), // - ITensor::slice(mCpuAlgo->mNextDraftPosIds, {gbi, 0}, nextDraftLengthsRange[gbi])); - BufferRange offsetRange(*ITensor::at(mCpuAlgo->mPositionOffsets, {gbi})); - TLLM_CHECK_WITH_INFO( - posIdsLocation.size() == offsetRange.size(), "%ld, 
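convertBoolToInt32 packs the boolean attention mask into the 32-bit words the decoder expects, one bit per column. A sketch of the per-row packing:

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    std::vector<uint32_t> packMaskRow(std::vector<bool> const& row)
    {
        std::vector<uint32_t> packed((row.size() + 31) / 32, 0u);
        for (std::size_t j = 0; j < row.size(); ++j)
        {
            if (row[j])
            {
                packed[j / 32] |= (1u << (j % 32)); // set bit j of the packed row
            }
        }
        return packed;
    }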
%ld", posIdsLocation.size(), offsetRange.size()); for (auto i = 0; i < posIdsLocation.size(); i++) { offsetRange[i] = posIdsLocation[i] - posIdsLocation[0]; } + TensorPtr accepted = ITensor::slice(mCpuAlgo->mOutputIds, {gbi, 0}, numNewTokensRange[gbi]); TensorPtr draft = ITensor::slice(mCpuAlgo->mNextDraftTokens, {gbi, 0}, nextDraftLengthsRange[gbi]); - TLLM_LOG_DEBUG("CPU ALGO [ %d ] forward, %s", gbi, D(sampledTokens).values().c_str()); TLLM_LOG_DEBUG("[%d][%d] CPU ALGO [ %d ] forward, %s, %s", mGlobalSteps, batchSize, gbi, D(accepted).values().c_str(), D(draft).values().c_str()); } - numNewTokensCumSumRange[0] = 0; SizeType32 pi = 0; - for (SizeType32 bi = 0; bi < numNewTokensRange.size(); bi++) + numNewTokensCumSumRange[0] = 0; + for (SizeType32 bi = 0; bi < batchSize; bi++) { - SizeType32 acceptedDraftLen = numNewTokensRange[bi] <= 1 ? 0 : (numNewTokensRange[bi] - 1); + SizeType32 gbi = batchSlotsRange[bi]; + SizeType32 acceptedDraftLen = numNewTokensRange[gbi] <= 1 ? 0 : (numNewTokensRange[gbi] - 1); numNewTokensCumSumRange[bi + 1] = numNewTokensCumSumRange[bi] + acceptedDraftLen; for (SizeType32 tj = 0; tj < acceptedDraftLen; tj++) { - pathsOffsetLocation[pi++] = pathsOffsetLocation.at(bi, tj); + pathsOffsetBatchLocation[pi++] = pathsOffsetLocation.at(gbi, tj); } } - for (; pi < pathsOffsetLocation.size(); pi++) + + for (; pi < pathsOffsetBatchLocation.size(); pi++) { - pathsOffsetLocation[pi++] = 0; + pathsOffsetBatchLocation[pi++] = 0; } TLLM_CHECK(outputs->numNewTokens); @@ -425,34 +420,28 @@ void LookaheadDecodingLayer::forwardSyncCPU( mBufferManager->copy(*mCpuAlgo->mSequenceLengths, *outputs->sequenceLength.value()); mBufferManager->copy(*mCpuAlgo->mNewTokens, *outputs->newTokens); - mBufferManager->copy(*mCpuAlgo->mPathsOffsets, *outputs->pathsOffsets); mBufferManager->copy(*mCpuAlgo->mNumNewTokens, *outputs->numNewTokens.value()); + mBufferManager->copy(*mCpuAlgo->mPathsOffsetsBatch, *outputs->pathsOffsets); mBufferManager->copy(*mCpuAlgo->mNumNewTokensCumSum, *outputs->numNewTokensCumSum); // mBufferManager->copy(*mCpuAlgo->mNextDraftTokens, *outputs->nextDraftTokens); - mBufferManager->copy(*mCpuAlgo->mPackedMask, *outputs->packedMasks); + for (SizeType32 bi = 0; bi < batchSize; bi++) + { + SizeType32 gbi = batchSlotsRange[bi]; + // nextDraftLengthsRange[gbi] = mDecoderDomain.getMaxDecodingTokens() - 1; + generationLengthsRange[gbi] = nextDraftLengthsRange[gbi] + 1; + } if (outputs->nextDraftLengths) { mBufferManager->copy(*mCpuAlgo->mNextDraftLengths, *outputs->nextDraftLengths); } - for (SizeType32 bi = 0; bi < batchSize; bi++) - { - SizeType32 gbi = batchSlotsRange[bi]; - generationLengthsRange[gbi] = nextDraftLengthsRange[gbi] + 1; - generationLengthsMaxRange[gbi] = mDecoderDomain.getMaxDecodingTokens(); - } mBufferManager->copy(*mCpuAlgo->mPackedMask, *outputs->packedMasks); - mBufferManager->copy(*mCpuAlgo->mGenerationLengthsMax, *outputs->generationLengths); + mBufferManager->copy(*mCpuAlgo->mGenerationLengths, *outputs->generationLengths); mBufferManager->copy(*mCpuAlgo->mPositionOffsets, *outputs->positionOffsets); mBufferManager->copy(*mCpuAlgo->mPositionIds, *outputs->positionIds); - if (outputs->actualGenerationLengths) - { - mBufferManager->copy(*mCpuAlgo->mGenerationLengths, *outputs->actualGenerationLengths); - } - mBufferManager->getStream().synchronize(); TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); diff --git a/cpp/tensorrt_llm/layers/lookaheadDecodingLayer.h b/cpp/tensorrt_llm/layers/lookaheadDecodingLayer.h index 536d21727..e20b59b22 100644 --- 
a/cpp/tensorrt_llm/layers/lookaheadDecodingLayer.h +++ b/cpp/tensorrt_llm/layers/lookaheadDecodingLayer.h @@ -48,7 +48,6 @@ class LookaheadDecodingLayer : public BaseLayer private: void forwardSyncCPU(std::shared_ptr const& outputs, std::shared_ptr const& inputs); - void posIdsToMask(TensorPtr mask, TensorConstPtr posIds); private: using Base::mDecoderDomain; @@ -57,7 +56,6 @@ class LookaheadDecodingLayer : public BaseLayer size_t mSetupWorkspaceSize{}; TensorPtr mCurandStatesDevice; TensorPtr mTargetTokensDevice; - TensorPtr mSamplingMaskDevice; struct CpuAlgorithmResources { @@ -72,17 +70,17 @@ class LookaheadDecodingLayer : public BaseLayer TensorPtr mOutputIds; TensorPtr mPathsOffsets; + TensorPtr mPathsOffsetsBatch; TensorPtr mNumNewTokens; TensorPtr mNumNewTokensCumSum; TensorPtr mNewTokens; TensorPtr mNextDraftTokens; TensorPtr mNextDraftPosIds; - TensorPtr mSamplingMask; TensorPtr mNextDraftLengths; TensorPtr mSequenceLengths; TensorPtr mGenerationLengths; - TensorPtr mGenerationLengthsMax; + TensorPtr mAttentionMask; TensorPtr mPackedMask; TensorPtr mPositionOffsets; TensorPtr mPositionIds; diff --git a/cpp/tensorrt_llm/layers/lookaheadDecodingUtils.h b/cpp/tensorrt_llm/layers/lookaheadDecodingUtils.h index d109132e1..739cf6500 100644 --- a/cpp/tensorrt_llm/layers/lookaheadDecodingUtils.h +++ b/cpp/tensorrt_llm/layers/lookaheadDecodingUtils.h @@ -121,14 +121,20 @@ class BufferLocation : public runtime::BufferRange class DebugTensor { public: - DebugTensor(runtime::ITensor const& tensor, char const* name) + DebugTensor(runtime::ITensor const& tensor, char const* name, + std::shared_ptr bufferManager = nullptr, + std::shared_ptr stream = nullptr) : mTensor(tensor) , mName(name) + , mBufferManager(bufferManager) + , mStream(stream) { } - DebugTensor(runtime::ITensor::SharedConstPtr tensor, char const* name) - : DebugTensor(*tensor, name) + DebugTensor(runtime::ITensor::SharedConstPtr tensor, char const* name, + std::shared_ptr bufferManager = nullptr, + std::shared_ptr stream = nullptr) + : DebugTensor(*tensor, name, bufferManager, stream) { } @@ -187,9 +193,11 @@ class DebugTensor runtime::BufferManager::ITensorPtr hostPtr{nullptr}; if (mTensor.getMemoryType() == runtime::MemoryType::kGPU) { - runtime::BufferManager manager{std::make_shared()}; - hostPtr = manager.copyFrom(mTensor, runtime::MemoryType::kCPU); - manager.getStream().synchronize(); + auto theManager = mBufferManager + ? mBufferManager + : std::make_shared(mStream ? mStream : std::make_shared()); + hostPtr = theManager->copyFrom(mTensor, runtime::MemoryType::kCPU); + theManager->getStream().synchronize(); } return hostPtr; } @@ -343,12 +351,80 @@ class DebugTensor TLLM_LOG_DEBUG(shape()); } + template + void randomize(runtime::SizeType32 vtype) + { + runtime::BufferRange tensorRange(const_cast(mTensor)); + for (auto& item : tensorRange) + { + item = vtype == 0 ? 0 : vtype == 1 ? 
1 : rand(); + } + } + + void randomize(void) + { + if (mTensor.getMemoryType() == runtime::MemoryType::kGPU) + { + runtime::ITensor& nonConstTensor = const_cast(mTensor); + runtime::BufferManager manager{std::make_shared()}; + runtime::ITensor::SharedConstPtr cpuBuffer = manager.cpu(mTensor.getShape(), mTensor.getDataType()); + DebugTensor(cpuBuffer, "cpuBuffer").randomize(); + manager.copy(*cpuBuffer, nonConstTensor); + manager.getStream().synchronize(); + } + else + { + switch (mTensor.getDataType()) + { + case nvinfer1::DataType::kBOOL: return randomize(3); + case nvinfer1::DataType::kFLOAT: return randomize(3); + case nvinfer1::DataType::kINT8: return randomize(3); + case nvinfer1::DataType::kINT32: return randomize(3); + case nvinfer1::DataType::kINT64: return randomize(3); + case nvinfer1::DataType::kUINT8: return randomize(3); + default: return; + } + } + } + + void setZeros(void) + { + switch (mTensor.getDataType()) + { + case nvinfer1::DataType::kBOOL: return randomize(0); + case nvinfer1::DataType::kFLOAT: return randomize(0); + case nvinfer1::DataType::kINT8: return randomize(0); + case nvinfer1::DataType::kINT32: return randomize(0); + case nvinfer1::DataType::kINT64: return randomize(0); + case nvinfer1::DataType::kUINT8: return randomize(0); + default: return; + } + } + + void setOnes(void) + { + switch (mTensor.getDataType()) + { + case nvinfer1::DataType::kBOOL: return randomize(1); + case nvinfer1::DataType::kFLOAT: return randomize(1); + case nvinfer1::DataType::kINT8: return randomize(1); + case nvinfer1::DataType::kINT32: return randomize(1); + case nvinfer1::DataType::kINT64: return randomize(1); + case nvinfer1::DataType::kUINT8: return randomize(1); + default: return; + } + } + private: runtime::ITensor const& mTensor; std::string mName; + std::shared_ptr mBufferManager; + std::shared_ptr mStream; }; #define D(x) tensorrt_llm::layers::DebugTensor(x, #x) +#define Db(x, bufferManager) tensorrt_llm::layers::DebugTensor(x, #x, bufferManager, nullptr) +#define Ds(x, stream) tensorrt_llm::layers::DebugTensor(x, #x, nullptr, stream) #define PRINT_TOKENS(x) D(x).print_tokens() #define PRINT_VALUES(x) D(x).print_values() #define PRINT_SHAPE(x) D(x).print_shape() diff --git a/cpp/tensorrt_llm/layers/medusaDecodingLayer.cpp b/cpp/tensorrt_llm/layers/medusaDecodingLayer.cpp index ac8f78ec1..9f5fc6d38 100644 --- a/cpp/tensorrt_llm/layers/medusaDecodingLayer.cpp +++ b/cpp/tensorrt_llm/layers/medusaDecodingLayer.cpp @@ -390,7 +390,7 @@ void MedusaDecodingLayer::sampleNewDraftTokens(SpeculativeDecodingOutputs con params.maxBatchSize = maxBatchSizeHeadNums; params.maxTokensPerStep = 1; params.vocabSizePadded = mDecoderDomain.getVocabSizePadded(); - params.returnAllTopK = true; + params.returnAllSelectedTokens = true; invokeBatchTopKSampling(params, getStream()); diff --git a/cpp/tensorrt_llm/layers/topPSamplingLayer.cpp b/cpp/tensorrt_llm/layers/topPSamplingLayer.cpp index fc0774450..f583b0e3e 100644 --- a/cpp/tensorrt_llm/layers/topPSamplingLayer.cpp +++ b/cpp/tensorrt_llm/layers/topPSamplingLayer.cpp @@ -267,7 +267,7 @@ void TopPSamplingLayer::forwardAsync(std::shared_ptr con TopPSamplingKernelParams params{}; params.probs = probs; - params.outputIds = bufferCastOrNull(outputs->outputIdsPtr); + params.outputIdsPtrs = bufferCastOrNull(outputs->outputIdsPtr); params.workspace = workspace->getRawWorkspaceDevicePtr(); params.topPs = bufferCastOrNull(mRuntimeTopPDevice); params.sequenceLength = sequenceLength; diff --git 
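The new randomize/setZeros/setOnes debug helpers follow a common pattern: one templated fill routine plus a switch on the runtime data type that picks the instantiation. A condensed sketch on plain vectors rather than ITensor:

    #include <random>
    #include <vector>

    enum class Fill { kZeros, kOnes, kRandom };

    template <typename T>
    void fillBuffer(std::vector<T>& data, Fill mode)
    {
        std::mt19937 gen{0};
        for (auto& v : data)
        {
            v = mode == Fill::kZeros ? T{0} : mode == Fill::kOnes ? T{1} : static_cast<T>(gen());
        }
    }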
a/cpp/tensorrt_llm/plugins/gptAttentionCommon/gptAttentionCommon.cpp b/cpp/tensorrt_llm/plugins/gptAttentionCommon/gptAttentionCommon.cpp index 291900103..71ad6591f 100644 --- a/cpp/tensorrt_llm/plugins/gptAttentionCommon/gptAttentionCommon.cpp +++ b/cpp/tensorrt_llm/plugins/gptAttentionCommon/gptAttentionCommon.cpp @@ -164,7 +164,7 @@ bool GPTAttentionPluginCommon::convertMMHAParamsToXQAParams(tensorrt_llm::kernel memset(&xqaParams, 0, sizeof(XQAParams)); xqaParams.data_type = ConvertMMHAToXQAParamsHelper::data_type; - xqaParams.layer_idx = mLayerIdx; + xqaParams.layer_idx = mLayerIdxInCachePool; xqaParams.num_q_heads = mNumHeads; xqaParams.num_kv_heads = mNumKVHeads; xqaParams.head_size = mHeadSize; @@ -376,13 +376,13 @@ void fusedQKV_masked_attention_dispatch(Multihead_attention_params&, \ - const FusedQKVMaskedAttentionDispatchParams&, cudaStream_t stream); \ + FusedQKVMaskedAttentionDispatchParams const&, cudaStream_t stream); \ template void fusedQKV_masked_attention_dispatch(Multihead_attention_params&, \ - const FusedQKVMaskedAttentionDispatchParams&, cudaStream_t stream); \ + FusedQKVMaskedAttentionDispatchParams const&, cudaStream_t stream); \ template void fusedQKV_masked_attention_dispatch(Multihead_attention_params&, \ - const FusedQKVMaskedAttentionDispatchParams&, cudaStream_t stream); \ + FusedQKVMaskedAttentionDispatchParams const&, cudaStream_t stream); \ template void fusedQKV_masked_attention_dispatch(Multihead_attention_params&, \ - const FusedQKVMaskedAttentionDispatchParams&, cudaStream_t stream); + FusedQKVMaskedAttentionDispatchParams const&, cudaStream_t stream); INSTANTIATE_MMHA_DISPATCH(float, float) INSTANTIATE_MMHA_DISPATCH(uint16_t, half) #ifdef ENABLE_BF16 @@ -391,8 +391,8 @@ INSTANTIATE_MMHA_DISPATCH(__nv_bfloat16, __nv_bfloat16) #undef INSTANTIATE_MMHA_DISPATCH GPTAttentionPluginCommon::GPTAttentionPluginCommon(int layer_idx, int num_heads, int vision_start, int vision_length, - int num_kv_heads, int head_size, int unidirectional, float q_scaling, float qk_tanh_scale, - tensorrt_llm::kernels::PositionEmbeddingType position_embedding_type, + int num_kv_heads, int layer_idx_in_cache_pool, int head_size, int unidirectional, float q_scaling, + float qk_tanh_scale, tensorrt_llm::kernels::PositionEmbeddingType position_embedding_type, int rotary_embedding_dim, // for RoPE. 
Use 0 for non-RoPE float rotary_embedding_base, tensorrt_llm::kernels::RotaryScalingType rotary_embedding_scale_type, float rotary_embedding_scale, float rotary_embedding_short_m_scale, float rotary_embedding_long_m_scale, @@ -411,6 +411,7 @@ GPTAttentionPluginCommon::GPTAttentionPluginCommon(int layer_idx, int num_heads, , mVisionStart(vision_start) , mVisionLength(vision_length) , mNumKVHeads(num_kv_heads) + , mLayerIdxInCachePool(layer_idx_in_cache_pool) , mHeadSize(head_size) , mUnidirectional(unidirectional) , mQScaling(q_scaling) @@ -525,6 +526,7 @@ GPTAttentionPluginCommon::GPTAttentionPluginCommon(void const* data, size_t leng read(d, mVisionStart); read(d, mVisionLength); read(d, mNumKVHeads); + read(d, mLayerIdxInCachePool); read(d, mHeadSize); read(d, mUnidirectional); read(d, mQScaling); @@ -721,7 +723,7 @@ int GPTAttentionPluginCommon::enqueueContext(EnqueueContextParams #include #include #include @@ -41,8 +43,8 @@ static char const* GPT_ATTENTION_PLUGIN_VERSION{"1"}; static char const* GPT_ATTENTION_PLUGIN_NAME{"GPTAttention"}; GPTAttentionPlugin::GPTAttentionPlugin(int layer_idx, int num_heads, int vision_start, int vision_length, - int num_kv_heads, int head_size, int unidirectional, float q_scaling, float qk_tanh_scale, - tensorrt_llm::kernels::PositionEmbeddingType position_embedding_type, + int num_kv_heads, int layer_idx_in_cache_pool, int head_size, int unidirectional, float q_scaling, + float qk_tanh_scale, tensorrt_llm::kernels::PositionEmbeddingType position_embedding_type, int rotary_embedding_dim, // for RoPE. 0 for non-RoPE float rotary_embedding_base, tensorrt_llm::kernels::RotaryScalingType rotary_embedding_scale_type, float rotary_embedding_scale, float rotary_embedding_short_m_scale, @@ -57,9 +59,9 @@ GPTAttentionPlugin::GPTAttentionPlugin(int layer_idx, int num_heads, int vision_ bool pos_shift_enabled, bool dense_context_fmha, bool use_paged_context_fmha, bool use_fp8_context_fmha, bool use_cache, bool is_spec_decoding_enabled, bool spec_decoding_is_generation_length_variable, int spec_decoding_max_generation_length) - : GPTAttentionPluginCommon(layer_idx, num_heads, vision_start, vision_length, num_kv_heads, head_size, - unidirectional, q_scaling, qk_tanh_scale, position_embedding_type, rotary_embedding_dim, rotary_embedding_base, - rotary_embedding_scale_type, rotary_embedding_scale, rotary_embedding_short_m_scale, + : GPTAttentionPluginCommon(layer_idx, num_heads, vision_start, vision_length, num_kv_heads, layer_idx_in_cache_pool, + head_size, unidirectional, q_scaling, qk_tanh_scale, position_embedding_type, rotary_embedding_dim, + rotary_embedding_base, rotary_embedding_scale_type, rotary_embedding_scale, rotary_embedding_short_m_scale, rotary_embedding_long_m_scale, rotary_embedding_max_positions, rotary_embedding_original_max_positions, tp_size, tp_rank, unfuse_qkv_gemm, context_fmha_type, enable_xqa, kv_cache_quant_mode, remove_input_padding, mask_type, block_sparse_params, paged_kv_cache, tokens_per_block, type, max_context_length, qkv_bias_enabled, @@ -94,6 +96,7 @@ bool GPTAttentionPlugin::isEntryUsed(IdxEntry const& entry) const case IdxEntry::KV_CACHE_BLOCK_OFFSETS: return useKVCache() && mPagedKVCache; case IdxEntry::HOST_KV_CACHE_BLOCK_OFFSETS: return useKVCache() && mPagedKVCache; case IdxEntry::HOST_KV_CACHE_POOL_POINTERS: return useKVCache() && mPagedKVCache; + case IdxEntry::HOST_KV_CACHE_POOL_MAPPING: return useKVCache() && mPagedKVCache; case IdxEntry::PAST_KEY_VALUE: return useKVCache() && !mPagedKVCache; case 
IdxEntry::KV_CACHE_QUANTIZATION_SCALE: return useKVCache() && mKVCacheQuantMode.hasKvCacheQuant(); case IdxEntry::KV_CACHE_DEQUANTIZATION_SCALE: return useKVCache() && mKVCacheQuantMode.hasKvCacheQuant(); @@ -244,6 +247,11 @@ bool GPTAttentionPlugin::supportsFormatCombination( // kv cache pool pointers return inOut[pos].type == nvinfer1::DataType::kINT64 && inOut[pos].format == TensorFormat::kLINEAR; } + else if (useKVCache() && mPagedKVCache && (pos == getIdx(IdxEntry::HOST_KV_CACHE_POOL_MAPPING))) + { + // kv cache pool mapping + return inOut[pos].type == nvinfer1::DataType::kINT32 && inOut[pos].format == TensorFormat::kLINEAR; + } else if (useKVCache() && mKVCacheQuantMode.hasInt8KvCache() && (!mPagedKVCache && (pos == getIdx(IdxEntry::PAST_KEY_VALUE) || pos == nbInputs + 1))) { @@ -625,27 +633,36 @@ int GPTAttentionPlugin::enqueueSome(int32_t seqIdxBeg, int32_t localNbSeq, int32 auto const& kvCacheBlockOffsets = inputDesc[getIdx(IdxEntry::KV_CACHE_BLOCK_OFFSETS)]; auto const& kvCacheBlockOffsetsShape = inputDesc[getIdx(IdxEntry::KV_CACHE_BLOCK_OFFSETS)].dims; max_blocks_per_sequence = kvCacheBlockOffsetsShape.d[kvCacheBlockOffsetsShape.nbDims - 1]; - auto const seqStride = getStride(kvCacheBlockOffsetsShape, 0); + + std::int32_t const* host_pool_mapping + = static_cast(inputs[getIdx(IdxEntry::HOST_KV_CACHE_POOL_MAPPING)]); + + const int32_t layerToPool = host_pool_mapping[mLayerIdx]; + auto const seqStride = getStride(kvCacheBlockOffsetsShape, 1); + auto const poolStride = getStride(kvCacheBlockOffsetsShape, 0); auto const seqOffset = seqIdxBeg * seqStride; + auto const poolOffset = layerToPool * poolStride; block_offsets = reinterpret_cast(inputs[getIdx(IdxEntry::KV_CACHE_BLOCK_OFFSETS)]) - + seqOffset; + + poolOffset + seqOffset; host_block_offsets = reinterpret_cast(inputs[getIdx(IdxEntry::HOST_KV_CACHE_BLOCK_OFFSETS)]) - + seqOffset; + + poolOffset + seqOffset; auto const* const typed_host_pool_pointers = static_cast(inputs[getIdx(IdxEntry::HOST_KV_CACHE_POOL_POINTERS)]); auto const cacheElemSize = (mKVCacheQuantMode.hasKvCacheQuant() ? 
1 : sizeof(T)); + auto const blockSize = mTokensPerBlock * mNumKVHeads * mHeadSize; auto const bytesPerBlock = blockSize * cacheElemSize; - auto const layerOffset = mLayerIdx * 2 * bytesPerBlock; + auto const layerOffset = mLayerIdxInCachePool * 2 * bytesPerBlock; - host_primary_pool_pointer = reinterpret_cast(typed_host_pool_pointers[0] + layerOffset); - host_secondary_pool_pointer = reinterpret_cast(typed_host_pool_pointers[1] + layerOffset); + host_primary_pool_pointer = reinterpret_cast(typed_host_pool_pointers[layerToPool * 2] + layerOffset); + host_secondary_pool_pointer + = reinterpret_cast(typed_host_pool_pointers[layerToPool * 2 + 1] + layerOffset); } AttentionOutT* context_buf_ = static_cast(outputs[0]) @@ -963,8 +980,9 @@ IPluginV2* GPTAttentionPluginCreator::createPlugin(char const* name, PluginField auto* obj = new GPTAttentionPlugin(p.getScalar("layer_idx").value(), p.getScalar("num_heads").value(), p.getScalar("vision_start").value(), p.getScalar("vision_length").value(), p.getScalar("num_kv_heads").value(), - p.getScalar("head_size").value(), p.getScalar("unidirectional").value(), - p.getScalar("q_scaling").value(), p.getScalar("qk_tanh_scale").value(), + p.getScalar("layer_idx_in_cache_pool").value(), p.getScalar("head_size").value(), + p.getScalar("unidirectional").value(), p.getScalar("q_scaling").value(), + p.getScalar("qk_tanh_scale").value(), static_cast(p.getScalar("position_embedding_type").value()), p.getScalar("rotary_embedding_dim").value(), p.getScalar("rotary_embedding_base").value(), static_cast(p.getScalar("rotary_embedding_scale_type").value()), diff --git a/cpp/tensorrt_llm/plugins/gptAttentionPlugin/gptAttentionPlugin.h b/cpp/tensorrt_llm/plugins/gptAttentionPlugin/gptAttentionPlugin.h index aeeae99ce..7982d3c07 100644 --- a/cpp/tensorrt_llm/plugins/gptAttentionPlugin/gptAttentionPlugin.h +++ b/cpp/tensorrt_llm/plugins/gptAttentionPlugin/gptAttentionPlugin.h @@ -85,7 +85,7 @@ class GPTAttentionPlugin : public GPTAttentionPluginCommon { public: GPTAttentionPlugin(int layer_idx, int num_heads, int vision_start, int vision_length, int num_kv_heads, - int head_size, int unidirectional, float q_scaling, float qk_tanh_scale, + int layer_idx_in_cache_pool, int head_size, int unidirectional, float q_scaling, float qk_tanh_scale, tensorrt_llm::kernels::PositionEmbeddingType position_embedding_type, int rotary_embedding_dim, // for RoPE. 
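The pool-aware indexing added above can be summarized as follows, reusing the hunk's variable names; this is a sketch of the layout the plugin now assumes for multi-pool KV caches:

    // Pool selection for this attention layer, from the new HOST_KV_CACHE_POOL_MAPPING input.
    int32_t const layerToPool = host_pool_mapping[mLayerIdx];
    // KV_CACHE_BLOCK_OFFSETS is laid out [pool, seq, ...], so the pool stride is applied
    // before the per-sequence stride when locating a request's block offsets.
    auto const poolOffset = layerToPool * getStride(kvCacheBlockOffsetsShape, 0);
    auto const seqOffset = seqIdxBeg * getStride(kvCacheBlockOffsetsShape, 1);
    // Within a pool, a layer is addressed by its local index rather than the global mLayerIdx.
    auto const layerOffset = mLayerIdxInCachePool * 2 * bytesPerBlock;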
0 for non-RoPE float rotary_embedding_base, tensorrt_llm::kernels::RotaryScalingType rotary_embedding_scale_type, @@ -182,6 +182,7 @@ class GPTAttentionPlugin : public GPTAttentionPluginCommon KV_CACHE_BLOCK_OFFSETS, HOST_KV_CACHE_BLOCK_OFFSETS, HOST_KV_CACHE_POOL_POINTERS, + HOST_KV_CACHE_POOL_MAPPING, PAST_KEY_VALUE, KV_CACHE_QUANTIZATION_SCALE, KV_CACHE_DEQUANTIZATION_SCALE, diff --git a/cpp/tensorrt_llm/plugins/loraPlugin/loraPlugin.cpp b/cpp/tensorrt_llm/plugins/loraPlugin/loraPlugin.cpp index ffc4b7a8d..6f6512f13 100644 --- a/cpp/tensorrt_llm/plugins/loraPlugin/loraPlugin.cpp +++ b/cpp/tensorrt_llm/plugins/loraPlugin/loraPlugin.cpp @@ -259,7 +259,7 @@ int LoraPlugin::enqueue(nvinfer1::PluginTensorDesc const* inputDesc, nvinfer1::P int idx = 0; for (int reqId = 0; reqId < numReqs; reqId++) { - const RequestType reqType = static_cast(reqTypes[reqId]); + RequestType const reqType = static_cast(reqTypes[reqId]); if (reqType == RequestType::kGENERATION) { mExpandLoraWeightPtrs.push_back(reinterpret_cast(loraWeightModulePtrs[reqId * 2])); @@ -284,7 +284,7 @@ int LoraPlugin::enqueue(nvinfer1::PluginTensorDesc const* inputDesc, nvinfer1::P fmtstr("LoraParams and input dims don't match, lora tokens %d input tokens %d", idx, numTokens)); } - // only used for unifed gemm + // only used for unified gemm auto bestTactic = mPluginProfiler->getBestConfig(numTokens, mGemmId); mLoraImpl->setBestTactic(bestTactic); mLoraImpl->run(numTokens, numReqs, input, mExpandLoraRanks.data(), mExpandLoraWeightPtrs.data(), mWeightIndex, diff --git a/cpp/tensorrt_llm/plugins/ncclPlugin/allreducePlugin.cpp b/cpp/tensorrt_llm/plugins/ncclPlugin/allreducePlugin.cpp index 784055cc5..72304b5ff 100644 --- a/cpp/tensorrt_llm/plugins/ncclPlugin/allreducePlugin.cpp +++ b/cpp/tensorrt_llm/plugins/ncclPlugin/allreducePlugin.cpp @@ -143,7 +143,7 @@ AllReduceStrategyType AllreducePlugin::selectImplementation( { if (!isAuto) { - TLLM_LOG_WARNING("Since Peer to Peer not supported, fallback to AllReduceStrategy: NCCL"); + TLLM_LOG_INFO("Since Peer to Peer not supported, fallback to AllReduceStrategy: NCCL"); } return AllReduceStrategyType::NCCL; } @@ -305,14 +305,17 @@ int AllreducePlugin::enqueue(nvinfer1::PluginTensorDesc const* inputDesc, nvinfe ++tpRank; } + int token_num = size / inputDesc[0].dims.d[inputDesc[0].dims.nbDims - 1]; + auto params = tensorrt_llm::kernels::AllReduceParams::deserialize( - reinterpret_cast(const_cast(inputs[1])), tpSize, tpRank); + reinterpret_cast(const_cast(inputs[1])), tpSize, tpRank, mType, token_num, mOp); params.local_output_buffer_ptr = outputs[0]; params.local_input_buffer_ptr = inputs[0]; params.elts_total = size; if (mOp == AllReduceFusionOp::RESIDUAL_RMS_NORM) { + int fusion_ptr_idx = 2; params.fusion_params.bias_buffer = mBias ? 
inputs[fusion_ptr_idx++] : nullptr; params.fusion_params.residual_buffer = inputs[fusion_ptr_idx++]; @@ -320,6 +323,15 @@ int AllreducePlugin::enqueue(nvinfer1::PluginTensorDesc const* inputDesc, nvinfe params.fusion_params.hidden_size = inputDesc[0].dims.d[inputDesc[0].dims.nbDims - 1]; params.fusion_params.eps = mEps; params.fusion_params.intermediate_buffer = outputs[1]; + for (int i = 0; i < tpSize; ++i) + { + params.fusion_params.lamport_peer_comm_buffer_ptrs[i] + = reinterpret_cast(const_cast(inputs[1]))[tpSize * 4 + i]; + params.fusion_params.lamport_peer_comm_buffer_ptrs[i + tensorrt_llm::kernels::MAX_RANKS_PER_NODE] + = reinterpret_cast(const_cast(inputs[1]))[tpSize * 5 + i]; + params.fusion_params.lamport_peer_comm_buffer_ptrs[i + tensorrt_llm::kernels::MAX_RANKS_PER_NODE * 2] + = reinterpret_cast(const_cast(inputs[1]))[tpSize * 6 + i]; + } } tensorrt_llm::kernels::customAllReduce(params, mType, runtimeStrategy, mConfig, mOp, stream); } diff --git a/cpp/tensorrt_llm/pybind/CMakeLists.txt b/cpp/tensorrt_llm/pybind/CMakeLists.txt old mode 100644 new mode 100755 index 65f54c0c3..daae58398 --- a/cpp/tensorrt_llm/pybind/CMakeLists.txt +++ b/cpp/tensorrt_llm/pybind/CMakeLists.txt @@ -3,35 +3,19 @@ set(TRTLLM_PYBIND_MODULE ${TRTLLM_PYBIND_MODULE} PARENT_SCOPE) -if(NOT BUILD_PYT) - message( - FATAL_ERROR - "Python bindings for C++ runtime require PyTorch. Please enable BUILD_PYT" - ) -endif() - -execute_process( - COMMAND ${Python3_EXECUTABLE} "-c" - "import pybind11 as pb11; print(pb11.get_cmake_dir(),end='');" - RESULT_VARIABLE PYBIND_CMAKE_DIR_RET - OUTPUT_VARIABLE PYBIND_CMAKE_DIR) - -if(PYBIND_CMAKE_DIR_RET MATCHES 0) - list(APPEND CMAKE_PREFIX_PATH "${PYBIND_CMAKE_DIR}") -else() - message(ERROR "pybind11 CMake directory not found.") -endif() - -find_package(pybind11 REQUIRED) - set(SRCS - bindings.cpp + batch_manager/algorithms.cpp + batch_manager/bindings.cpp batch_manager/gptManager.cpp - batch_manager/llmRequest.cpp batch_manager/inferenceRequest.cpp + batch_manager/kvCacheManager.cpp + batch_manager/llmRequest.cpp batch_manager/namedTensor.cpp executor/bindings.cpp - executor/executor.cpp) + executor/executor.cpp + bindings.cpp) + +include_directories(${PROJECT_SOURCE_DIR}/include) pybind11_add_module(${TRTLLM_PYBIND_MODULE} ${SRCS}) @@ -42,12 +26,11 @@ target_link_directories(${TRTLLM_PYBIND_MODULE} PUBLIC "${TORCH_INSTALL_PREFIX}/lib") target_link_libraries( ${TRTLLM_PYBIND_MODULE} - PUBLIC ${SHARED_TARGET} ${Python3_LIBRARIES} ${TORCH_LIBRARIES} torch_python - ${UNDEFINED_FLAG}) -target_compile_definitions(${TRTLLM_PYBIND_MODULE} - PUBLIC TRTLLM_PYBIND_MODULE=${TRTLLM_PYBIND_MODULE}) -target_compile_definitions(${TRTLLM_PYBIND_MODULE} - PUBLIC PYBIND11_DETAILED_ERROR_MESSAGES=1) + PUBLIC ${SHARED_TARGET} ${UNDEFINED_FLAG} ${NO_AS_NEEDED_FLAG} + ${Python3_LIBRARIES} ${TORCH_LIBRARIES} torch_python) +target_compile_definitions( + ${TRTLLM_PYBIND_MODULE} PUBLIC TRTLLM_PYBIND_MODULE=${TRTLLM_PYBIND_MODULE} + PYBIND11_DETAILED_ERROR_MESSAGES=1) if(NOT WIN32) set_target_properties( diff --git a/cpp/tensorrt_llm/pybind/batch_manager/algorithms.cpp b/cpp/tensorrt_llm/pybind/batch_manager/algorithms.cpp new file mode 100644 index 000000000..15fc1ee4f --- /dev/null +++ b/cpp/tensorrt_llm/pybind/batch_manager/algorithms.cpp @@ -0,0 +1,55 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
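For the RESIDUAL_RMS_NORM fusion path above, the tensor passed as inputs[1] is read as a flat pointer table indexed in tpSize-sized groups; a short sketch of the indices the new loop consumes and of the token-count derivation, with variable names taken from the hunk:

    // inputs[1] viewed as void**: per-group entries of tpSize pointers each.
    //   [tpSize * 4 + i] -> fusion_params.lamport_peer_comm_buffer_ptrs[i]
    //   [tpSize * 5 + i] -> fusion_params.lamport_peer_comm_buffer_ptrs[i + MAX_RANKS_PER_NODE]
    //   [tpSize * 6 + i] -> fusion_params.lamport_peer_comm_buffer_ptrs[i + 2 * MAX_RANKS_PER_NODE]
    // The token count handed to deserialize() is recovered from the flattened element count
    // and the hidden dimension:
    int token_num = size / inputDesc[0].dims.d[inputDesc[0].dims.nbDims - 1];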
+ * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "algorithms.h" +#include "tensorrt_llm/batch_manager/capacityScheduler.h" +#include "tensorrt_llm/batch_manager/kvCacheManager.h" +#include "tensorrt_llm/batch_manager/llmRequest.h" +#include "tensorrt_llm/batch_manager/microBatchScheduler.h" +#include "tensorrt_llm/batch_manager/peftCacheManager.h" +#include "tensorrt_llm/pybind/common/algorithmBindings.h" + +namespace py = pybind11; + +using namespace tensorrt_llm::batch_manager; +using namespace PybindUtils; + +void tensorrt_llm::pybind::batch_manager::algorithms::initBindings(pybind11::module_& m) +{ + // Algorithms with custom bindings + py::class_(m, CapacityScheduler::name) + .def_static("make", &CapacityScheduler::make, py::arg("max_num_requests"), py::arg("kv_cache_manager"), + py::arg("cross_kv_cache_manager"), py::arg("peft_cache_manager"), py::arg("capacity_scheduler_policy"), + py::arg("many_micro_batches") = false, + py::arg_v("no_schedule_until_state", LlmRequestState::kCONTEXT_INIT, "LlmRequestState.CONTEXT_INIT"), + py::arg_v("no_schedule_after_state", LlmRequestState::kGENERATION_COMPLETE, + "LlmRequestState.GENERATION_COMPLETE")) + .def(py::init()) + .def("__call__", &CapacityScheduler::operator()) + .def("name", [](CapacityScheduler const&) { return CapacityScheduler::name; }); + + py::class_(m, MicroBatchScheduler::name) + .def_static("make", &MicroBatchScheduler::make, py::arg("max_batch_size"), + py::arg_v("max_num_tokens", std::nullopt, "None"), py::arg_v("ctx_chunk_config", std::nullopt, "None"), + py::arg_v("max_context_length", std::nullopt, "None"), + py::arg_v("no_schedule_until_state", LlmRequestState::kCONTEXT_INIT, "LlmRequestState.CONTEXT_INIT"), + py::arg_v("no_schedule_after_state", LlmRequestState::kGENERATION_COMPLETE, + "LlmRequestState.GENERATION_COMPLETE")) + .def(py::init()) + .def("__call__", &MicroBatchScheduler::operator()) + .def("name", [](MicroBatchScheduler const&) { return MicroBatchScheduler::name; }); +} diff --git a/cpp/tensorrt_llm/pybind/batch_manager/algorithms.h b/cpp/tensorrt_llm/pybind/batch_manager/algorithms.h new file mode 100644 index 000000000..895a4d13e --- /dev/null +++ b/cpp/tensorrt_llm/pybind/batch_manager/algorithms.h @@ -0,0 +1,28 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "tensorrt_llm/pybind/common/opaqueBindings.h" +#include + +namespace tensorrt_llm::pybind::batch_manager::algorithms +{ + +void initBindings(pybind11::module_& m); + +} diff --git a/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp b/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp new file mode 100644 index 000000000..20de984f9 --- /dev/null +++ b/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp @@ -0,0 +1,41 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "bindings.h" +#include "tensorrt_llm/batch_manager/common.h" +#include "tensorrt_llm/batch_manager/microBatchScheduler.h" +#include "tensorrt_llm/pybind/utils/bindTypes.h" + +namespace py = pybind11; +namespace tb = tensorrt_llm::batch_manager; +namespace tle = tensorrt_llm::executor; + +using namespace tensorrt_llm::runtime; + +namespace tensorrt_llm::pybind::batch_manager +{ + +void initBindings(pybind11::module_& m) +{ + py::class_(m, "ContextChunkingConfig") + .def(py::init(), py::arg("chunking_policy"), + py::arg("chunk_unit_size")) + .def_readwrite("chunking_policy", &tb::batch_scheduler::ContextChunkingConfig::chunkingPolicy) + .def_readwrite("chunk_unit_size", &tb::batch_scheduler::ContextChunkingConfig::chunkUnitSize); +} + +} // namespace tensorrt_llm::pybind::batch_manager diff --git a/cpp/tensorrt_llm/pybind/batch_manager/bindings.h b/cpp/tensorrt_llm/pybind/batch_manager/bindings.h new file mode 100644 index 000000000..326143d4f --- /dev/null +++ b/cpp/tensorrt_llm/pybind/batch_manager/bindings.h @@ -0,0 +1,28 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "tensorrt_llm/pybind/common/opaqueBindings.h" +#include + +namespace tensorrt_llm::pybind::batch_manager +{ + +void initBindings(pybind11::module_& m); + +} diff --git a/cpp/tensorrt_llm/pybind/batch_manager/gptManager.h b/cpp/tensorrt_llm/pybind/batch_manager/gptManager.h index 3f19dddc7..0c3b81796 100644 --- a/cpp/tensorrt_llm/pybind/batch_manager/gptManager.h +++ b/cpp/tensorrt_llm/pybind/batch_manager/gptManager.h @@ -21,6 +21,7 @@ #include "namedTensor.h" #include "tensorrt_llm/batch_manager/GptManager.h" #include "tensorrt_llm/batch_manager/callbacks.h" +#include "tensorrt_llm/pybind/common/opaqueBindings.h" #include #include diff --git a/cpp/tensorrt_llm/pybind/batch_manager/inferenceRequest.h b/cpp/tensorrt_llm/pybind/batch_manager/inferenceRequest.h index 98ae79b34..d30864e6e 100644 --- a/cpp/tensorrt_llm/pybind/batch_manager/inferenceRequest.h +++ b/cpp/tensorrt_llm/pybind/batch_manager/inferenceRequest.h @@ -20,6 +20,7 @@ #include "tensorrt_llm/batch_manager/inferenceRequest.h" #include "tensorrt_llm/pybind/batch_manager/llmRequest.h" #include "tensorrt_llm/pybind/batch_manager/namedTensor.h" +#include "tensorrt_llm/pybind/common/opaqueBindings.h" #include #include diff --git a/cpp/tensorrt_llm/pybind/batch_manager/kvCacheManager.cpp b/cpp/tensorrt_llm/pybind/batch_manager/kvCacheManager.cpp new file mode 100644 index 000000000..1e6e59b42 --- /dev/null +++ b/cpp/tensorrt_llm/pybind/batch_manager/kvCacheManager.cpp @@ -0,0 +1,29 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: NVIDIA TensorRT Source Code License Agreement + * + * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual + * property and proprietary rights in and to this material, related + * documentation and any modifications thereto. Any use, reproduction, + * disclosure or distribution of this material and related documentation + * without an express license agreement from NVIDIA CORPORATION or + * its affiliates is strictly prohibited. + */ + +#include "kvCacheManager.h" +#include "tensorrt_llm/pybind/utils/bindTypes.h" + +namespace tb = tensorrt_llm::batch_manager; +namespace py = pybind11; + +void tb::kv_cache_manager::KVCacheManagerBindings::initBindings(py::module_& m) +{ + // TODO: Provide proper bindings + py::classh(m, "KVCacheManager"); +} + +void tb::BasePeftCacheManagerBindings::initBindings(py::module_& m) +{ + // TODO: Provide proper bindings + py::classh(m, "BasePeftCacheManager"); +} diff --git a/cpp/tensorrt_llm/pybind/batch_manager/kvCacheManager.h b/cpp/tensorrt_llm/pybind/batch_manager/kvCacheManager.h new file mode 100644 index 000000000..7753c684d --- /dev/null +++ b/cpp/tensorrt_llm/pybind/batch_manager/kvCacheManager.h @@ -0,0 +1,36 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: NVIDIA TensorRT Source Code License Agreement + * + * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual + * property and proprietary rights in and to this material, related + * documentation and any modifications thereto. Any use, reproduction, + * disclosure or distribution of this material and related documentation + * without an express license agreement from NVIDIA CORPORATION or + * its affiliates is strictly prohibited. 
+ */ + +#pragma once + +#include "tensorrt_llm/batch_manager/kvCacheManager.h" +#include "tensorrt_llm/batch_manager/peftCacheManager.h" +#include "tensorrt_llm/pybind/common/opaqueBindings.h" +#include + +namespace tensorrt_llm::batch_manager::kv_cache_manager +{ +class KVCacheManagerBindings +{ +public: + static void initBindings(pybind11::module_& m); +}; +} // namespace tensorrt_llm::batch_manager::kv_cache_manager + +namespace tensorrt_llm::batch_manager +{ +class BasePeftCacheManagerBindings +{ +public: + static void initBindings(pybind11::module_& m); +}; +} // namespace tensorrt_llm::batch_manager diff --git a/cpp/tensorrt_llm/pybind/batch_manager/llmRequest.cpp b/cpp/tensorrt_llm/pybind/batch_manager/llmRequest.cpp index 193940083..4ef2e6851 100644 --- a/cpp/tensorrt_llm/pybind/batch_manager/llmRequest.cpp +++ b/cpp/tensorrt_llm/pybind/batch_manager/llmRequest.cpp @@ -17,22 +17,29 @@ #include "llmRequest.h" #include "tensorrt_llm/batch_manager/llmRequest.h" +#include "tensorrt_llm/pybind/utils/bindTypes.h" #include "tensorrt_llm/runtime/torch.h" #include "tensorrt_llm/runtime/torchUtils.h" #include "tensorrt_llm/runtime/torchView.h" +#include #include #include #include +#include #include #include namespace tb = tensorrt_llm::batch_manager; namespace tr = tensorrt_llm::runtime; +namespace tle = tensorrt_llm::executor; using namespace tensorrt_llm::pybind::batch_manager; +using LlmRequestPtr = std::shared_ptr; +using RequestList = std::list; + namespace { @@ -166,7 +173,6 @@ void LlmRequest::initBindings(py::module_& m) .def_property_readonly("orig_prompt_len", &LlmRequest::getOrigPromptLen) .def("has_draft_tokens", &LlmRequest::hasDraftTokens) .def("move_to_next_context_chunk", &LlmRequest::moveToNextContextChunk) - .def("is_full_context_request", py::overload_cast<>(&LlmRequest::isFullContextRequest, py::const_)) .def("is_last_context_chunk", py::overload_cast<>(&LlmRequest::isLastContextChunk, py::const_)) .def("is_first_context_chunk", py::overload_cast<>(&LlmRequest::isFirstContextChunk, py::const_)) .def("get_context_remaining_length", py::overload_cast<>(&LlmRequest::getContextRemainingLength, py::const_)) @@ -180,3 +186,140 @@ void LlmRequest::initBindings(py::module_& m) { self.setDraftLogits(std::make_optional(logits)); }) .def_property("num_return_sequences", &LlmRequest::getNumReturnSequences, &LlmRequest::setNumReturnSequences); } + +void tb::LlmRequestBindings::initBindings(py::module_& m) +{ + py::classh(m, "PyLlmRequest") + .def("get_num_tokens", &tb::LlmRequest::getNumTokens, py::arg("beam")) + .def_property_readonly("max_beam_num_tokens", &tb::LlmRequest::getMaxBeamNumTokens) + .def("get_token", &tb::LlmRequest::getToken, py::arg("beam"), py::arg("pos")) + .def("get_tokens", py::overload_cast(&tb::LlmRequest::getTokens, py::const_), + py::arg("beam")) + .def("get_tokens", py::overload_cast<>(&tb::LlmRequest::getTokens, py::const_)) + .def_property_readonly("max_num_generated_tokens", &tb::LlmRequest::getMaxNumGeneratedTokens) + .def("add_new_token", &tb::LlmRequest::addNewToken, py::arg("token"), py::arg("beam")) + .def("add_new_tokens", &tb::LlmRequest::addNewTokens, py::arg("beam_tokens")) + .def("set_generated_tokens", &tb::LlmRequest::setGeneratedTokens, py::arg("generated_beam_tokens")) + .def("pause", &tb::LlmRequest::pause, py::arg("max_input_len")) + .def_property("max_sent_token_len", &tb::LlmRequest::getMaxSentTokenLen, &tb::LlmRequest::setMaxSentTokenLen) + .def("prompt_embedding_table", + [](tb::LlmRequest& self) + { + std::optional value{std::nullopt}; 
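+ // The tensor getters in this binding all follow the same pattern; the template argument
+ // shown here is an assumption reconstructed from the surrounding Torch conversion:
+ //
+ //     std::optional<at::Tensor> value{std::nullopt};
+ //     if (auto tensor = self.getPromptEmbeddingTable())
+ //     {
+ //         value = tr::Torch::tensor(*tensor);  // expose the runtime ITensor as a torch tensor
+ //     }
+ //     return value;
+ //
+ // i.e. an unset runtime tensor maps to Python None, a set one to a torch.Tensor.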
+ auto tensor = self.getPromptEmbeddingTable(); + if (tensor) + { + value = tr::Torch::tensor(*tensor); + } + return value; + }) + .def("bad_words_list", + [](tb::LlmRequest& self) + { + std::optional value{std::nullopt}; + auto tensor = self.getBadWordsList(); + if (tensor) + { + value = tr::Torch::tensor(*tensor); + } + return value; + }) + .def_property( + "draft_logits", + [](tb::LlmRequest& self) + { + std::optional value{std::nullopt}; + auto tensor = self.getDraftLogits(); + if (tensor) + { + value = tr::Torch::tensor(*tensor); + } + return value; + }, + [](tb::LlmRequest& self, at::Tensor& logits) + { self.setDraftLogits(std::make_optional(tr::TorchView::of(logits))); }) + .def("embedding_bias", + [](tb::LlmRequest& self) + { + std::optional value{std::nullopt}; + auto tensor = self.getEmbeddingBias(); + if (tensor) + { + value = tr::Torch::tensor(*tensor); + } + return value; + }) + .def("lora_config", + [](tb::LlmRequest& self) + { + std::optional value{std::nullopt}; + auto tensor = self.getLoraConfig(); + if (tensor) + { + value = tr::Torch::tensor(*tensor); + } + return value; + }) + .def("lora_weights", + [](tb::LlmRequest& self) + { + std::optional value{std::nullopt}; + auto tensor = self.getLoraWeights(); + if (tensor) + { + value = tr::Torch::tensor(*tensor); + } + return value; + }) + .def("stop_words_list", + [](tb::LlmRequest& self) + { + std::optional value{std::nullopt}; + auto tensor = self.getStopWordsList(); + if (tensor) + { + value = tr::Torch::tensor(*tensor); + } + return value; + }) + .def_property_readonly("prompt_vocab_size", &tb::LlmRequest::getPromptVocabSize) + .def_property_readonly("lora_task_id", &tb::LlmRequest::getLoraTaskId) + .def_property_readonly("lookahead_config", &tb::LlmRequest::getLookaheadConfig) + .def_property_readonly( + "context_current_position", py::overload_cast<>(&tb::LlmRequest::getContextCurrentPosition, py::const_)) + .def_property("context_chunk_size", &tb::LlmRequest::getContextChunkSize, &tb::LlmRequest::setContextChunkSize) + .def_readwrite("request_id", &tb::LlmRequest::mRequestId) + .def_readwrite("prompt_len", &tb::LlmRequest::mPromptLen) + .def_readwrite("max_new_tokens", &tb::LlmRequest::mMaxNewTokens) + .def_readwrite("sampling_config", &tb::LlmRequest::mSamplingConfig) + .def_readwrite("state", &tb::LlmRequest::mState) + .def_readwrite("is_streaming", &tb::LlmRequest::mIsStreaming) + .def_readwrite("end_id", &tb::LlmRequest::mEndId) + .def_readwrite("pad_id", &tb::LlmRequest::mPadId) + .def_readwrite("seq_slot", &tb::LlmRequest::mSeqSlot) + .def_property_readonly("return_log_probs", &tb::LlmRequest::returnLogProbs) + .def_property_readonly("return_context_logits", &tb::LlmRequest::setReturnContextLogits) + .def_property_readonly("return_generation_logits", &tb::LlmRequest::setReturnGenerationLogits) + .def_property_readonly("log_probs", py::overload_cast<>(&tb::LlmRequest::getLogProbs, py::const_)) + .def("get_log_probs", py::overload_cast(&tb::LlmRequest::getLogProbs, py::const_)) + .def("set_log_probs", &tb::LlmRequest::setLogProbs, py::arg("log_probs"), py::arg("beam")) + .def("set_return_encoder_output", &tb::LlmRequest::setReturnEncoderOutput, py::arg("return_encoder_output")) + .def("get_return_encoder_output", &tb::LlmRequest::getReturnEncoderOutput) + .def("priority", py::overload_cast<>(&tb::LlmRequest::priority, py::const_)) + .def("set_priority", py::overload_cast(&tb::LlmRequest::setPriority)) + .def_property_readonly("cum_log_probs", &tb::LlmRequest::getCumLogProbs) + .def("set_cum_log_prob", 
&tb::LlmRequest::setCumLogProb, py::arg("cum_log_prob"), py::arg("beam")) + .def_property_readonly("orig_prompt_len", &tb::LlmRequest::getOrigPromptLen) + .def("has_draft_tokens", &tb::LlmRequest::hasDraftTokens) + .def("move_to_next_context_chunk", &tb::LlmRequest::moveToNextContextChunk) + .def("is_last_context_chunk", py::overload_cast<>(&tb::LlmRequest::isLastContextChunk, py::const_)) + .def("is_first_context_chunk", py::overload_cast<>(&tb::LlmRequest::isFirstContextChunk, py::const_)) + .def( + "get_context_remaining_length", py::overload_cast<>(&tb::LlmRequest::getContextRemainingLength, py::const_)) + .def_property( + "draft_tokens", [](tb::LlmRequest& self) { return *self.getDraftTokens(); }, + [](tb::LlmRequest& self, tb::LlmRequest::VecTokens& draftTokens) + { self.setDraftTokens(std::make_shared(std::move(draftTokens))); }); + + py::bind_vector(m, "RequestVector"); +} diff --git a/cpp/tensorrt_llm/pybind/batch_manager/llmRequest.h b/cpp/tensorrt_llm/pybind/batch_manager/llmRequest.h index 34ea424e6..1bc265600 100644 --- a/cpp/tensorrt_llm/pybind/batch_manager/llmRequest.h +++ b/cpp/tensorrt_llm/pybind/batch_manager/llmRequest.h @@ -18,6 +18,7 @@ #pragma once #include "tensorrt_llm/batch_manager/llmRequest.h" +#include "tensorrt_llm/pybind/common/opaqueBindings.h" #include #include @@ -25,6 +26,15 @@ #include #include +namespace tensorrt_llm::batch_manager +{ +class LlmRequestBindings +{ +public: + static void initBindings(pybind11::module_& m); +}; +} // namespace tensorrt_llm::batch_manager + namespace tensorrt_llm::pybind::batch_manager { @@ -91,6 +101,7 @@ class LlmRequest : public tb::GenericLlmRequest std::optional callback); [[nodiscard]] std::shared_ptr toTrtLlm() const; + static void initBindings(pybind11::module_& m); }; diff --git a/cpp/tensorrt_llm/pybind/batch_manager/namedTensor.h b/cpp/tensorrt_llm/pybind/batch_manager/namedTensor.h index 9a0bf661d..522aa52e5 100644 --- a/cpp/tensorrt_llm/pybind/batch_manager/namedTensor.h +++ b/cpp/tensorrt_llm/pybind/batch_manager/namedTensor.h @@ -18,6 +18,7 @@ #pragma once #include "tensorrt_llm/batch_manager/namedTensor.h" +#include "tensorrt_llm/pybind/common/opaqueBindings.h" #include diff --git a/cpp/tensorrt_llm/pybind/bindings.cpp b/cpp/tensorrt_llm/pybind/bindings.cpp index 7a6c25c29..71950bbe5 100644 --- a/cpp/tensorrt_llm/pybind/bindings.cpp +++ b/cpp/tensorrt_llm/pybind/bindings.cpp @@ -23,18 +23,20 @@ #include #include +#include "tensorrt_llm/batch_manager/BatchManager.h" +#include "tensorrt_llm/batch_manager/kvCacheConfig.h" +#include "tensorrt_llm/batch_manager/trtGptModelOptionalParams.h" +#include "tensorrt_llm/common/mpiUtils.h" +#include "tensorrt_llm/common/quantization.h" +#include "tensorrt_llm/pybind/batch_manager/algorithms.h" +#include "tensorrt_llm/pybind/batch_manager/bindings.h" #include "tensorrt_llm/pybind/batch_manager/gptManager.h" #include "tensorrt_llm/pybind/batch_manager/inferenceRequest.h" +#include "tensorrt_llm/pybind/batch_manager/kvCacheManager.h" #include "tensorrt_llm/pybind/batch_manager/llmRequest.h" #include "tensorrt_llm/pybind/batch_manager/namedTensor.h" #include "tensorrt_llm/pybind/executor/bindings.h" #include "tensorrt_llm/pybind/utils/pathCaster.h" - -#include "tensorrt_llm/batch_manager/BatchManager.h" -#include "tensorrt_llm/batch_manager/kvCacheConfig.h" -#include "tensorrt_llm/batch_manager/trtGptModelOptionalParams.h" -#include "tensorrt_llm/common/mpiUtils.h" -#include "tensorrt_llm/common/quantization.h" #include "tensorrt_llm/runtime/common.h" #include 
"tensorrt_llm/runtime/gptJsonConfig.h" #include "tensorrt_llm/runtime/memoryCounters.h" @@ -178,19 +180,25 @@ PYBIND11_MODULE(TRTLLM_PYBIND_MODULE, m) .def(py::self != py::self); py::class_(m, "ModelConfig") - .def(py::init(), - py::arg("vocab_size"), py::arg("num_attention_layers"), py::arg("num_rnn_layers"), py::arg("num_heads"), - py::arg("hidden_size"), py::arg("data_type")) + .def(py::init(), + py::arg("vocab_size"), py::arg("num_layers"), py::arg("num_attention_layers"), py::arg("num_rnn_layers"), + py::arg("num_heads"), py::arg("hidden_size"), py::arg("data_type")) .def_property_readonly("vocab_size", &tr::ModelConfig::getVocabSize) .def("vocab_size_padded", &tr::ModelConfig::getVocabSizePadded, py::arg("world_size")) - .def("num_attention_layers", &tr::ModelConfig::getNbAttentionLayers, py::arg("pipeline_parallelism") = 1) - .def("num_rnn_layers", &tr::ModelConfig::getNbRnnLayers, py::arg("pipeline_parallelism") = 1) + .def("num_layers", &tr::ModelConfig::getNbLayers, py::arg("pipeline_parallelism") = 1) + .def("num_attention_layers", &tr::ModelConfig::getNbAttentionLayers, py::arg("pipeline_parallelism") = 1, + py::arg("pipeline_parallelism_rank") = 0) + .def("num_rnn_layers", &tr::ModelConfig::getNbRnnLayers, py::arg("pipeline_parallelism") = 1, + py::arg("pipeline_parallelism_rank") = 0) + .def("num_kv_heads", &tr::ModelConfig::getNbKvHeads, py::arg("layer_idx")) + .def("set_num_kv_heads", &tr::ModelConfig::setNbKvHeads, py::arg("num_kv_heads")) .def_property_readonly("num_heads", &tr::ModelConfig::getNbHeads) .def_property_readonly("hidden_size", &tr::ModelConfig::getHiddenSize) .def_property_readonly("size_per_head", &tr::ModelConfig::getSizePerHead) .def_property_readonly("data_type", &tr::ModelConfig::getDataType) - .def_property("num_kv_heads", &tr::ModelConfig::getNbKvHeads, &tr::ModelConfig::setNbKvHeads) .def_property("head_size", &tr::ModelConfig::getSizePerHead, &tr::ModelConfig::setSizePerHead) + .def_property( + "num_kv_heads_per_layer", &tr::ModelConfig::getNumKvHeadsPerLayer, &tr::ModelConfig::setNumKvHeadsPerLayer) .def_property("use_gpt_attention_plugin", py::overload_cast<>(&tr::ModelConfig::useGptAttentionPlugin, py::const_), py::overload_cast(&tr::ModelConfig::useGptAttentionPlugin)) @@ -317,16 +325,20 @@ PYBIND11_MODULE(TRTLLM_PYBIND_MODULE, m) py::overload_cast(&tr::GptJsonConfig::engineFilename, py::const_), py::arg("world_config")); - py::enum_(m, "LlmRequestState") - .value("REQUEST_STATE_UNKNOWN", tb::LlmRequestState_t::REQUEST_STATE_UNKNOWN) - .value("REQUEST_STATE_ENCODER_INIT", tb::LlmRequestState_t::REQUEST_STATE_ENCODER_INIT) - .value("REQUEST_STATE_CONTEXT_INIT", tb::LlmRequestState_t::REQUEST_STATE_CONTEXT_INIT) - .value("REQUEST_STATE_GENERATION_IN_PROGRESS", tb::LlmRequestState_t::REQUEST_STATE_GENERATION_IN_PROGRESS) - .value("REQUEST_STATE_GENERATION_TO_COMPLETE", tb::LlmRequestState_t::REQUEST_STATE_GENERATION_TO_COMPLETE) - .value("REQUEST_STATE_GENERATION_COMPLETE", tb::LlmRequestState_t::REQUEST_STATE_GENERATION_COMPLETE); + py::enum_(m, "LlmRequestState") + .value("UNKNOWN", tb::LlmRequestState::kUNKNOWN) + .value("ENCODER_INIT", tb::LlmRequestState::kENCODER_INIT) + .value("CONTEXT_INIT", tb::LlmRequestState::kCONTEXT_INIT) + .value("GENERATION_IN_PROGRESS", tb::LlmRequestState::kGENERATION_IN_PROGRESS) + .value("GENERATION_TO_COMPLETE", tb::LlmRequestState::kGENERATION_TO_COMPLETE) + .value("GENERATION_COMPLETE", tb::LlmRequestState::kGENERATION_COMPLETE); tpb::NamedTensor::initBindings(m); tpb::LlmRequest::initBindings(m); + 
tb::kv_cache_manager::KVCacheManagerBindings::initBindings(m); + tb::BasePeftCacheManagerBindings::initBindings(m); + + tb::LlmRequestBindings::initBindings(m); auto tensorNames = m.def_submodule("tensor_names"); // Input tensor names @@ -406,8 +418,6 @@ PYBIND11_MODULE(TRTLLM_PYBIND_MODULE, m) .def(py::pickle(gptModelParamsGetState, gptModelParamsSetState)) .def("__eq__", &tb::TrtGptModelOptionalParams::operator==); - tpb::GptManager::initBindings(m); - py::class_(m, "MemoryCounters") .def_static("instance", &tr::MemoryCounters::getInstance, py::return_value_policy::reference) .def_property_readonly("gpu", &tr::MemoryCounters::getGpu) @@ -441,4 +451,11 @@ PYBIND11_MODULE(TRTLLM_PYBIND_MODULE, m) auto& world = tensorrt_llm::mpi::MpiComm::world(); tensorrt_llm::mpi::MpiComm::setSession(world.split(color, rank)); }); + + auto mInternal = m.def_submodule("internal", "Internal submodule of TRTLLM runtime"); + + tensorrt_llm::pybind::batch_manager::initBindings(mInternal); + tensorrt_llm::pybind::batch_manager::algorithms::initBindings(mInternal); + + tpb::GptManager::initBindings(m); } diff --git a/cpp/tensorrt_llm/pybind/common/algorithmBindings.h b/cpp/tensorrt_llm/pybind/common/algorithmBindings.h new file mode 100644 index 000000000..0a81a4e63 --- /dev/null +++ b/cpp/tensorrt_llm/pybind/common/algorithmBindings.h @@ -0,0 +1,39 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: NVIDIA TensorRT Source Code License Agreement + * + * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual + * property and proprietary rights in and to this material, related + * documentation and any modifications thereto. Any use, reproduction, + * disclosure or distribution of this material and related documentation + * without an express license agreement from NVIDIA CORPORATION or + * its affiliates is strictly prohibited. + */ + +#pragma once + +#include "opaqueBindings.h" +#include +#include +#include + +namespace py = pybind11; + +namespace PybindUtils +{ +template +void makeAlgorithmBindings(py::module_& m) +{ + py::class_(m, T::name).def(py::init()).def("forward", &T::forward).def("name", [](T const&) { return T::name; }); +} + +template +void instantiatePybindAlgorithm(py::module_& m); +} // namespace PybindUtils + +#define INSTANTIATE_ALGORITHM(TYPE) \ + template <> \ + void PybindUtils::instantiatePybindAlgorithm(py::module_ & m) \ + { \ + makeAlgorithmBindings(m); \ + }; diff --git a/cpp/tensorrt_llm/pybind/common/opaqueBindings.h b/cpp/tensorrt_llm/pybind/common/opaqueBindings.h new file mode 100644 index 000000000..59f98a76d --- /dev/null +++ b/cpp/tensorrt_llm/pybind/common/opaqueBindings.h @@ -0,0 +1,18 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: NVIDIA TensorRT Source Code License Agreement + * + * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual + * property and proprietary rights in and to this material, related + * documentation and any modifications thereto. Any use, reproduction, + * disclosure or distribution of this material and related documentation + * without an express license agreement from NVIDIA CORPORATION or + * its affiliates is strictly prohibited. 
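A hedged sketch of how the makeAlgorithmBindings / INSTANTIATE_ALGORITHM helpers above are meant to be used for algorithms that do not need hand-written bindings; MyAlgorithm is a hypothetical type, while CapacityScheduler and MicroBatchScheduler keep their custom bindings:

    // In the bindings translation unit, instantiate the generic binding for the type:
    INSTANTIATE_ALGORITHM(MyAlgorithm)
    // At module-init time, register it on the internal submodule created in bindings.cpp:
    PybindUtils::instantiatePybindAlgorithm<MyAlgorithm>(mInternal);
    // The generic binding exposes __init__, forward(...) and name(), assuming the type has a
    // default constructor, a forward method, and a static `name` member used as the class name.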
+ */ + +#pragma once + +#include "tensorrt_llm/batch_manager/common.h" +#include + +PYBIND11_MAKE_OPAQUE(tensorrt_llm::batch_manager::RequestVector) diff --git a/cpp/tensorrt_llm/pybind/executor/bindings.cpp b/cpp/tensorrt_llm/pybind/executor/bindings.cpp index c578eb181..5ca057704 100644 --- a/cpp/tensorrt_llm/pybind/executor/bindings.cpp +++ b/cpp/tensorrt_llm/pybind/executor/bindings.cpp @@ -93,7 +93,8 @@ void InitBindings(pybind11::module_& m) py::enum_(m, "CapacitySchedulerPolicy") .value("MAX_UTILIZATION", tle::CapacitySchedulerPolicy::kMAX_UTILIZATION) - .value("GUARANTEED_NO_EVICT", tle::CapacitySchedulerPolicy::kGUARANTEED_NO_EVICT); + .value("GUARANTEED_NO_EVICT", tle::CapacitySchedulerPolicy::kGUARANTEED_NO_EVICT) + .value("STATIC_BATCH", tle::CapacitySchedulerPolicy::kSTATIC_BATCH); py::enum_(m, "ContextChunkingPolicy") .value("EQUAL_PROGRESS", tle::ContextChunkingPolicy::kEQUAL_PROGRESS) @@ -153,6 +154,11 @@ void InitBindings(pybind11::module_& m) [](tle::IterationStats const& iterationStats) { return tle::JsonSerialization::toJsonStr(iterationStats); }); + py::class_(m, "DebugTensorsPerIteration") + .def(py::init<>()) + .def_readwrite("iter", &tle::DebugTensorsPerIteration::iter) + .def_readwrite("debug_tensors", &tle::DebugTensorsPerIteration::debugTensors); + py::enum_(m, "RequestStage") .value("QUEUED", tle::RequestStage::kQUEUED) .value("ENCODER_IN_PROGRESS", tle::RequestStage::kENCODER_IN_PROGRESS) @@ -160,6 +166,10 @@ void InitBindings(pybind11::module_& m) .value("GENERATION_IN_PROGRESS", tle::RequestStage::kGENERATION_IN_PROGRESS) .value("GENERATION_COMPLETE", tle::RequestStage::kGENERATION_COMPLETE); + py::class_(m, "DisServingRequestStats") + .def(py::init<>()) + .def_readwrite("kv_cache_transfer_ms", &tle::DisServingRequestStats::kvCacheTransferMS); + py::class_(m, "RequestStats") .def(py::init<>()) .def_readwrite("id", &tle::RequestStats::id) @@ -169,6 +179,7 @@ void InitBindings(pybind11::module_& m) .def_readwrite("avg_num_decoded_tokens_per_iter", &tle::RequestStats::avgNumDecodedTokensPerIter) .def_readwrite("scheduled", &tle::RequestStats::scheduled) .def_readwrite("paused", &tle::RequestStats::paused) + .def_readwrite("dis_serving_stats", &tle::RequestStats::disServingStats) .def("to_json_str", [](tle::RequestStats const& iterationStats) { return tle::JsonSerialization::toJsonStr(iterationStats); }); @@ -289,7 +300,8 @@ void InitBindings(pybind11::module_& m) .def_property_readonly("max_verification_set_size", &tle::LookaheadDecodingConfig::getVerificationSetSize); py::class_(m, "ContextPhaseParams") - .def(py::init(), py::arg("first_gen_tokens")); + .def(py::init(), py::arg("first_gen_tokens"), + py::arg("req_id")); py::class_ request(m, "Request"); request @@ -393,12 +405,18 @@ void InitBindings(pybind11::module_& m) .def_readwrite("encoder_output", &tle::Result::encoderOutput) .def_readwrite("finish_reasons", &tle::Result::finishReasons) .def_readwrite("sequence_index", &tle::Result::sequenceIndex) - .def_readwrite("is_sequence_final", &tle::Result::isSequenceFinal); + .def_readwrite("is_sequence_final", &tle::Result::isSequenceFinal) + .def_readwrite("decoding_iter", &tle::Result::decodingIter) + .def_readwrite("context_phase_params", &tle::Result::contextPhaseParams) + .def_readwrite("sequence_index", &tle::Result::sequenceIndex); py::class_(m, "Response") - .def(py::init(), py::arg("request_id"), py::arg("error_msg")) - .def(py::init(), py::arg("request_id"), py::arg("result")) + .def(py::init>(), py::arg("request_id"), py::arg("error_msg"), + 
py::arg("client_id") = std::nullopt) + .def(py::init>(), py::arg("request_id"), py::arg("result"), + py::arg("client_id") = std::nullopt) .def_property_readonly("request_id", &tle::Response::getRequestId) + .def_property_readonly("client_id", &tle::Response::getClientId) .def("has_error", &tle::Response::hasError) .def_property_readonly("error_msg", &tle::Response::getErrorMsg) .def_property_readonly("result", &tle::Response::getResult); @@ -430,25 +448,27 @@ void InitBindings(pybind11::module_& m) { return py::make_tuple(self.getEnableBlockReuse(), self.getMaxTokens(), self.getMaxAttentionWindowVec(), self.getSinkTokenLength(), self.getFreeGpuMemoryFraction(), self.getHostCacheSize(), - self.getOnboardBlocks()); + self.getOnboardBlocks(), self.getCrossKvCacheFraction()); }; auto kvCacheConfigSetstate = [](py::tuple state) { - if (state.size() != 7) + if (state.size() != 8) { throw std::runtime_error("Invalid state!"); } return tle::KvCacheConfig(state[0].cast(), state[1].cast>(), state[2].cast>>(), state[3].cast>(), - state[4].cast>(), state[5].cast>(), state[6].cast()); + state[4].cast>(), state[5].cast>(), state[6].cast(), + state[7].cast>()); }; py::class_(m, "KvCacheConfig") .def(py::init const&, std::optional> const&, - std::optional const&, std::optional const&, std::optional const&, bool>(), + std::optional const&, std::optional const&, std::optional const&, bool, + std::optional const&>(), py::arg("enable_block_reuse") = false, py::arg("max_tokens") = py::none(), py::arg("max_attention_window") = py::none(), py::arg("sink_token_length") = py::none(), py::arg("free_gpu_memory_fraction") = py::none(), py::arg("host_cache_size") = py::none(), - py::arg("onboard_blocks") = true) + py::arg("onboard_blocks") = true, py::arg("cross_kv_cache_fraction") = py::none()) .def_property( "enable_block_reuse", &tle::KvCacheConfig::getEnableBlockReuse, &tle::KvCacheConfig::setEnableBlockReuse) .def_property("max_tokens", &tle::KvCacheConfig::getMaxTokens, &tle::KvCacheConfig::setMaxTokens) @@ -460,6 +480,8 @@ void InitBindings(pybind11::module_& m) &tle::KvCacheConfig::setFreeGpuMemoryFraction) .def_property("host_cache_size", &tle::KvCacheConfig::getHostCacheSize, &tle::KvCacheConfig::setHostCacheSize) .def_property("onboard_blocks", &tle::KvCacheConfig::getOnboardBlocks, &tle::KvCacheConfig::setOnboardBlocks) + .def_property("cross_kv_cache_fraction", &tle::KvCacheConfig::getCrossKvCacheFraction, + &tle::KvCacheConfig::setCrossKvCacheFraction) .def(py::pickle(kvCacheConfigGetstate, kvCacheConfigSetstate)); py::class_(m, "OrchestratorConfig") @@ -567,25 +589,31 @@ void InitBindings(pybind11::module_& m) .def(py::pickle(decodingConfigGetstate, decodingConfigSetstate)); auto debugConfigGetstate = [](tle::DebugConfig const& self) - { return py::make_tuple(self.getDumpInputTensors(), self.getDumpOutputTensors(), self.getDebugTensorNames()); }; + { + return py::make_tuple(self.getDebugInputTensors(), self.getDebugOutputTensors(), self.getDebugTensorNames(), + self.getDebugTensorsMaxIterations()); + }; auto debugConfigSetstate = [](py::tuple state) { - if (state.size() != 3) + if (state.size() != 4) { throw std::runtime_error("Invalid state!"); } - return tle::DebugConfig( - state[0].cast(), state[1].cast(), state[2].cast>()); + return tle::DebugConfig(state[0].cast(), state[1].cast(), state[2].cast>(), + state[3].cast()); }; py::class_(m, "DebugConfig") - .def(py::init>(), py::arg("dump_input_tensors") = false, - py::arg("dump_output_tensors") = false, py::arg("debug_tensor_names") = py::none()) + 
.def(py::init, SizeType32>(), py::arg("debug_input_tensors") = false, + py::arg("debug_output_tensors") = false, py::arg("debug_tensor_names") = py::none(), + py::arg("debug_tensors_max_iterations") = false) .def_property( - "dump_input_tensors", &tle::DebugConfig::getDumpInputTensors, &tle::DebugConfig::setDumpInputTensors) + "debug_input_tensors", &tle::DebugConfig::getDebugInputTensors, &tle::DebugConfig::setDebugInputTensors) .def_property( - "dump_output_tensors", &tle::DebugConfig::getDumpOutputTensors, &tle::DebugConfig::setDumpOuputTensors) + "debug_output_tensors", &tle::DebugConfig::getDebugOutputTensors, &tle::DebugConfig::setDebugOutputTensors) .def_property( "debug_tensor_names", &tle::DebugConfig::getDebugTensorNames, &tle::DebugConfig::setDebugTensorNames) + .def_property("debug_tensors_max_iterations", &tle::DebugConfig::getDebugTensorsMaxIterations, + &tle::DebugConfig::setDebugTensorsMaxIterations) .def(py::pickle(debugConfigGetstate, debugConfigSetstate)); auto logitsPostProcessorConfigGetstate = [](tle::LogitsPostProcessorConfig const& self) @@ -615,14 +643,18 @@ void InitBindings(pybind11::module_& m) auto extendedRuntimePerfKnobConfigSetstate = [](py::tuple state) { - if (state.size() != 2) + if (state.size() != 4) { throw std::runtime_error("Invalid extendedRuntimePerfKnobConfig state!"); } - return tle::ExtendedRuntimePerfKnobConfig(state[0].cast(), state[1].cast()); + return tle::ExtendedRuntimePerfKnobConfig( + state[0].cast(), state[1].cast(), state[2].cast(), state[2].cast()); }; auto extendedRuntimePerfKnobConfigGetstate = [](tle::ExtendedRuntimePerfKnobConfig const& self) - { return py::make_tuple(self.getMultiBlockMode(), self.getEnableContextFMHAFP32Acc()); }; + { + return py::make_tuple(self.getMultiBlockMode(), self.getEnableContextFMHAFP32Acc(), self.getCudaGraphMode(), + self.getCudaGraphCacheSize()); + }; py::class_(m, "ExtendedRuntimePerfKnobConfig") .def( py::init(), py::arg("multi_block_mode") = true, py::arg("enable_context_fmha_fp32_acc") = false) @@ -630,6 +662,10 @@ void InitBindings(pybind11::module_& m) &tle::ExtendedRuntimePerfKnobConfig::setMultiBlockMode) .def_property("enable_context_fmha_fp32_acc", &tle::ExtendedRuntimePerfKnobConfig::getEnableContextFMHAFP32Acc, &tle::ExtendedRuntimePerfKnobConfig::setEnableContextFMHAFP32Acc) + .def_property("cuda_graph_mode", &tle::ExtendedRuntimePerfKnobConfig::getCudaGraphMode, + &tle::ExtendedRuntimePerfKnobConfig::setCudaGraphMode) + .def_property("cuda_graph_cache_size", &tle::ExtendedRuntimePerfKnobConfig::getCudaGraphCacheSize, + &tle::ExtendedRuntimePerfKnobConfig::setCudaGraphCacheSize) .def(py::pickle(extendedRuntimePerfKnobConfigGetstate, extendedRuntimePerfKnobConfigSetstate)); auto executorConfigGetState = [](tle::ExecutorConfig const& self) diff --git a/cpp/tensorrt_llm/pybind/executor/bindings.h b/cpp/tensorrt_llm/pybind/executor/bindings.h index 7a686b19b..59916dcd6 100644 --- a/cpp/tensorrt_llm/pybind/executor/bindings.h +++ b/cpp/tensorrt_llm/pybind/executor/bindings.h @@ -16,6 +16,8 @@ */ #pragma once + +#include "tensorrt_llm/pybind/common/opaqueBindings.h" #include namespace tensorrt_llm::pybind::executor diff --git a/cpp/tensorrt_llm/pybind/executor/executor.cpp b/cpp/tensorrt_llm/pybind/executor/executor.cpp index 229edfc31..768c08bf4 100644 --- a/cpp/tensorrt_llm/pybind/executor/executor.cpp +++ b/cpp/tensorrt_llm/pybind/executor/executor.cpp @@ -184,6 +184,7 @@ void Executor::initBindings(py::module_& m) .def("cancel_request", &Executor::cancelRequest, py::arg("id") = 
py::none()) .def("get_latest_iteration_stats", &Executor::getLatestIterationStats) .def("get_latest_request_stats", &Executor::getLatestRequestStats) + .def("get_latest_debug_tensors", &Executor::getLatestDebugTensors) .def("can_enqueue_requests", &Executor::canEnqueueRequests); } diff --git a/cpp/tensorrt_llm/pybind/executor/executor.h b/cpp/tensorrt_llm/pybind/executor/executor.h index 921988f26..b70ba4c9c 100644 --- a/cpp/tensorrt_llm/pybind/executor/executor.h +++ b/cpp/tensorrt_llm/pybind/executor/executor.h @@ -16,8 +16,10 @@ */ #pragma once + #include "tensorrt_llm/executor/executor.h" #include "tensorrt_llm/executor/types.h" +#include "tensorrt_llm/pybind/common/opaqueBindings.h" #include namespace tle = tensorrt_llm::executor; @@ -103,6 +105,11 @@ class Executor return mExecutor->getLatestRequestStats(); } + std::deque getLatestDebugTensors() + { + return mExecutor->getLatestDebugTensors(); + } + [[nodiscard]] bool canEnqueueRequests() const { return mExecutor->canEnqueueRequests(); diff --git a/cpp/tensorrt_llm/pybind/executor/streamCaster.h b/cpp/tensorrt_llm/pybind/executor/streamCaster.h index 4838cc6cc..e0c0ccf01 100644 --- a/cpp/tensorrt_llm/pybind/executor/streamCaster.h +++ b/cpp/tensorrt_llm/pybind/executor/streamCaster.h @@ -17,10 +17,10 @@ #pragma once -#include - #include "tensorrt_llm/executor/types.h" +#include "tensorrt_llm/pybind/common/opaqueBindings.h" #include "tensorrt_llm/runtime/cudaStream.h" +#include namespace PYBIND11_NAMESPACE { diff --git a/cpp/tensorrt_llm/pybind/executor/tensorCaster.h b/cpp/tensorrt_llm/pybind/executor/tensorCaster.h index 894e0af30..e3c596503 100644 --- a/cpp/tensorrt_llm/pybind/executor/tensorCaster.h +++ b/cpp/tensorrt_llm/pybind/executor/tensorCaster.h @@ -17,11 +17,11 @@ #pragma once -#include - #include "tensorrt_llm/executor/tensor.h" +#include "tensorrt_llm/pybind/common/opaqueBindings.h" #include "tensorrt_llm/runtime/torch.h" #include "tensorrt_llm/runtime/torchView.h" +#include #include namespace PYBIND11_NAMESPACE diff --git a/cpp/tensorrt_llm/pybind/utils/bindTypes.h b/cpp/tensorrt_llm/pybind/utils/bindTypes.h new file mode 100644 index 000000000..727c364d9 --- /dev/null +++ b/cpp/tensorrt_llm/pybind/utils/bindTypes.h @@ -0,0 +1,69 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: NVIDIA TensorRT Source Code License Agreement + * + * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual + * property and proprietary rights in and to this material, related + * documentation and any modifications thereto. Any use, reproduction, + * disclosure or distribution of this material and related documentation + * without an express license agreement from NVIDIA CORPORATION or + * its affiliates is strictly prohibited. 
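A small consumption sketch for the debug-tensor plumbing exposed above; the deque element type is assumed to be tle::DebugTensorsPerIteration, matching the binding added earlier, and the executor variable is illustrative:

    // C++ side; Python sees the equivalent call as executor.get_latest_debug_tensors().
    auto const debugTensors = executor.getLatestDebugTensors();
    for (auto const& perIteration : debugTensors)
    {
        // perIteration.iter is the iteration index; perIteration.debugTensors holds the named
        // tensors recorded for that iteration, retained up to debug_tensors_max_iterations.
    }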
+ */ + +#pragma once + +#include "tensorrt_llm/pybind/common/opaqueBindings.h" +#include + +namespace PybindUtils +{ + +namespace py = pybind11; + +template +void bindList(py::module& m, std::string const& name) +{ + py::class_(m, name.c_str()) + .def(py::init()) + .def("push_back", [](T& lst, const typename T::value_type& value) { lst.push_back(value); }) + .def("pop_back", [](T& lst) { lst.pop_back(); }) + .def("push_front", [](T& lst, const typename T::value_type& value) { lst.push_front(value); }) + .def("pop_front", [](T& lst) { lst.pop_front(); }) + .def("__len__", [](T const& lst) { return lst.size(); }) + .def( + "__iter__", [](T& lst) { return py::make_iterator(lst.begin(), lst.end()); }, py::keep_alive<0, 1>()) + .def("__getitem__", + [](T const& lst, size_t index) + { + if (index >= lst.size()) + throw py::index_error(); + auto it = lst.begin(); + std::advance(it, index); + return *it; + }) + .def("__setitem__", + [](T& lst, size_t index, const typename T::value_type& value) + { + if (index >= lst.size()) + throw py::index_error(); + auto it = lst.begin(); + std::advance(it, index); + *it = value; + }); +} + +template +void bindSet(py::module& m, std::string const& name) +{ + py::class_(m, name.c_str()) + .def(py::init()) + .def("clear", &T::clear) + .def("size", &T::size) + // .def("insert", py::overload_cast(&T::insert)) + .def("erase", py::overload_cast(&T::erase)) + .def("__contains__", [](T const& s, typename T::value_type x) { return s.find(x) != s.end(); }) + .def( + "__iter__", [](T& s) { return py::make_iterator(s.begin(), s.end()); }, py::keep_alive<0, 1>()); +} + +} // namespace PybindUtils diff --git a/cpp/tensorrt_llm/pybind/utils/pathCaster.h b/cpp/tensorrt_llm/pybind/utils/pathCaster.h index 571be82ad..e74da30dd 100644 --- a/cpp/tensorrt_llm/pybind/utils/pathCaster.h +++ b/cpp/tensorrt_llm/pybind/utils/pathCaster.h @@ -22,6 +22,7 @@ #include "pybind11/detail/descr.h" #include "pybind11/pybind11.h" #include "pybind11/pytypes.h" +#include "tensorrt_llm/pybind/common/opaqueBindings.h" #include namespace PYBIND11_NAMESPACE diff --git a/cpp/tensorrt_llm/runtime/gptDecoder.cpp b/cpp/tensorrt_llm/runtime/gptDecoder.cpp index 3f1954335..2ce57d5dd 100644 --- a/cpp/tensorrt_llm/runtime/gptDecoder.cpp +++ b/cpp/tensorrt_llm/runtime/gptDecoder.cpp @@ -49,11 +49,6 @@ GptDecoder::GptDecoder(executor::DecodingMode const& mode, size_t maxBatchSiz auto const decodingDomain = tensorrt_llm::layers::DecoderDomain( maxBatchSize, maxBeamWidth, vocabSize, vocabSizePadded, speculativeDecodingModule); mDynamicDecodeLayer = std::make_shared>(mode, decodingDomain, mManager); - auto constexpr nvFloatType = TRTDataType::value; - mLogProbsTiled = mManager->gpu(ITensor::makeShape({static_cast(maxSequenceLength), - static_cast(maxBatchSize), static_cast(maxBeamWidth)}), - nvFloatType); - mManager->setZero(*mLogProbsTiled); mDecodingLayerWorkspace = std::make_unique( mManager, decodingDomain, TRTDataType::value, mDynamicDecodeLayer->getWorkspaceSize()); @@ -166,6 +161,19 @@ void GptDecoder::setup(SamplingConfig const& samplingConfig, size_t batchSize lookaheadParams->attentionPackedMasks = output->lookaheadOutputs->packedMasks; setupParams->decodingParams = std::move(lookaheadParams); } + else if (mDecodingMode.isExternalDraftTokens()) + { + auto externalDraftTokensParams = std::make_shared(); + // signed to unsigned + if (mSamplingConfig.topK) + { + auto const& topK = mSamplingConfig.topK.value(); + externalDraftTokensParams->runtimeTopK = std::vector(std::begin(topK), std::end(topK)); + } + + 
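// --- Illustrative sketch, not part of the patch ---
// The external-draft-tokens setup above widens the signed per-request top-k values
// (SizeType32) into the unsigned container expected by the sampling layer ("signed to
// unsigned"). A minimal standalone equivalent is sketched below; widenTopK is a
// hypothetical helper and the element types are assumptions based on the surrounding code.
#include <cstddef>
#include <cstdint>
#include <iterator>
#include <vector>

std::vector<std::size_t> widenTopK(std::vector<std::int32_t> const& topK)
{
    // Iterator-range construction converts element-wise, e.g. {1, 4, 8} -> {1u, 4u, 8u}.
    return std::vector<std::size_t>(std::begin(topK), std::end(topK));
}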
externalDraftTokensParams->runtimeTopP = mSamplingConfig.topP; + setupParams->decodingParams = std::move(externalDraftTokensParams); + } setupParams->decodingParams->randomSeed = mSamplingConfig.randomSeed; mDecodingLayerWorkspace->setDeviceBatchSlots(batchSlots); @@ -249,6 +257,27 @@ void prepareMedusaInputs( TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); } +void prepareExternalDraftTokensInputs( + DecodingInput const& inputs, size_t maxBatchSize, std::shared_ptr& baseInputs) +{ + TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); + + auto inputParams = std::dynamic_pointer_cast(baseInputs); + + auto const& externalDraftTokensInputs = inputs.externalDraftTokensInputs.value(); + + inputParams->draftLogits = externalDraftTokensInputs.draftLogits; + inputParams->draftProbs = externalDraftTokensInputs.draftProbs; + inputParams->targetProbs = externalDraftTokensInputs.targetProbs; + inputParams->numDraftTokens = externalDraftTokensInputs.numDraftTokens; + inputParams->draftTokenIds = externalDraftTokensInputs.draftTokenIds; + inputParams->constantThreshold = externalDraftTokensInputs.constantThreshold; + inputParams->useRandomAcceptanceThreshold = externalDraftTokensInputs.useRandomAcceptanceThreshold; + inputParams->step = externalDraftTokensInputs.step; + inputParams->useDraftLogits = externalDraftTokensInputs.useDraftLogits; + TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); +} + void prepareExplicitDraftTokensInput(DecodingInput const& inputs, std::shared_ptr& baseInputs) { TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); @@ -321,6 +350,11 @@ std::shared_ptr prepareInputs( forwardParams = std::make_shared(input.endIds, input.batchSlots, input.batchSize); } + else if (decodingMode.isExternalDraftTokens()) + { + forwardParams = std::make_shared( + input.endIds, input.batchSlots, input.step, ite, input.batchSize); + } // No logits for explicit draft tokens if (!decodingMode.isExplicitDraftTokens()) @@ -384,6 +418,11 @@ std::shared_ptr prepareInputs( forwardParams->localBatchSize = input.batchSize; } + if (decodingMode.isExternalDraftTokens()) + { + prepareExternalDraftTokensInputs(input, maxBatchSize, forwardParams); + } + TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); return forwardParams; @@ -491,8 +530,7 @@ void prepareSpeculativeDecodingOutputs(DecodingOutput& output, std::shared_ptr prepareOutputs( - DecodingOutput& output, DecodingOutput::TensorPtr& logProbsTiled, tle::DecodingMode const& decodingMode) +std::shared_ptr prepareOutputs(DecodingOutput& output, tle::DecodingMode const& decodingMode) { TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); std::shared_ptr outputParams; @@ -549,7 +587,7 @@ std::shared_ptr prepareOutputs( if (output.logProbs) { outputParams->outputLogProbs = output.logProbs; - outputParams->outputLogProbsTiled = logProbsTiled; + outputParams->outputLogProbsTiled = output.logProbsTiled; } // Beam search outputs @@ -575,7 +613,7 @@ void GptDecoder::forwardAsync(DecodingOutput& output, DecodingInput const& in { TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); auto forwardParams = prepareInputs(input, mMaxBatchSize, mDecodingMode); - auto outputParams = prepareOutputs(output, mLogProbsTiled, mDecodingMode); + auto outputParams = prepareOutputs(output, mDecodingMode); mDynamicDecodeLayer->forwardAsync(outputParams, forwardParams, mDecodingLayerWorkspace); @@ -587,207 +625,15 @@ void GptDecoder::forwardSync(DecodingOutput& output, DecodingInput const& inp { TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); auto forwardParams = prepareInputs(input, mMaxBatchSize, 
mDecodingMode); - auto outputParams = prepareOutputs(output, mLogProbsTiled, mDecodingMode); + auto outputParams = prepareOutputs(output, mDecodingMode); mDynamicDecodeLayer->forwardSync(outputParams, forwardParams, mDecodingLayerWorkspace); TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); } -// Must be similar to [cpp/tensorrt_llm/thop/gatherTreeOp.cpp] gatherTree -template -void GptDecoder::gatherTree(DecodingOutput const& decodingOutput, DecodingInput const& decodingInput, - BufferManager const& manager, std::optional> samplingConfig) -{ - TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); - auto& finalOutputIds = *decodingOutput.gatheredIds; - auto const& finalOutputIdsShape = finalOutputIds.getShape(); - auto const& decodingOutputIdsShape = decodingOutput.ids->getShape(); - auto const batchSize = finalOutputIdsShape.d[0]; - auto const beamWidth = finalOutputIdsShape.d[1]; - auto const maxSeqLength = finalOutputIdsShape.d[2]; - - TLLM_CHECK_WITH_INFO(beamWidth > 1, "gatherTree is only needed for beam search."); - - TLLM_CHECK_WITH_INFO(decodingOutputIdsShape.d[0] == batchSize, - common::fmtstr("Decoder batch size (" FMT_DIM ") does not match final batch size (" FMT_DIM ")", - decodingOutputIdsShape.d[0], batchSize)); - TLLM_CHECK_WITH_INFO(decodingOutputIdsShape.d[1] == beamWidth, - common::fmtstr("Decoder beam width (" FMT_DIM ") does not match final beam width (" FMT_DIM ")", - decodingOutputIdsShape.d[1], beamWidth)); - TLLM_CHECK_WITH_INFO(decodingOutputIdsShape.d[2] <= maxSeqLength, - common::fmtstr("Decoder seq length size (" FMT_DIM ") is too large for final seq length (" FMT_DIM ")", - decodingOutputIdsShape.d[2], maxSeqLength)); - - auto const& stream = manager.getStream().get(); - - // prefill finalOutputIds with the EOS tokens from decodingInput.endIds - tensorrt_llm::kernels::invokeInitializeOutput(bufferCast(finalOutputIds), - bufferCast(*decodingInput.endIds), batchSize * beamWidth, maxSeqLength, stream); - sync_check_cuda_error(); - - // Prepare length penalty, use the value from samplingConfig or 1.0f by default - SamplingConfig const& samplingConf = samplingConfig ? 
(*samplingConfig).get() : mSamplingConfig; - std::vector lengthPenaltyVec; - TensorPtr lengthPenaltyPtr - = std::shared_ptr(manager.gpu(ITensor::makeShape({batchSize}), TRTDataType::value)); - if (!samplingConf.lengthPenalty.has_value() || samplingConf.lengthPenalty.value().size() == 0) - { - lengthPenaltyVec = std::vector(batchSize, 1.0f); - } - else if (long int const size = samplingConf.lengthPenalty.value().size(); size == 1) - { - lengthPenaltyVec = std::vector(batchSize, samplingConf.lengthPenalty.value()[0]); - } - else - { - TLLM_CHECK_WITH_INFO(size == batchSize, - common::fmtstr("Size of lengthPenalty in SamplingConfig (" FMT_DIM ") is different from batchSize (" FMT_DIM - ")", - size, batchSize)); - lengthPenaltyVec = samplingConf.lengthPenalty.value(); - } - - lengthPenaltyPtr = manager.copyFrom(lengthPenaltyVec, ITensor::makeShape({batchSize}), runtime::MemoryType::kGPU); - - tensorrt_llm::kernels::BeamHypotheses bh; - bh.nMaxBatchSize = batchSize; - bh.nBatchSize = batchSize; - bh.nBeamWidth = beamWidth; - bh.nMaxSeqLen = maxSeqLength; - bh.lengthPenalties = bufferCast(*lengthPenaltyPtr); - bh.inputLengths = bufferCast(*decodingInput.lengths); - bh.outputIds = bufferCast(finalOutputIds); - bh.logProbs = bufferCastOrNull(decodingOutput.logProbs); - bh.logProbsTiled = bufferCast(*mLogProbsTiled); - bh.sequenceLengths = bufferCast(*decodingOutput.lengths); - bh.cumLogProbs = bufferCast(*decodingOutput.cumLogProbs); - bh.outputIdsCBA = bufferCast(*decodingOutput.beamHypotheses.outputIdsCBA); - bh.logProbsCBA = bufferCast(*decodingOutput.beamHypotheses.logProbsCBA); - bh.sequenceLengthsCBA = bufferCast(*decodingOutput.beamHypotheses.sequenceLengthsCBA); - bh.cumLogProbsCBA = bufferCast(*decodingOutput.beamHypotheses.cumLogProbsCBA); - bh.normedScoresCBA = bufferCast(*decodingOutput.beamHypotheses.normedScoresCBA); - bh.numBeamsCBA = bufferCast(*decodingOutput.beamHypotheses.numBeamsCBA); - bh.minNormedScoresCBA = bufferCast(*decodingOutput.beamHypotheses.minNormedScoresCBA); - bh.batchDones = bufferCast(*decodingOutput.beamHypotheses.batchDones); - bh.finished = bufferCast(*decodingOutput.finishReasons); - bh.outputIdsUnfinish = bufferCast(*decodingOutput.ids); - bh.parentIdsUnfinish = bufferCast(*decodingOutput.parentIds); - - // This is where transpose is done - tensorrt_llm::kernels::invokeInsertUnfinishedPath(bh, stream); - sync_check_cuda_error(); - - tensorrt_llm::kernels::invokeFinalize(bh, stream); - sync_check_cuda_error(); - - TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); -} - namespace tensorrt_llm::runtime { template class GptDecoder; template class GptDecoder; } // namespace tensorrt_llm::runtime - -void IGptDecoder::acceptDraftTokensByIds(ITensor const& targetTokenIds, ITensor const& draftTokenIds, - ITensor const& contextLengths, ITensor const& numDraftTokens, ITensor& sequenceLengths, ITensor const& finishedVec, - ITensor& finishedFinal, ITensor& finishedSum, ITensor const& batchSlots, BufferManager::CudaStreamPtr const& stream) -{ - TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); - - auto const finishedVecShape = finishedVec.getShape(); - auto const maxBatchSize = finishedVecShape.d[1]; - auto const batchSlotsShape = batchSlots.getShape(); - auto const batchSize = batchSlotsShape.d[0]; - auto const targetTokenIdsShape = targetTokenIds.getShape(); - auto const beamWidth = targetTokenIdsShape.d[1]; - auto const maxSeqLength = targetTokenIdsShape.d[2]; - auto const maxDraftTokens = draftTokenIds.getDimension<1>(); - - TLLM_CHECK_WITH_INFO(beamWidth == 1, - 
common::fmtstr("Beam width (" FMT_DIM ") > 1 is not supported for the speculative decoding", beamWidth)); - - TLLM_CHECK_WITH_INFO(batchSize <= maxBatchSize, - common::fmtstr("Batch size (" FMT_DIM ") is not smaller or equal to max batch size (" FMT_DIM ")", batchSize, - maxBatchSize)); - - TLLM_CHECK_WITH_INFO(draftTokenIds.getDimension<0>() == maxBatchSize, - common::fmtstr("Draft tokens batch size (" FMT_DIM ") is not equal to target batch size (" FMT_DIM ")", - draftTokenIds.getDimension<0>(), maxBatchSize)); - - TLLM_CHECK_WITH_INFO(contextLengths.getDimension<0>() == maxBatchSize, - common::fmtstr("Context length batch size (" FMT_DIM ") is not equal to batch size (" FMT_DIM ")", - contextLengths.getDimension<0>(), maxBatchSize)); - - TLLM_CHECK_WITH_INFO(numDraftTokens.getDimension<0>() == maxBatchSize, - common::fmtstr("Num draft tokens batch size (" FMT_DIM ") is not equal to batch size (" FMT_DIM ")", - numDraftTokens.getDimension<0>(), maxBatchSize)); - - TLLM_CHECK_WITH_INFO(sequenceLengths.getDimension<0>() == maxBatchSize, - common::fmtstr("Sequence length batch size (" FMT_DIM ") is not equal to batch size (" FMT_DIM ")", - sequenceLengths.getDimension<0>(), maxBatchSize)); - - tksd::invokeAcceptDraftTokensByIds(bufferCast(draftTokenIds), bufferCast(targetTokenIds), - bufferCast(contextLengths), bufferCast(numDraftTokens), - bufferCast(sequenceLengths), - reinterpret_cast( - bufferCast(finishedVec)), - reinterpret_cast( - bufferCast(finishedFinal)), - bufferCast(finishedSum), bufferCast(batchSlots), batchSize, maxBatchSize, beamWidth, - maxSeqLength, maxDraftTokens, stream->get()); - - sync_check_cuda_error(); - - TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); -} - -void IGptDecoder::acceptDraftTokensByLogits(ITensor& draftLogits, ITensor const& targetLogits, ITensor& draftProbs, - ITensor& targetProbs, ITensor const& numDraftTokens, ITensor& finished, ITensor const& batchSlots, - SizeType32 vocabSize, SizeType32 vocabSizePadded, bool useRandomAcceptThreshold, float randomAcceptThreshold, - curandState_t* curandState, BufferManager::CudaStreamPtr const& stream) -{ - TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); - - auto const draftLogitsShape = draftLogits.getShape(); - auto const maxBatchSize = draftLogitsShape.d[0]; - auto const maxTokensPerStep = draftLogitsShape.d[1]; - auto const batchSlotsShape = batchSlots.getShape(); - auto const batchSize = batchSlotsShape.d[0]; - auto constexpr beamWidth = 1; - - TLLM_CHECK_WITH_INFO( - beamWidth == 1, common::fmtstr("Beam width (%d) > 1 is not supported for the speculative decoding", beamWidth)); - - TLLM_CHECK(draftLogitsShape.d[2] == vocabSize); - - if (draftLogits.getDataType() == nvinfer1::DataType::kFLOAT) - { - tksd::acceptDraftTokensByLogits(bufferCast(draftLogits), - const_cast(reinterpret_cast(bufferCast(targetLogits))), - bufferCast(draftProbs), bufferCast(targetProbs), bufferCast(numDraftTokens), - reinterpret_cast( - bufferCast(finished)), - curandState, bufferCast(batchSlots), batchSize, maxBatchSize, beamWidth, vocabSize, - vocabSizePadded, maxTokensPerStep, useRandomAcceptThreshold, randomAcceptThreshold, stream->get()); - } - else if (draftLogits.getDataType() == nvinfer1::DataType::kHALF) - { - tksd::acceptDraftTokensByLogits(bufferCast(draftLogits), - const_cast(reinterpret_cast(bufferCast(targetLogits))), - bufferCast(draftProbs), bufferCast(targetProbs), bufferCast(numDraftTokens), - reinterpret_cast( - bufferCast(finished)), - curandState, bufferCast(batchSlots), batchSize, maxBatchSize, beamWidth, 
vocabSize, - vocabSizePadded, maxTokensPerStep, useRandomAcceptThreshold, randomAcceptThreshold, stream->get()); - } - else - { - TLLM_THROW("Incorrect logits dtype. Only float32 and float16 are supported"); - } - - sync_check_cuda_error(); - - TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); -} diff --git a/cpp/tensorrt_llm/runtime/gptDecoderBatched.cpp b/cpp/tensorrt_llm/runtime/gptDecoderBatched.cpp index d0cf75283..3930f9aa9 100644 --- a/cpp/tensorrt_llm/runtime/gptDecoderBatched.cpp +++ b/cpp/tensorrt_llm/runtime/gptDecoderBatched.cpp @@ -93,30 +93,28 @@ GptDecoderBatched::GptDecoderBatched(std::size_t vocabSize, std::size_t vocabSiz auto constexpr nvFloatType = TRTDataType::value; auto& dInput = mJointDecodingInput; - auto dummyLogits = mBufferManager.emptyTensor(MemoryType::kGPU, nvFloatType); - auto endIds = mBufferManager.emptyTensor(MemoryType::kGPU, nvTokenIdType); - auto batchSlots = mBufferManager.emptyTensor(MemoryType::kPINNED, nvSizeType); - dInput - = std::make_unique(0, 0, 0, 0, std::move(dummyLogits), std::move(endIds), std::move(batchSlots)); - + { // prevent reusing these vars after std::move + auto dummyLogits = mBufferManager.emptyTensor(MemoryType::kGPU, nvFloatType); + auto endIds = mBufferManager.emptyTensor(MemoryType::kGPU, nvTokenIdType); + auto batchSlots = mBufferManager.emptyTensor(MemoryType::kPINNED, nvSizeType); + dInput = std::make_unique( + 0, 0, 0, 0, std::move(dummyLogits), std::move(endIds), std::move(batchSlots)); + } dInput->sequenceLimitLength = mBufferManager.emptyTensor(MemoryType::kGPU, nvSizeType); dInput->lengths = mBufferManager.emptyTensor(MemoryType::kGPU, nvSizeType); auto& dOutput = mJointDecodingOutput; - auto outputIds = mBufferManager.emptyTensor(MemoryType::kGPU, nvTokenIdType); - auto gatheredOutputIds = mBufferManager.emptyTensor(MemoryType::kGPU, nvTokenIdType); - dOutput = std::make_unique(std::move(outputIds), std::move(gatheredOutputIds)); - + { // prevent reusing these vars after std::move + auto outputIds = mBufferManager.emptyTensor(MemoryType::kGPU, nvTokenIdType); + auto gatheredOutputIds = mBufferManager.emptyTensor(MemoryType::kGPU, nvTokenIdType); + dOutput = std::make_unique(std::move(outputIds), std::move(gatheredOutputIds)); + } dOutput->newTokensSteps = mBufferManager.emptyTensor(MemoryType::kGPU, nvTokenIdType); - dOutput->parentIds = mBufferManager.emptyTensor(MemoryType::kGPU, nvTokenIdType); + dOutput->parentIds = mBufferManager.emptyTensor(MemoryType::kGPU, nvSizeType); mFinishedSteps = mBufferManager.emptyTensor(MemoryType::kGPU, TRTDataType::value); - mDraftProbs = mBufferManager.emptyTensor(MemoryType::kGPU, nvFloatType); - mTargetProbs = mBufferManager.emptyTensor(MemoryType::kGPU, nvFloatType); - mBatchSlotsSetup = mBufferManager.emptyTensor(MemoryType::kPINNEDPOOL, TRTDataType::value); - mBatchSlotsDecoder = mBufferManager.emptyTensor(MemoryType::kPINNEDPOOL, TRTDataType::value); - mBatchSlotsAcceptTokens = mBufferManager.emptyTensor(MemoryType::kPINNEDPOOL, TRTDataType::value); - mBatchSlotsAcceptLogits = mBufferManager.emptyTensor(MemoryType::kPINNEDPOOL, TRTDataType::value); + mBatchSlotsSetup = mBufferManager.emptyTensor(MemoryType::kPINNEDPOOL, nvSizeType); + mBatchSlotsDecoder = mBufferManager.emptyTensor(MemoryType::kPINNEDPOOL, nvSizeType); // use batchSize many entries instead of the usual 1 dOutput->finishedSum = mBufferManager.emptyTensor(MemoryType::kPINNEDPOOL, nvSizeType); mFinishedSum = BufferManager::pinned(ITensor::makeShape({1}), nvSizeType); @@ -127,16 +125,12 @@ 
GptDecoderBatched::GptDecoderBatched(std::size_t vocabSize, std::size_t vocabSiz dOutput->finishReasons = mBufferManager.emptyTensor(MemoryType::kGPU, TRTDataType::value); - mNumDraftTokens = mBufferManager.emptyTensor(MemoryType::kGPU, nvSizeType); - mCurandStates = mBufferManager.emptyTensor(MemoryType::kGPU, nvinfer1::DataType::kINT8); - mDraftTokenIds = mBufferManager.emptyTensor(MemoryType::kGPU, nvinfer1::DataType::kINT32); - mDraftLogits = mBufferManager.emptyTensor(MemoryType::kGPU, nvFloatType); - mTargetLogitsPtrs = mBufferManager.emptyTensor(MemoryType::kPINNEDPOOL, TRTDataType::value); + dOutput->logProbsTiled = mBufferManager.emptyTensor(MemoryType::kGPU, TRTDataType::value); dInput->stopWordsPtrs = mBufferManager.emptyTensor(MemoryType::kPINNEDPOOL, TRTDataType::value); - dInput->stopWordsLens = mBufferManager.emptyTensor(MemoryType::kPINNEDPOOL, TRTDataType::value); + dInput->stopWordsLens = mBufferManager.emptyTensor(MemoryType::kPINNEDPOOL, nvSizeType); dInput->badWordsPtrs = mBufferManager.emptyTensor(MemoryType::kPINNEDPOOL, TRTDataType::value); - dInput->badWordsLens = mBufferManager.emptyTensor(MemoryType::kPINNEDPOOL, TRTDataType::value); + dInput->badWordsLens = mBufferManager.emptyTensor(MemoryType::kPINNEDPOOL, nvSizeType); dInput->embeddingBias = mBufferManager.emptyTensor(MemoryType::kGPU, dtype); int device; @@ -147,13 +141,13 @@ GptDecoderBatched::GptDecoderBatched(std::size_t vocabSize, std::size_t vocabSiz if (!mSpeculativeDecodingMode.isNone()) { - allocateSpeculativeDecodingBuffers(); + allocateSpeculativeDecodingBuffers(dtype); } TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); } -void GptDecoderBatched::allocateSpeculativeDecodingBuffers() +void GptDecoderBatched::allocateSpeculativeDecodingBuffers(nvinfer1::DataType dtype) { TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); auto constexpr nvSizeType = TRTDataType::value; @@ -199,6 +193,22 @@ void GptDecoderBatched::allocateSpeculativeDecodingBuffers() } dOutput->speculativeDecodingOutputs = speculativeDecodingOutputs; + if (mSpeculativeDecodingMode.isDraftTokensExternal()) + { + DecodingInput::ExternalDraftTokensInputs externalDraftTokensInputs; + + externalDraftTokensInputs.draftLogits = mBufferManager.emptyTensor(MemoryType::kGPU, dtype); + externalDraftTokensInputs.draftProbs = mBufferManager.emptyTensor(MemoryType::kGPU, dtype); + externalDraftTokensInputs.targetProbs = mBufferManager.emptyTensor(MemoryType::kGPU, dtype); + externalDraftTokensInputs.numDraftTokens = mBufferManager.emptyTensor(MemoryType::kGPU, nvSizeType); + externalDraftTokensInputs.useDraftLogits + = mBufferManager.emptyTensor(MemoryType::kGPU, TRTDataType::value); + externalDraftTokensInputs.draftTokenIds + = mBufferManager.emptyTensor(MemoryType::kGPU, nvinfer1::DataType::kINT32); + + dInput->externalDraftTokensInputs = externalDraftTokensInputs; + } + TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); } @@ -249,6 +259,7 @@ void GptDecoderBatched::setup(executor::DecodingMode const& mode, SizeType32 max auto const maxTokensPerStepXmaxBatchSizeXmaxBeamWidth = ITensor::makeShape({maxTokensPerEngineStep, maxBatchSize, maxBeamWidth}); auto const maxBatchSizeXmaxTokensPerStep = ITensor::makeShape({maxBatchSize, maxTokensPerEngineStep}); + auto const jointOutputIdsShape = ITensor::makeShape({maxBatchSize, maxBeamWidth, maxSequenceLength}); auto& dInput = *mJointDecodingInput; dInput.maxLength = mMaxSequenceLength; @@ -266,8 +277,6 @@ void GptDecoderBatched::setup(executor::DecodingMode const& mode, SizeType32 max 
inputLengths.reshape(maxBatchSizeXmaxBeamWidth); mBufferManager.setZero(inputLengths); - auto const jointOutputIdsShape = ITensor::makeShape({maxBatchSize, maxBeamWidth, maxSequenceLength}); - auto& dOutput = *mJointDecodingOutput; dOutput.ids->reshape(jointOutputIdsShape); @@ -294,15 +303,18 @@ void GptDecoderBatched::setup(executor::DecodingMode const& mode, SizeType32 max mBatchSlotsSetup->reshape(ITensor::makeShape({maxBatchSize})); mBatchSlotsDecoder->reshape(ITensor::makeShape({maxTokensPerEngineStep, maxBatchSize})); - mBatchSlotsAcceptTokens->reshape(ITensor::makeShape({maxTokensPerEngineStep, maxBatchSize})); - mBatchSlotsAcceptLogits->reshape(ITensor::makeShape({maxTokensPerEngineStep, maxBatchSize})); if (mSpeculativeDecodingMode.isDraftTokensExternal()) { - mDraftProbs->reshape(ITensor::makeShape( + dInput.externalDraftTokensInputs->draftProbs->reshape(ITensor::makeShape( {maxBatchSize, maxTokensPerEngineStep, maxBeamWidth, static_cast(mVocabSizePadded)})); - mTargetProbs->reshape(ITensor::makeShape( + dInput.externalDraftTokensInputs->targetProbs->reshape(ITensor::makeShape( {maxBatchSize, maxTokensPerEngineStep, maxBeamWidth, static_cast(mVocabSizePadded)})); + dInput.externalDraftTokensInputs->draftLogits->reshape( + ITensor::makeShape({maxBatchSize, maxTokensPerEngineStep, static_cast(mVocabSizePadded)})); + dInput.externalDraftTokensInputs->draftTokenIds->reshape(maxBatchSizeXmaxTokensPerStep); + dInput.externalDraftTokensInputs->numDraftTokens->reshape(ITensor::makeShape({maxBatchSize, 1})); + dInput.externalDraftTokensInputs->useDraftLogits->reshape(ITensor::makeShape({maxBatchSize, 1})); } dOutput.parentIds->reshape(jointOutputIdsShape); @@ -315,7 +327,7 @@ void GptDecoderBatched::setup(executor::DecodingMode const& mode, SizeType32 max dOutput.cumLogProbs->reshape(maxBatchSizeXmaxBeamWidth); mBufferManager.setZero(*dOutput.cumLogProbs); - dOutput.logProbs->reshape(ITensor::makeShape({maxBatchSize, maxBeamWidth, mMaxSequenceLength})); + dOutput.logProbs->reshape(jointOutputIdsShape); mBufferManager.setZero(*dOutput.logProbs); if (maxBeamWidth > 1) @@ -323,14 +335,8 @@ void GptDecoderBatched::setup(executor::DecodingMode const& mode, SizeType32 max dOutput.beamHypotheses.reshape(maxBatchSize, maxBeamWidth, mMaxSequenceLength); } - // speculative decoding only works for beam width == 1 - mDraftTokenIds->reshape(maxBatchSizeXmaxTokensPerStep); - mDraftLogits->reshape( - ITensor::makeShape({maxBatchSize, maxTokensPerEngineStep, static_cast(mVocabSizePadded)})); - mAcceptByLogits.resize(maxBatchSize); - mNumDraftTokens->reshape(ITensor::makeShape({maxBatchSize, 1})); - mCurandStates->reshape(ITensor::makeShape({maxBatchSize, sizeof(curandState_t)})); - mTargetLogitsPtrs->reshape(ITensor::makeShape({maxTokensPerEngineStep, maxBatchSize})); + dOutput.logProbsTiled->reshape(ITensor::makeShape({maxSequenceLength, maxBatchSize, maxBeamWidth})); + mBufferManager.setZero(*dOutput.logProbsTiled); const_cast(*dInput.embeddingBias) .reshape(ITensor::makeShape({maxBatchSize, static_cast(mVocabSizePadded)})); @@ -586,7 +592,6 @@ void GptDecoderBatched::newRequestSpeculativeDecoding( SizeType32 batchIdx, decoder_batch::Request const& request, SamplingConfig const& samplingConfig) { TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); - mAcceptByLogits[batchIdx] = false; if (mSpeculativeDecodingMode.predictsDraftTokens()) { @@ -634,40 +639,41 @@ void GptDecoderBatched::newRequestDraftTokensExternal( auto const& stream = mDecoderStream; BufferManager manager{stream}; - auto constexpr 
localBatchSize = 1; + auto& dJointInput = *mJointDecodingInput; + auto useDraftLogits = false; auto const numDraftTokens = request.generatedTokensPerEngineStep - 1; if (request.draftLogits.has_value()) { TensorPtr draftLogitsView = ITensor::view(request.draftLogits.value()); - mAcceptByLogits[batchIdx] = true; + useDraftLogits = true; - TensorPtr draftLogitsReqBatchSlice = ITensor::slice(mDraftLogits, batchIdx, 1); + TensorPtr draftLogitsReqBatchSlice + = ITensor::slice(dJointInput.externalDraftTokensInputs->draftLogits, batchIdx, 1); draftLogitsReqBatchSlice->squeeze(0); TensorPtr draftLogitsReqTokensSlice = ITensor::slice(draftLogitsReqBatchSlice, 0, numDraftTokens); manager.copy(*draftLogitsView, *draftLogitsReqTokensSlice); } - TensorPtr draftTokensReqBatchSlice = ITensor::slice(mDraftTokenIds, batchIdx, 1); + auto useDraftLogitsView = ITensor::slice(dJointInput.externalDraftTokensInputs->useDraftLogits, batchIdx, 1); + kernels::invokeFill(*useDraftLogitsView, useDraftLogits, *stream); + + TensorPtr draftTokensReqBatchSlice + = ITensor::slice(dJointInput.externalDraftTokensInputs->draftTokenIds, batchIdx, 1); draftTokensReqBatchSlice->squeeze(0); TensorPtr draftTokensReqTokensSlice = ITensor::slice(draftTokensReqBatchSlice, 0, numDraftTokens); TensorPtr draftTokensView = ITensor::view(request.draftTokens, ITensor::makeShape({numDraftTokens})); manager.copy(*draftTokensView, *draftTokensReqTokensSlice); - auto const curandStatesView = ITensor::slice(mCurandStates, batchIdx, 1); - auto curandState = reinterpret_cast(bufferCast(*curandStatesView)); - auto batchSlotsPtr = bufferCast(*ITensor::slice(mBatchSlotsSetup, 0, localBatchSize)); - if (samplingConfig.randomSeed.has_value()) - { - tk::invokeCurandInitialize( - curandState, batchSlotsPtr, localBatchSize, samplingConfig.randomSeed.value()[0], stream->get()); - } - else - { - tk::invokeCurandInitialize(curandState, batchSlotsPtr, localBatchSize, 0, stream->get()); - } - auto numDraftTokensView = ITensor::slice(mNumDraftTokens, batchIdx, 1); + auto numDraftTokensView = ITensor::slice(dJointInput.externalDraftTokensInputs->numDraftTokens, batchIdx, 1); kernels::invokeFill(*numDraftTokensView, numDraftTokens, *stream); + bool const useRandomAcceptanceThreshold = !samplingConfig.draftAcceptanceThreshold.has_value(); + float const constantThreshold + = useRandomAcceptanceThreshold ? 0 : samplingConfig.draftAcceptanceThreshold.value()[0]; + + dJointInput.externalDraftTokensInputs->useRandomAcceptanceThreshold = useRandomAcceptanceThreshold; + dJointInput.externalDraftTokensInputs->constantThreshold = constantThreshold; + TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); } @@ -798,7 +804,7 @@ void GptDecoderBatched::forwardDispatch( } } -GptDecoderBatched::TokenPtr GptDecoderBatched::forwardAsync( +GptDecoderBatched::DecoderFinishedEventPtr GptDecoderBatched::forwardAsync( decoder_batch::Output& output, decoder_batch::Input const& input) { TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); @@ -808,7 +814,7 @@ GptDecoderBatched::TokenPtr GptDecoderBatched::forwardAsync( CudaEvent eventStop{}; mRuntimeStream->record(eventStop); TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); - return std::make_unique(std::move(eventStop), input.active); + return std::make_unique(std::move(eventStop), input.active); } void GptDecoderBatched::forwardDecoder( @@ -833,8 +839,6 @@ void GptDecoderBatched::forwardDecoder( auto batchSlotsDecoderPtr = maxBeamWidth > 1 && input.seqSlots ? 
bufferCast(*input.seqSlots) : bufferCast(*mBatchSlotsDecoder); - auto batchSlotsAcceptTokensPtr = bufferCast(*mBatchSlotsAcceptTokens); - auto batchSlotsAcceptLogitsPtr = bufferCast(*mBatchSlotsAcceptLogits); auto& dInput = *mJointDecodingInput; auto& dOutput = *mJointDecodingOutput; auto& decoder = *mDecoder; @@ -859,26 +863,12 @@ void GptDecoderBatched::forwardDecoder( } SizeType32 localBatchDecoderIdx = 0; - SizeType32 localBatchAcceptTokensIdx = 0; - SizeType32 localBatchAcceptLogitsIdx = 0; for (SizeType32 bi = 0; bi < mActualBatchSize; ++bi) { if (mFinished[bi] || !input.active.at(bi) || step >= mNumDecodingEngineTokens[bi]) { continue; } - - if (!mAcceptByLogits[bi] && mMaxDecodingDecoderTokens == 1 && mNumDecodingEngineTokens[bi] > 1 - && step == mNumDecodingEngineTokens[bi] - 1) - { - batchSlotsAcceptTokensPtr[step * mActualBatchSize + localBatchAcceptTokensIdx] = bi; - localBatchAcceptTokensIdx++; - } - else if (mAcceptByLogits[bi] && mMaxDecodingDecoderTokens == 1 && mNumDecodingEngineTokens[bi] > 1 && step == 0) - { - batchSlotsAcceptLogitsPtr[step * mActualBatchSize + localBatchAcceptLogitsIdx] = bi; - localBatchAcceptLogitsIdx++; - } batchSlotsDecoderPtr[step * mActualBatchSize + localBatchDecoderIdx] = bi; localBatchDecoderIdx++; } @@ -887,9 +877,6 @@ void GptDecoderBatched::forwardDecoder( = *std::max_element(std::begin(mNumDecodingEngineTokens), std::end(mNumDecodingEngineTokens)); std::vector logitsVec; - auto targetLogitsPtrsSlice = ITensor::slice(mTargetLogitsPtrs, step, 1); - auto targetLogitsPtrsSlicePtr = reinterpret_cast(bufferCast(*targetLogitsPtrsSlice)); - SizeType32 targetLogitsIdx = 0; for (SizeType32 bi = 0; bi < mActualBatchSize; ++bi) { if (mFinished[bi] || !input.active.at(bi) || step >= mNumDecodingEngineTokens[bi]) @@ -899,32 +886,6 @@ void GptDecoderBatched::forwardDecoder( auto const& targetLogits = allTargetLogits[bi]; TensorPtr logitsSlice = ITensor::slice(targetLogits, step, singleRequest); logitsVec.push_back(logitsSlice); - targetLogitsPtrsSlicePtr[targetLogitsIdx++] = logitsSlice->data(); - } - - if (async && localBatchAcceptLogitsIdx > 0) - { - // These params are only used for testing. Thus, can be per batch instead of per request - auto const& samplingConfig = decoder.getSamplingConfig(); - bool const useRandomAcceptanceThreshold = !samplingConfig.draftAcceptanceThreshold.has_value(); - float const randomAcceptanceThreshold - = useRandomAcceptanceThreshold ? 
0 : samplingConfig.draftAcceptanceThreshold.value()[0]; - - TensorPtr batchSlotsAcceptLogitsStepSlice = ITensor::slice(mBatchSlotsAcceptLogits, step, 1); - batchSlotsAcceptLogitsStepSlice->squeeze(0); - TensorPtr batchSlotsAcceptLogitsSlice - = ITensor::slice(batchSlotsAcceptLogitsStepSlice, 0, localBatchAcceptLogitsIdx); - - IGptDecoder::acceptDraftTokensByLogits( - /* [maxBatchSize, maxDecodingTokens, vocabPadded] */ *mDraftLogits, - /* [maxBatchSize][maxDecodingTokens, vocabPadded] */ *targetLogitsPtrsSlice, - /* [maxBatchSize, maxDecodingTokens, vocabPadded] */ *mDraftProbs, - /* [maxBatchSize, maxDecodingTokens, vocabPadded] */ *mTargetProbs, - /* [maxBatchSize] */ *mNumDraftTokens, - /* [maxDecodingTokens, maxBatchSize] */ *mFinishedSteps, - /* [bs] */ *batchSlotsAcceptLogitsSlice, static_cast(mVocabSize), - static_cast(mVocabSizePadded), useRandomAcceptanceThreshold, randomAcceptanceThreshold, - reinterpret_cast(bufferCast(*mCurandStates)), stream); } TensorPtr finishedStepsInput = ITensor::slice(mFinishedSteps, step, 1); @@ -953,6 +914,11 @@ void GptDecoderBatched::forwardDecoder( dInput.medusaInputs->medusaLogits = input.predictedDraftLogits; } + if (mSpeculativeDecodingMode.isDraftTokensExternal()) + { + dInput.externalDraftTokensInputs->step = step; + } + dOutput.newTokens = newTokensStepView; dOutput.finishReasons = finishedStepsOutput; dOutput.lengths = sequenceLengths; @@ -982,26 +948,6 @@ void GptDecoderBatched::forwardDecoder( mNbSteps[bi] += 1; mFinished[bi] = mNbSteps[bi] >= mMaxNewTokens[bi]; } - if (async && localBatchAcceptTokensIdx > 0) - { - TensorPtr batchSlotsAcceptTokensStepSlice = ITensor::slice(mBatchSlotsAcceptTokens, step, 1); - batchSlotsAcceptTokensStepSlice->squeeze(0); - auto batchSlotsAcceptTokensSlice - = ITensor::slice(batchSlotsAcceptTokensStepSlice, 0, localBatchAcceptTokensIdx); - - // Update finished state for 0th step - auto finishedFinal = ITensor::slice(mFinishedSteps, step, 1); - IGptDecoder::acceptDraftTokensByIds( - /* [maxBatchSize, maxBeamWidth, maxSeqLen] */ *dOutput.ids, - /* [maxBatchSize, maxDecodingDraftTokens] */ *mDraftTokenIds, - /* [maxBatchSize] */ *dInput.lengths, - /* [maxBatchSize] */ *mNumDraftTokens, - /* [maxBatchSize] */ *dOutput.lengths, - /* [maxDecodingTokens, maxBatchSize] */ *mFinishedSteps, - /* [maxBatchSize] */ *finishedFinal, - /* [maxBatchSize] */ *dOutput.finishedSum, - /* [bs] */ *batchSlotsAcceptTokensSlice, stream); - } // If last iteration if (async && step == maxDecodingEngineTokens - mMaxDecodingDecoderTokens) @@ -1014,12 +960,12 @@ void GptDecoderBatched::forwardDecoder( TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); } -void GptDecoderBatched::updateFinished(decoder_batch::Token const& token) +void GptDecoderBatched::updateFinished(decoder_batch::DecoderFinishedEvent const& decoderFinishEvent) { TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); for (std::int32_t i = 0; i < mActualBatchSize; ++i) { - if (token.active[i] && !mFinished[i]) + if (decoderFinishEvent.active[i] && !mFinished[i]) { auto finishedSum = ITensor::slice(mJointDecodingOutput->finishedSum, i, 1); mFinished[i] = mFinished[i] @@ -1030,25 +976,25 @@ void GptDecoderBatched::updateFinished(decoder_batch::Token const& token) TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); } -void GptDecoderBatched::forwardSync(decoder_batch::Token const& token) +void GptDecoderBatched::forwardSync(decoder_batch::DecoderFinishedEvent const& decoderFinishEvent) { TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); - token.event.synchronize(); + 
decoderFinishEvent.event.synchronize(); - updateFinished(token); + updateFinished(decoderFinishEvent); TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); } -void GptDecoderBatched::forwardSync( - decoder_batch::Token const& token, decoder_batch::Output& output, decoder_batch::Input const& input) +void GptDecoderBatched::forwardSync(decoder_batch::DecoderFinishedEvent const& decoderFinishEvent, + decoder_batch::Output& output, decoder_batch::Input const& input) { TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); - token.event.synchronize(); + decoderFinishEvent.event.synchronize(); forwardDispatch(output, input, ForwardType::kSYNC); - updateFinished(token); + updateFinished(decoderFinishEvent); TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); } @@ -1060,7 +1006,6 @@ CudaEvent GptDecoderBatched::postProcessRequest( TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); auto& stream = mRuntimeStream; - auto& decoder = *mDecoder; auto manager = BufferManager{stream}; auto& dJointInput = *mJointDecodingInput; @@ -1093,7 +1038,7 @@ CudaEvent GptDecoderBatched::postProcessRequest( TLLM_CHECK(dOutput.newTokens->getShape().d[0] == 1); dOutput.newTokens->squeeze(0); dOutput.newTokens = ITensor::slice(dOutput.newTokens, batchSlot, 1); - + dOutput.logProbsTiled = dJointOutput.logProbsTiled; if (streaming) { // in case of streaming we shouldn't overwrite the data in beamHypotheses, since the beam search kernels expect @@ -1105,7 +1050,7 @@ CudaEvent GptDecoderBatched::postProcessRequest( dOutput.cumLogProbs = mCumLogProbsTmp; } - decoder.gatherTree(dOutput, dInput, manager, samplingConfig); + kernels::gatherTree(dOutput, dInput, manager, samplingConfig); CudaEvent event{}; stream->record(event); @@ -1227,7 +1172,7 @@ void GptDecoderBatched::forwardAsync(decoder::Output& output, decoder::Input con batchOutput.cacheIndirection = output.cacheIndirection; batchOutput.sequenceLengths = output.sequenceLengths; - mForwardToken = forwardAsync(batchOutput, batchInput); + mDecoderFinishEvent = forwardAsync(batchOutput, batchInput); mBufferManager.setZero(*mFinishedSum); kernels::reduce( *mFinishedSum, *ITensor::slice(mJointDecodingOutput->finishedSum, 0, mActualBatchSize), *mRuntimeStream); @@ -1239,7 +1184,7 @@ void GptDecoderBatched::forwardAsync(decoder::Output& output, decoder::Input con void GptDecoderBatched::forwardSync() { TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); - forwardSync(*mForwardToken); + forwardSync(*mDecoderFinishEvent); // wait for mFinishedSum to be updated mForwardEvent.synchronize(); TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); diff --git a/cpp/tensorrt_llm/runtime/gptJsonConfig.cpp b/cpp/tensorrt_llm/runtime/gptJsonConfig.cpp index e64fde8e5..da58300fa 100644 --- a/cpp/tensorrt_llm/runtime/gptJsonConfig.cpp +++ b/cpp/tensorrt_llm/runtime/gptJsonConfig.cpp @@ -85,6 +85,8 @@ std::vector buildLayerTypes( auto constexpr layerNameAttention = "attention"; auto constexpr layerNameRecurrent = "recurrent"; + auto constexpr layerNameLinear = "linear"; + auto constexpr layerNameNoop = "no_op"; // The json field specifies a "group" of layers, which gets repeated multiple times // Note that the total number of layers does not need to be a multiple of a layer @@ -102,9 +104,17 @@ std::vector buildLayerTypes( { result[i] = ModelConfig::LayerType::kRECURRENT; } + else if (layerStringTypes[i % groupSize] == layerNameLinear) + { + result[i] = ModelConfig::LayerType::kLINEAR; + } + else if (layerStringTypes[i % groupSize] == layerNameNoop) + { + result[i] = ModelConfig::LayerType::kNOOP; + } else { - 
TLLM_LOG_ERROR("Unknown layer type: %s", layerStringTypes[i % groupSize].c_str()); + TLLM_LOG_WARNING("Unknown layer type: %s, assuming attention", layerStringTypes[i % groupSize].c_str()); } } @@ -147,9 +157,25 @@ ModelConfig createModelConfig( auto const mlpHiddenSize = parseJsonFieldOptional(config, mlpHiddenSizeField); - auto modelConfig = ModelConfig{vocabSize, numAttentionLayers, numRnnLayers, numHeads, hiddenSize, dataType}; + auto numKvHeadsPerAttentionLayer + = parseJsonFieldOr>(config, "num_kv_heads_per_layer", std::vector()); + + auto modelConfig + = ModelConfig{vocabSize, numLayers, numAttentionLayers, numRnnLayers, numHeads, hiddenSize, dataType}; + + if (!numKvHeadsPerAttentionLayer.empty()) + { + std::transform(numKvHeadsPerAttentionLayer.cbegin(), numKvHeadsPerAttentionLayer.cend(), + numKvHeadsPerAttentionLayer.begin(), + [tensorParallelism](SizeType32 const numKvHeads) { return std::max(numKvHeads / tensorParallelism, 1); }); + modelConfig.setNumKvHeadsPerLayer(numKvHeadsPerAttentionLayer); + } + else + { + modelConfig.setNbKvHeads(numKvHeads); + } + modelConfig.setSizePerHead(sizePerHead); - modelConfig.setNbKvHeads(numKvHeads); modelConfig.setLayerTypes(layerTypes); // Set logits datatype @@ -269,13 +295,24 @@ void parseLora(ModelConfig& modelConfig, Json const& json, Json const& pluginCon if (loraTargetModules.has_value()) { + auto const& loraModuleNames = loraTargetModules.value(); + auto const& numKvHeadsPerLayer = modelConfig.getNumKvHeadsPerLayer(); + if (!loraModuleNames.empty()) + { + TLLM_CHECK_WITH_INFO(std::all_of(numKvHeadsPerLayer.cbegin(), numKvHeadsPerLayer.cend(), + [firstNumKvHeads = numKvHeadsPerLayer[0]](SizeType32 numKvHeads) + { return numKvHeads == firstNumKvHeads; }), + "LORA with a VGQA model is not supported"); + } + // TODO(oargov): don't assume all layers have the same num_kv_heads to support VGQA + auto const numKvHeads = numKvHeadsPerLayer.empty() ? modelConfig.getNbHeads() : numKvHeadsPerLayer[0]; bool hasMoE = !engineVersionNone && json.at("pretrained_config").contains("moe"); auto const numExperts = hasMoE ? json.at("pretrained_config").at("moe").at("num_experts").template get() : SizeType32{0}; modelConfig.setLoraModules(LoraModule::createLoraModules(loraTargetModules.value(), modelConfig.getHiddenSize(), - modelConfig.getMlpHiddenSize(), modelConfig.getNbHeads(), modelConfig.getNbKvHeads(), - modelConfig.getSizePerHead(), tensorParallelism, numExperts)); + modelConfig.getMlpHiddenSize(), modelConfig.getNbHeads(), numKvHeads, modelConfig.getSizePerHead(), + tensorParallelism, numExperts)); } modelConfig.setMaxLoraRank(loraMaxRank); diff --git a/cpp/tensorrt_llm/runtime/gptSession.cpp b/cpp/tensorrt_llm/runtime/gptSession.cpp index c5d4dda55..73df2cb3f 100644 --- a/cpp/tensorrt_llm/runtime/gptSession.cpp +++ b/cpp/tensorrt_llm/runtime/gptSession.cpp @@ -72,7 +72,6 @@ auto const kProfileMbIdxs = populateMicrobatchIndexes(); GptSession::Config setPath(GptSession::Config const& original, std::string const& path) { GptSession::Config config = original; - config.enginePath = std::filesystem::path(path); return config; } @@ -219,8 +218,13 @@ void GptSession::createKvCacheManager(SizeType32 maxBatchSize, SizeType32 maxBea // tokens, when enabling cyclic kv cache. 
auto const useOneMoreBlock = maxBeamWidth > 1 && maxSequenceLength > maxAttentionWindow; - auto const localNbLayers = mModelConfig.getNbAttentionLayers(mWorldConfig.getPipelineParallelism()); - auto const nbKvHeads = mModelConfig.getNbKvHeads(); + auto [numKvHeadsPerLayerBegin, numKvHeadsPerLayerEnd] = mModelConfig.getNumKvHeadsPerLayerLocalRange( + mWorldConfig.getPipelineParallelism(), mWorldConfig.getPipelineParallelRank()); + TLLM_CHECK_WITH_INFO(std::all_of(numKvHeadsPerLayerBegin, numKvHeadsPerLayerEnd, + [firstNumKvHeads = *numKvHeadsPerLayerBegin](SizeType32 numKvHeads) + { return numKvHeads == firstNumKvHeads; }), + "Deprecated session API does not support multiple cache pools, use the newer executor API instead"); + auto const sizePerHead = mModelConfig.getSizePerHead(); bool constexpr enableBlockReuse{false}; bool enableDiffMaxAttenWin = false; @@ -235,7 +239,8 @@ void GptSession::createKvCacheManager(SizeType32 maxBatchSize, SizeType32 maxBea TLLM_CHECK_WITH_INFO(maxBeamWidth == 1 || !enableDiffMaxAttenWin, "Can't support layer-wise max_attention_window with beam search. Please use a unified max_attention_window for " "all layers."); - mKvCacheManager = std::make_shared(localNbLayers, nbKvHeads, sizePerHead, tokensPerBlock, + mKvCacheManager = std::make_shared( + std::vector(numKvHeadsPerLayerBegin, numKvHeadsPerLayerEnd), sizePerHead, tokensPerBlock, blocksInPrimaryPool, blocksInSecondaryPool, maxBatchSize, maxBeamWidth, maxAttentionWindow, sinkTokenLength, useOneMoreBlock, mRuntime->getStreamPtr(), enableBlockReuse, kvCacheConfig.onboardBlocks); @@ -253,6 +258,7 @@ void GptSession::createKvCacheManager(SizeType32 maxBatchSize, SizeType32 maxBea for (auto& buffers : mBuffers) { buffers->transformerBuffers->setKvPoolPointers(mKvCacheManager.get()); + buffers->transformerBuffers->setKvPoolMapping(mKvCacheManager.get()); } TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); diff --git a/cpp/tensorrt_llm/runtime/ipcUtils.cpp b/cpp/tensorrt_llm/runtime/ipcUtils.cpp index a1ab91c4a..f0a3fafbd 100644 --- a/cpp/tensorrt_llm/runtime/ipcUtils.cpp +++ b/cpp/tensorrt_llm/runtime/ipcUtils.cpp @@ -38,7 +38,14 @@ bool canAccessPeer(WorldConfig const& worldConfig) for (SizeType32 rank : worldConfig.getTensorParallelGroup()) { SizeType32 destDevice = worldConfig.getDeviceOf(rank); - if (worldConfig.getNodeRankOf(rank) != worldConfig.getNodeRank() || destDevice == srcDevice) + if (worldConfig.getNodeRankOf(rank) != worldConfig.getNodeRank()) + { + TLLM_LOG_INFO("Detect inter-node TP between rank %d and rank %d, fail to access peer GPU memory", + worldConfig.getRank(), rank); + TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); + return false; + } + if (destDevice == srcDevice) { continue; } @@ -149,19 +156,24 @@ AllReduceBuffers::AllReduceBuffers(SizeType32 maxBatchSize, SizeType32 maxBeamWi * std::min( static_cast(maxBatchSize) * maxBeamWidth * maxSequenceLength * hiddenSize * sizeof(float), utils::customAllReduceUtils::getMaxRequiredWorkspaceSize(tpSize)); + auto const lamportBufferSize + = tpSize * tensorrt_llm::kernels::reduce_fusion::details::kLamportTokenNumThreshold * hiddenSize * sizeof(half); auto const flagsSize = IpcMemory::FLAGS_SIZE * tpSize * 2; - for (auto size : {bufferSize, bufferSize, flagsSize, flagsSize}) + for (auto size : + {bufferSize, bufferSize, flagsSize, flagsSize, lamportBufferSize, lamportBufferSize, lamportBufferSize}) { mIpcMemoryHandles.emplace_back(size, manager, worldConfig, isP2pSupported); } mAllReduceCommPtrs - = 
BufferManager::cpu(ITensor::makeShape({static_cast(mIpcMemoryHandles.size()) * tpSize + 1}), + = BufferManager::cpu(ITensor::makeShape({static_cast(mIpcMemoryHandles.size()) * tpSize + 2}), nvinfer1::DataType::kINT64); auto commPtrs = BufferRange(*mAllReduceCommPtrs); - auto const flagPtr = static_cast(mAllReduceCommPtrs->data(mAllReduceCommPtrs->getSize() - 1)); - *flagPtr = 0; + auto const CustomARFlagPtr = static_cast(mAllReduceCommPtrs->data(mAllReduceCommPtrs->getSize() - 1)); + auto const LamportFlagPtr = static_cast(mAllReduceCommPtrs->data(mAllReduceCommPtrs->getSize() - 2)); + *CustomARFlagPtr = 0; + *LamportFlagPtr = 0; for (std::size_t memIdx = 0; memIdx < mIpcMemoryHandles.size(); memIdx++) { @@ -169,6 +181,22 @@ AllReduceBuffers::AllReduceBuffers(SizeType32 maxBatchSize, SizeType32 maxBeamWi TLLM_CHECK(memCommPtrs.size() == static_cast(tpSize)); std::copy(memCommPtrs.begin(), memCommPtrs.end(), commPtrs.begin() + memIdx * tpSize); } +#if ENABLE_MULTI_DEVICE + auto rank = worldConfig.getRank(); + auto tp_rank = worldConfig.getTensorParallelRank(); + // When p2p is not supported all the mIpcMemoryHandles are + // null + if (rank == tp_rank && isP2pSupported) + { + tensorrt_llm::kernels::lamportInitialize( + mIpcMemoryHandles[4].getCommPtrs()[rank], lamportBufferSize / sizeof(half), nvinfer1::DataType::kHALF, 0); + tensorrt_llm::kernels::lamportInitialize( + mIpcMemoryHandles[5].getCommPtrs()[rank], lamportBufferSize / sizeof(half), nvinfer1::DataType::kHALF, 0); + tensorrt_llm::kernels::lamportInitialize( + mIpcMemoryHandles[6].getCommPtrs()[rank], lamportBufferSize / sizeof(half), nvinfer1::DataType::kHALF, 0); + cudaDeviceSynchronize(); + } +#endif TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); } diff --git a/cpp/tensorrt_llm/runtime/lookaheadBuffers.cpp b/cpp/tensorrt_llm/runtime/lookaheadBuffers.cpp index 465641bf0..8f543f9ed 100644 --- a/cpp/tensorrt_llm/runtime/lookaheadBuffers.cpp +++ b/cpp/tensorrt_llm/runtime/lookaheadBuffers.cpp @@ -11,7 +11,9 @@ */ #include "tensorrt_llm/runtime/lookaheadBuffers.h" +#include "iTensor.h" #include "tensorrt_llm/common/logger.h" +#include "tensorrt_llm/layers/lookaheadDecodingUtils.h" #include "tensorrt_llm/runtime/common.h" namespace tensorrt_llm::runtime @@ -28,8 +30,6 @@ LookaheadDecodingBuffers::LookaheadDecodingBuffers( , positionIds( bufferManager.gpu(ITensor::makeShape({maxNumSequences, maxTokensPerStep}), nvinfer1::DataType::kINT32)) { - TLLM_LOG_DEBUG( - "LookaheadDecodingBuffers, maxNumSequences = %d, maxTokensPerStep = %d", maxNumSequences, maxTokensPerStep); } LookaheadRuntimeBuffers::LookaheadRuntimeBuffers(SizeType32 maxBatchSize, SizeType32 maxBeamWidth, @@ -40,11 +40,11 @@ LookaheadRuntimeBuffers::LookaheadRuntimeBuffers(SizeType32 maxBatchSize, SizeTy TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); TLLM_CHECK_WITH_INFO(maxBeamWidth == 1, "Lookahead decoding does not support beam search"); - // auto const tokensPerStep = modelConfig.getMaxTokensPerStep(); auto const tokensPerStep = modelConfig.getMaxDecodingTokens(); auto const numPackedMasks = static_cast(tensorrt_llm::common::divUp(tokensPerStep, 32)); - // Copy buffers to device + cumSumLength = manager.pinned(ITensor::makeShape({1}), nvinfer1::DataType::kINT32); + packedMasksDevice = manager.gpu(ITensor::makeShape({maxBatchSize * tokensPerStep, numPackedMasks}), nvinfer1::DataType::kINT32); positionOffsetsDevice = manager.gpu(ITensor::makeShape({maxBatchSize, tokensPerStep}), nvinfer1::DataType::kINT32); @@ -54,7 +54,7 @@ 
LookaheadRuntimeBuffers::LookaheadRuntimeBuffers(SizeType32 maxBatchSize, SizeTy packedMaskHost = manager.cpu(packedMasksDevice->getShape(), nvinfer1::DataType::kINT32); positionOffsetsHost = manager.cpu(positionOffsetsDevice->getShape(), nvinfer1::DataType::kINT32); generationLengthsHost = manager.cpu(generationLengthsDevice->getShape(), nvinfer1::DataType::kINT32); - positionIdsHost = manager.gpu(positionOffsetsDevice->getShape(), nvinfer1::DataType::kINT32); + positionIdsHost = manager.cpu(positionIdsDevice->getShape(), nvinfer1::DataType::kINT32); packedMaskHostCopy = manager.cpu(packedMasksDevice->getShape(), nvinfer1::DataType::kINT32); positionOffsetsHostCopy = manager.cpu(positionOffsetsDevice->getShape(), nvinfer1::DataType::kINT32); @@ -76,24 +76,59 @@ void LookaheadRuntimeBuffers::setFromInputs(SizeType32 numCtxSequences, SizeType auto const tokensPerStep = modelConfig.getMaxDecodingTokens(); + manager.copy(seqSlots, *batchSlotsHostCopy); + manager.copy(*decoderLookaheadBuffers.generationLengths, *generationLengthsHostCopy); manager.copy(*decoderLookaheadBuffers.positionOffsets, *positionOffsetsHostCopy); manager.copy(*decoderLookaheadBuffers.packedMasks, *packedMaskHostCopy); manager.copy(*decoderLookaheadBuffers.positionIds, *positionIdsHostCopy); - manager.copy(seqSlots, *batchSlotsHostCopy); - manager.copy(*decoderLookaheadBuffers.generationLengths, *generationLengthsHostCopy); manager.getStream().synchronize(); BufferRange batchSlotsRange(*batchSlotsHostCopy); + BufferRange cumSumLengthRange(*cumSumLength); + + SizeType32 maxGenerationLength = 0; + for (SizeType32 bi = 0; bi < numGenSequences; bi++) + { + SizeType32 gbi = batchSlotsRange[bi + numCtxSequences]; + SizeType32 theLength = BufferRange(*generationLengthsHostCopy)[gbi]; + maxGenerationLength = std::max(maxGenerationLength, theLength); + } + + auto positionOffsetShape = positionOffsetsHost->getShape(); + positionOffsetShape.d[1] = maxGenerationLength; + positionOffsetsHost->reshape(positionOffsetShape); + positionOffsetsDevice->reshape(positionOffsetShape); + + auto positionIdsShape = positionIdsHostCopy->getShape(); + auto positionIdsShape1D = ITensor::makeShape({ITensor::volume(positionIdsShape)}); + positionIdsHostCopy->reshape(positionIdsShape1D); + positionIdsHost->reshape(positionIdsShape1D); + + cumSumLengthRange[0] = 0; for (SizeType32 bi = 0; bi < numGenSequences; bi++) { SizeType32 gbi = batchSlotsRange[bi + numCtxSequences]; + SizeType32 theLength = BufferRange(*generationLengthsHostCopy)[gbi]; + manager.copy(*ITensor::at(generationLengthsHostCopy, {gbi}), *ITensor::at(generationLengthsHost, {bi})); - manager.copy(*ITensor::at(positionOffsetsHostCopy, {gbi}), *ITensor::at(positionOffsetsHost, {bi})); - manager.copy(*ITensor::slice(packedMaskHostCopy, gbi * tokensPerStep, tokensPerStep), - *ITensor::slice(packedMaskHost, bi * tokensPerStep, tokensPerStep)); - manager.copy(*ITensor::at(positionIdsHostCopy, {gbi}), *ITensor::at(positionIdsHost, {bi})); + + manager.copy(*ITensor::slice(positionOffsetsHostCopy, {gbi, 0}, theLength), + *ITensor::slice(positionOffsetsHost, {bi, 0}, theLength)); + + manager.copy(*ITensor::slice(packedMaskHostCopy, gbi * tokensPerStep, theLength), + *ITensor::slice(packedMaskHost, cumSumLengthRange[0], theLength)); + + manager.copy(*ITensor::slice(positionIdsHostCopy, gbi * tokensPerStep, theLength), + *ITensor::slice(positionIdsHost, cumSumLengthRange[0], theLength)); + + cumSumLengthRange[0] += theLength; } + + positionIdsHostCopy->reshape(positionIdsShape); + 
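// --- Illustrative sketch, not part of the patch ---
// The setFromInputs changes above pack per-sequence lookahead rows back-to-back on the
// host, using a running cumulative length (cumSumLength) as the destination offset for
// each sequence. The standalone sketch below shows the same packing idea; packRows and
// the vector-of-vectors input are illustrative assumptions, not TensorRT-LLM API.
#include <cstdint>
#include <utility>
#include <vector>

std::pair<std::vector<std::int32_t>, std::int32_t> packRows(std::vector<std::vector<std::int32_t>> const& rows)
{
    std::vector<std::int32_t> packed;
    std::int32_t cumSumLength = 0; // plays the role of cumSumLengthRange[0] above
    for (auto const& row : rows)
    {
        packed.insert(packed.end(), row.begin(), row.end());
        cumSumLength += static_cast<std::int32_t>(row.size());
    }
    return {packed, cumSumLength};
}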
positionIdsHost->reshape(positionIdsShape); + positionIdsDevice->reshape(positionIdsShape); + manager.copy(*ITensor::slice(generationLengthsHost, 0, numGenSequences), *ITensor::slice(generationLengthsDevice, 0, numGenSequences)); manager.copy(*ITensor::slice(positionOffsetsHost, 0, numGenSequences), @@ -102,6 +137,7 @@ void LookaheadRuntimeBuffers::setFromInputs(SizeType32 numCtxSequences, SizeType *ITensor::slice(packedMasksDevice, 0, numGenSequences * tokensPerStep)); manager.copy( *ITensor::slice(positionIdsHost, 0, numGenSequences), *ITensor::slice(positionIdsDevice, 0, numGenSequences)); + positionIdsDevice->reshape(ITensor::makeShape({cumSumLengthRange[0]})); manager.getStream().synchronize(); diff --git a/cpp/tensorrt_llm/runtime/loraUtils.cpp b/cpp/tensorrt_llm/runtime/loraUtils.cpp index 176fb3b71..34302516c 100644 --- a/cpp/tensorrt_llm/runtime/loraUtils.cpp +++ b/cpp/tensorrt_llm/runtime/loraUtils.cpp @@ -68,6 +68,7 @@ void loraValidateRequestTensors(std::optional const& optTaskId, auto loraModules = modelConfig.getLoraModules(); auto configPtr = bufferCast(*config); + auto maxAdapterSize = modelConfig.getMaxLoraRank(); for (SizeType32 row = 0; row < config->getShape().d[1]; ++row) { auto modId = configPtr[row * kLORA_CONFIG_ROW_SIZE + kLORA_CONFIG_MODULE_OFF]; @@ -83,6 +84,9 @@ void loraValidateRequestTensors(std::optional const& optTaskId, TLLM_CHECK_WITH_INFO(it != loraModules.end(), "lora module " + moduleName + " not enabled for this model"); TLLM_CHECK_WITH_INFO(it->flattenedInOutSize(adapterSize) <= weights->getShape().d[2], "lora_weights has to few values for " + moduleName); + TLLM_CHECK_WITH_INFO(adapterSize <= maxAdapterSize, + "Invalid low_rank (" + std::to_string(adapterSize) + "). low_rank must be smaller than mMaxLowRank (" + + std::to_string(maxAdapterSize) + ")"); } } } diff --git a/cpp/tensorrt_llm/runtime/medusaModule.cpp b/cpp/tensorrt_llm/runtime/medusaModule.cpp index d889dcdfa..174a3b4cb 100644 --- a/cpp/tensorrt_llm/runtime/medusaModule.cpp +++ b/cpp/tensorrt_llm/runtime/medusaModule.cpp @@ -96,7 +96,7 @@ void MedusaModule::initMedusaTensorsFromChoices(MedusaChoices const& choices, st if (curDepth != depth) { TLLM_CHECK(depth + 1 == curDepth); - TLLM_CHECK_WITH_INFO(depth <= getMaxDraftPathLen(), + TLLM_CHECK_WITH_INFO(curDepth <= getMaxDraftPathLen(), "Medusa choices require more Medusa heads than the engine was built with."); // Save TopK topKs[depth - 1] = maxTopK; diff --git a/cpp/tensorrt_llm/runtime/rnnStateBuffers.cpp b/cpp/tensorrt_llm/runtime/rnnStateBuffers.cpp index c4f9d888b..6b9c1175f 100644 --- a/cpp/tensorrt_llm/runtime/rnnStateBuffers.cpp +++ b/cpp/tensorrt_llm/runtime/rnnStateBuffers.cpp @@ -15,11 +15,11 @@ */ #include "tensorrt_llm/runtime/rnnStateBuffers.h" +#include "iBuffer.h" #include "tensorrt_llm/runtime/runtimeBuffers.h" #include "tensorrt_llm/runtime/utils/sessionUtils.h" using namespace tensorrt_llm::runtime; -namespace tc = tensorrt_llm::common; RnnStateBuffers::RnnStateBuffers() { @@ -92,8 +92,8 @@ RnnStateBuffers::RnnStateBuffers( auto statePtrsShape = ITensor::makeShape({localNbLayers}); slotMappingDevice = bufferManager.gpu(slotMappingShape, nvinfer1::DataType::kINT32); slotMappingHost = BufferManager::cpu(slotMappingShape, nvinfer1::DataType::kINT32); - rnnStatePtrs = BufferManager::cpu(statePtrsShape, nvinfer1::DataType::kINT64); - convStatePtrs = BufferManager::cpu(statePtrsShape, nvinfer1::DataType::kINT64); + rnnStatePtrs = BufferManager::cpu(statePtrsShape, TRTDataType::value); + convStatePtrs = 
BufferManager::cpu(statePtrsShape, TRTDataType::value); } else { @@ -179,8 +179,8 @@ void RnnStateBuffers::fillStatePtrs() rnnStatePtr.resize(mLocalNbLayers); convStatePtr.resize(mLocalNbLayers); - void** rnnStatePtrArray = static_cast(rnnStatePtrs->data()); - void** convStatePtrArray = static_cast(convStatePtrs->data()); + auto* rnnStatePtrArray = bufferCast(*rnnStatePtrs); + auto* convStatePtrArray = bufferCast(*convStatePtrs); for (int i = 0; i < mLocalNbLayers; i++) { diff --git a/cpp/tensorrt_llm/runtime/statefulGptDecoder.cpp b/cpp/tensorrt_llm/runtime/statefulGptDecoder.cpp index 8601ad0cc..8e0acf654 100644 --- a/cpp/tensorrt_llm/runtime/statefulGptDecoder.cpp +++ b/cpp/tensorrt_llm/runtime/statefulGptDecoder.cpp @@ -18,6 +18,7 @@ #include "tensorrt_llm/common/memoryUtils.h" #include "tensorrt_llm/kernels/decodingCommon.h" +#include "tensorrt_llm/kernels/decodingKernels.h" #include "tensorrt_llm/runtime/runtimeKernels.h" #include @@ -61,6 +62,7 @@ StatefulGptDecoder::StatefulGptDecoder(std::size_t vocabSize, std::size_t vocabS dOutput->lengths = mBufferManager.emptyTensor(MemoryType::kGPU, nvSizeType); dOutput->cumLogProbs = mBufferManager.emptyTensor(MemoryType::kGPU, nvFloatType); dOutput->beamHypotheses.empty(mBufferManager); + dOutput->logProbsTiled = mBufferManager.emptyTensor(MemoryType::kGPU, TRTDataType::value); dInput->stopWordsPtrs = mBufferManager.emptyTensor(MemoryType::kPINNEDPOOL, TRTDataType::value); dInput->stopWordsLens = mBufferManager.emptyTensor(MemoryType::kPINNEDPOOL, TRTDataType::value); @@ -147,6 +149,8 @@ void StatefulGptDecoder::reshapeBuffers(SizeType32 batchSize, SizeType32 beamWid mBufferManager.setZero(*dOutput.cumLogProbs); dOutput.beamHypotheses.reshape(batchSize, beamWidth, mMaxSequenceLength); } + dOutput.logProbsTiled->reshape(ITensor::makeShape({maxSequenceLength, batchSize, beamWidth})); + mBufferManager.setZero(*dOutput.logProbsTiled); mNbSteps = 0; TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); @@ -299,6 +303,7 @@ void StatefulGptDecoder::newBatch( { // manager.setZero(*dOutput.cumLogProbs); } + mBufferManager.setZero(*dOutput.logProbsTiled); // copy the request ids into dOutput.ids (with tiling) kernels::initOutputIds( @@ -355,12 +360,12 @@ void StatefulGptDecoder::forwardSync() TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); } -void StatefulGptDecoder::finalize(SamplingConfig const&) const +void StatefulGptDecoder::finalize(SamplingConfig const& samplingConfig) const { // TODO (rkobus) can we do this inplace? 
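StatefulGptDecoder now allocates and zero-fills a logProbsTiled buffer shaped [maxSequenceLength, batchSize, beamWidth], i.e. step-major, whereas the per-beam logProbs buffer is laid out as [batchSize, beamWidth, maxSeqLen] (as the gather-tree tests below shape it). A small standalone sketch of the flat-index arithmetic for the two layouts:

#include <cstddef>

// Flat offset into logProbsTiled, laid out as [maxSeqLen, batchSize, beamWidth].
std::size_t tiledIndex(std::size_t step, std::size_t batch, std::size_t beam,
    std::size_t batchSize, std::size_t beamWidth)
{
    return (step * batchSize + batch) * beamWidth + beam;
}

// Flat offset into logProbs, laid out as [batchSize, beamWidth, maxSeqLen].
std::size_t perBeamIndex(std::size_t batch, std::size_t beam, std::size_t step,
    std::size_t beamWidth, std::size_t maxSeqLen)
{
    return (batch * beamWidth + beam) * maxSeqLen + step;
}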
TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); auto& outputIds = mDecodingOutput->ids; - mDecoder->gatherTree(*mDecodingOutput, *mDecodingInput, mBufferManager); + kernels::gatherTree(*mDecodingOutput, *mDecodingInput, mBufferManager, samplingConfig); mBufferManager.copy(*(mDecodingOutput->gatheredIds), *outputIds); TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); return; diff --git a/cpp/tensorrt_llm/runtime/tllmBuffers.h b/cpp/tensorrt_llm/runtime/tllmBuffers.h index 67a55d3ba..ea6beb7b2 100644 --- a/cpp/tensorrt_llm/runtime/tllmBuffers.h +++ b/cpp/tensorrt_llm/runtime/tllmBuffers.h @@ -216,7 +216,7 @@ class BorrowingAllocator : public BaseAllocator, , mCapacity(capacity) { TLLM_CHECK_WITH_INFO(capacity == 0 || static_cast(mPtr), "Undefined pointer"); - TLLM_CHECK_WITH_INFO(mCapacity >= 0, "Capacity must be non-negative"); + TLLM_CHECK_WITH_INFO(mCapacity >= std::size_t(0), "Capacity must be non-negative"); } protected: diff --git a/cpp/tensorrt_llm/runtime/tllmRuntime.cpp b/cpp/tensorrt_llm/runtime/tllmRuntime.cpp index 409088820..3cb9b05b6 100644 --- a/cpp/tensorrt_llm/runtime/tllmRuntime.cpp +++ b/cpp/tensorrt_llm/runtime/tllmRuntime.cpp @@ -22,7 +22,10 @@ #include "tensorrt_llm/executor/tensor.h" #include "tllmLogger.h" +#include +#include #include +#include #include using namespace tensorrt_llm::runtime; @@ -140,6 +143,24 @@ TllmRuntime::TllmRuntime( // Print context memory size for CI/CD to track. TLLM_LOG_INFO("[MemUsageChange] Allocated %.2f MiB for execution context memory.", static_cast(devMemorySize) / 1048576.0); + + cacheTensorNames(); +} + +void TllmRuntime::cacheTensorNames() +{ + for (std::int32_t i = 0; i < mEngine->getNbIOTensors(); ++i) + { + auto const* const name = mEngine->getIOTensorName(i); + if (mEngine->getTensorIOMode(name) == nvinfer1::TensorIOMode::kINPUT) + { + mInputTensorNames.emplace_back(name); + } + else if (mEngine->getTensorIOMode(name) == nvinfer1::TensorIOMode::kOUTPUT) + { + mOutputTensorNames.emplace_back(name); + } + } } nvinfer1::IExecutionContext& TllmRuntime::addContext(std::int32_t profileIndex) @@ -187,68 +208,97 @@ bool TllmRuntime::executeContext(SizeType32 contextIndex) const return res; } -void TllmRuntime::setInputTensors(SizeType32 contextIndex, TensorMap const& tensorMap) +void TllmRuntime::setInputTensorsImpl(SizeType32 contextIndex, TensorMap const& tensorMap, bool throwOnMiss) { - TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); NVTX3_FUNC_RANGE(); auto& context = getContext(contextIndex); - for (std::int32_t i = 0; i < mEngine->getNbIOTensors(); ++i) + for (auto const& name : mInputTensorNames) { - char const* name = mEngine->getIOTensorName(i); - if (mEngine->getTensorIOMode(name) == nvinfer1::TensorIOMode::kINPUT) + auto const pos = tensorMap.find(name); + if (pos == tensorMap.end()) { - auto pos = tensorMap.find(name); - auto posWeight = mManagedWeightsMap.find(name); - if (pos == tensorMap.end() && posWeight == mManagedWeightsMap.end()) + if (throwOnMiss) { - auto expectedShape = mEngine->getTensorShape(name); - TLLM_THROW( - "Input tensor '%s' not found; expected shape: %s", name, ITensor::toString(expectedShape).c_str()); + auto expectedShape = mEngine->getTensorShape(name.c_str()); + TLLM_THROW("Input tensor '%s' not found; expected shape: %s", name.c_str(), + ITensor::toString(expectedShape).c_str()); } - if (posWeight != mManagedWeightsMap.end() && mSetWeights.count(contextIndex) > 0) + else { - continue; // This input tensor is a managed weight, and we have already set it in a previous call. 
+ continue; } + } - auto const& tensor = pos == tensorMap.end() ? posWeight->second : pos->second; - auto const tensorDtype = tensor->getDataType(); - auto const engineDtype = mEngine->getTensorDataType(name); - // WAR: TRT does not support mixed FP8 and FP16 input, so engine expects FP16 tensors. - TLLM_CHECK_WITH_INFO(tensorDtype == engineDtype - || (tensorDtype == nvinfer1::DataType::kFP8 && engineDtype == nvinfer1::DataType::kHALF), - "%s: expected type %d, provided type %d", name, static_cast(engineDtype), - static_cast(tensorDtype)); - - auto const tensorShape = tensor->getShape(); - auto const setInputShapeSuccess = context.setInputShape(name, tensorShape); - if (!setInputShapeSuccess) - { - auto const minShape = mEngine->getProfileShape(name, contextIndex, nvinfer1::OptProfileSelector::kMIN); - auto const maxShape = mEngine->getProfileShape(name, contextIndex, nvinfer1::OptProfileSelector::kMAX); - - TLLM_THROW("Tensor '%s' has invalid shape %s, expected in range min %s, max %s", name, - ITensor::toString(tensorShape).c_str(), ITensor::toString(minShape).c_str(), - ITensor::toString(maxShape).c_str()); - } - auto* const data = tensor->data(); - if (data) - { - context.setInputTensorAddress(name, data); - } - else + auto const& tensor = pos->second; + auto const tensorDtype = tensor->getDataType(); + auto const engineDtype = mEngine->getTensorDataType(name.c_str()); + // WAR: TRT does not support mixed FP8 and FP16 input, so engine expects FP16 tensors. + TLLM_CHECK_WITH_INFO(tensorDtype == engineDtype + || (tensorDtype == nvinfer1::DataType::kFP8 && engineDtype == nvinfer1::DataType::kHALF), + "%s: expected type %d, provided type %d", name.c_str(), static_cast(engineDtype), + static_cast(tensorDtype)); + + auto const tensorShape = tensor->getShape(); + auto const setInputShapeSuccess = context.setInputShape(name.c_str(), tensorShape); + if (!setInputShapeSuccess) + { + auto const minShape + = mEngine->getProfileShape(name.c_str(), contextIndex, nvinfer1::OptProfileSelector::kMIN); + auto const maxShape + = mEngine->getProfileShape(name.c_str(), contextIndex, nvinfer1::OptProfileSelector::kMAX); + + TLLM_THROW("Tensor '%s' has invalid shape %s, expected in range min %s, max %s", name.c_str(), + ITensor::toString(tensorShape).c_str(), ITensor::toString(minShape).c_str(), + ITensor::toString(maxShape).c_str()); + } + auto* const data = tensor->data(); + if (data) + { + context.setInputTensorAddress(name.c_str(), data); + } + else + { + TLLM_CHECK_WITH_INFO(tensor->getSize() == 0, std::string("Invalid data for tensor: ") + name.c_str()); + // TensorRT runtime does not support nullptr. + if (!mDummyTensor) { - TLLM_CHECK_WITH_INFO(tensor->getSize() == 0, std::string("Invalid data for tensor: ") + name); - // TensorRT runtime does not support nullptr. 
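When context.setInputShape rejects a shape, the code above reports the optimization profile's kMIN/kMAX bounds so the error is actionable. The real path relies on setInputShape's return value; purely as an illustration, the same per-dimension range check can be expressed directly on nvinfer1::Dims values:

#include <NvInfer.h>

// Returns true if every dimension of `shape` lies within [minShape, maxShape].
// This mirrors the diagnostic printed when TensorRT rejects an input shape.
bool shapeWithinProfile(nvinfer1::Dims const& shape, nvinfer1::Dims const& minShape, nvinfer1::Dims const& maxShape)
{
    if (shape.nbDims != minShape.nbDims || shape.nbDims != maxShape.nbDims)
    {
        return false;
    }
    for (int i = 0; i < shape.nbDims; ++i)
    {
        if (shape.d[i] < minShape.d[i] || shape.d[i] > maxShape.d[i])
        {
            return false;
        }
    }
    return true;
}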
- if (!mDummyTensor) - { - mDummyTensor = mBufferManager.gpu(ITensor::makeShape({1})); - } - context.setInputTensorAddress(name, mDummyTensor->data()); + mDummyTensor = mBufferManager.gpu(ITensor::makeShape({1})); } + context.setInputTensorAddress(name.c_str(), mDummyTensor->data()); } } +} + +void TllmRuntime::setStaticInputTensors(TensorMap const& tensorMap) +{ + TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); + NVTX3_FUNC_RANGE(); + + TLLM_CHECK_WITH_INFO(getNbContexts() > 0, "Contexts should be created before calling setStaticInputTensors"); + for (auto contextIndex = 0; contextIndex < getNbContexts(); ++contextIndex) + { + setInputTensorsImpl(contextIndex, tensorMap, false); + } - mSetWeights.insert(contextIndex); + // move static input tensor names to separate vector + auto const begin = mInputTensorNames.begin(); + auto end = mInputTensorNames.end(); + for (auto const& [name, tensor] : tensorMap) + { + end = std::remove(begin, end, name); + } + mInputTensorNames.erase(end, mInputTensorNames.end()); + + TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); +} + +void TllmRuntime::setInputTensors(SizeType32 contextIndex, TensorMap const& tensorMap) +{ + TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); + NVTX3_FUNC_RANGE(); + setInputTensorsImpl(contextIndex, tensorMap, true); + + auto& context = getContext(contextIndex); if (mUseShapeInference) { NVTX3_SCOPED_RANGE(infer_shapes); @@ -277,41 +327,37 @@ void TllmRuntime::setOutputTensors(SizeType32 contextIndex, TensorMap& tensorMap TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); NVTX3_FUNC_RANGE(); auto& context = getContext(contextIndex); - for (std::int32_t i = 0; i < mEngine->getNbIOTensors(); ++i) + for (auto const& name : mOutputTensorNames) { - auto const name = mEngine->getIOTensorName(i); - if (mEngine->getTensorIOMode(name) == nvinfer1::TensorIOMode::kOUTPUT) + auto const engineDtype = mEngine->getTensorDataType(name.c_str()); + auto const pos = tensorMap.find(name); + if (pos != tensorMap.end()) { - auto const engineDtype = mEngine->getTensorDataType(name); - auto pos = tensorMap.find(name); - if (pos != tensorMap.end()) - { - auto const& tensor = pos->second; - auto const tensorDtype = tensor->getDataType(); - // WAR: TRT does not support mixed FP8 and FP16 input, so engine expects FP16 tensors. - TLLM_CHECK_WITH_INFO(tensorDtype == engineDtype - || (tensorDtype == nvinfer1::DataType::kFP8 && engineDtype == nvinfer1::DataType::kHALF), - "%s: expected type %d, provided type %d", name, static_cast(engineDtype), - static_cast(tensorDtype)); - - if (mUseShapeInference) - { - auto const dims = context.getTensorShape(name); - tensor->reshape(dims); - } - context.setTensorAddress(name, tensor->data()); - } - else if (mUseShapeInference) - { - auto const dims = context.getTensorShape(name); - auto tensor = ITensor::SharedPtr(mBufferManager.gpu(dims, engineDtype)); - tensorMap.insert(pos, std::make_pair(name, tensor)); - context.setTensorAddress(name, tensor->data()); - } - else + auto const& tensor = pos->second; + auto const tensorDtype = tensor->getDataType(); + // WAR: TRT does not support mixed FP8 and FP16 input, so engine expects FP16 tensors. 
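setStaticInputTensors binds the static tensors on every context and then removes their names from mInputTensorNames with the erase-remove idiom, so later setInputTensors calls skip them. A compact sketch of that pruning step over a plain name list and map, standard library only:

#include <algorithm>
#include <map>
#include <string>
#include <vector>

// Removes every key of `staticTensors` from `inputNames`, keeping the relative
// order of the remaining (per-step) tensor names.
template <typename TensorT>
void pruneStaticNames(std::vector<std::string>& inputNames, std::map<std::string, TensorT> const& staticTensors)
{
    auto end = inputNames.end();
    for (auto const& entry : staticTensors)
    {
        end = std::remove(inputNames.begin(), end, entry.first);
    }
    inputNames.erase(end, inputNames.end());
}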
+ TLLM_CHECK_WITH_INFO(tensorDtype == engineDtype + || (tensorDtype == nvinfer1::DataType::kFP8 && engineDtype == nvinfer1::DataType::kHALF), + "%s: expected type %d, provided type %d", name.c_str(), static_cast(engineDtype), + static_cast(tensorDtype)); + + if (mUseShapeInference) { - TLLM_THROW("Tensor %s is not found in tensorMap and shape inference is not allowed", name); + auto const dims = context.getTensorShape(name.c_str()); + tensor->reshape(dims); } + context.setTensorAddress(name.c_str(), tensor->data()); + } + else if (mUseShapeInference) + { + auto const dims = context.getTensorShape(name.c_str()); + auto tensor = ITensor::SharedPtr(mBufferManager.gpu(dims, engineDtype)); + tensorMap.insert(pos, std::make_pair(name, tensor)); + context.setTensorAddress(name.c_str(), tensor->data()); + } + else + { + TLLM_THROW("Tensor %s is not found in tensorMap and shape inference is not allowed", name.c_str()); } } TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); @@ -329,7 +375,7 @@ bool TllmRuntime::hasLayerProfiler(SizeType32 contextId) const void TllmRuntime::setLayerProfiler() { - mLayerProfiler.reset(new LayerProfiler); + mLayerProfiler = std::make_unique(); for (auto& context : mContexts) { context->setProfiler(mLayerProfiler.get()); @@ -350,6 +396,8 @@ void TllmRuntime::reportToProfiler(SizeType32 contextId) void TllmRuntime::loadManagedWeights(RawEngine const& rawEngine, int localRank) { + TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); + NVTX3_FUNC_RANGE(); auto& engine = getEngine(); auto& manager = getBufferManager(); if (rawEngine.getManagedWeightsMapOpt().has_value()) @@ -360,9 +408,7 @@ void TllmRuntime::loadManagedWeights(RawEngine const& rawEngine, int localRank) { TLLM_LOG_DEBUG("Loading managed weight: %s", name.c_str()); auto iTensor = tensorrt_llm::executor::detail::toITensor(weight); - auto weightsDevice = std::shared_ptr{ - manager.allocate(MemoryType::kGPU, iTensor->getShape(), iTensor->getDataType())}; - manager.copy(iTensor->data(), *weightsDevice, MemoryType::kCPU); + auto weightsDevice = std::shared_ptr{manager.copyFrom(*iTensor, MemoryType::kGPU)}; mManagedWeightsMap.insert(std::make_pair(name, weightsDevice)); } } @@ -385,4 +431,6 @@ void TllmRuntime::loadManagedWeights(RawEngine const& rawEngine, int localRank) mManagedWeightsMap.insert(std::make_pair(name, weightsDevice)); } } + setStaticInputTensors(mManagedWeightsMap); + TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); } diff --git a/cpp/tensorrt_llm/runtime/tllmRuntime.h b/cpp/tensorrt_llm/runtime/tllmRuntime.h index 80cea9c5f..06d7815cd 100644 --- a/cpp/tensorrt_llm/runtime/tllmRuntime.h +++ b/cpp/tensorrt_llm/runtime/tllmRuntime.h @@ -73,8 +73,20 @@ class TllmRuntime void clearContexts(); + /// @brief Set input tensors from tensorMap for all contexts. + /// @details The function can be used to set static input tensors for all iterations. If a tensor was set this way, + /// it doesn't need to included in calls to setInputTensors anymore. + void setStaticInputTensors(TensorMap const& tensorMap); + + /// @brief Set input tensors from tensorMap for context at contextIndex. + /// @details The function expects that all input tensors (excluding the ones set by setStaticInputTensors) are + /// contained in the tensorMap. If a tensor is missing, has a bad shape or type, it will throw. void setInputTensors(SizeType32 contextIndex, TensorMap const& tensorMap); + /// @brief Set output tensors from tensorMap for context at contextIndex. 
+ /// @details The function expects that all output tensors are contained in the tensorMap. If a tensor is missing and + /// shape inference is enabled, it will allocate the tensor on GPU and insert it into the tensorMap. Otherwise it + /// will throw. void setOutputTensors(SizeType32 contextIndex, TensorMap& tensorMap); bool executeContext(SizeType32 contextIndex) const; @@ -123,6 +135,10 @@ class TllmRuntime void loadManagedWeights(RawEngine const& rawEngine, int localRank); private: + void cacheTensorNames(); + + void setInputTensorsImpl(SizeType32 contextIndex, TensorMap const& tensorMap, bool throwOnMiss); + BufferManager::CudaStreamPtr mStream; BufferManager mBufferManager; std::unique_ptr mRuntime; @@ -133,7 +149,10 @@ class TllmRuntime std::unique_ptr mEngineInspector; std::unique_ptr mLayerProfiler; bool mUseShapeInference; - TensorMap mManagedWeightsMap{}; - std::set mSetWeights; + TensorMap mManagedWeightsMap; + // List of input tensor names. Names of static tensors are removed from this list when setStaticInputTensors is + // called. + std::vector mInputTensorNames; + std::vector mOutputTensorNames; }; } // namespace tensorrt_llm::runtime diff --git a/cpp/tensorrt_llm/runtime/transformerBuffers.cpp b/cpp/tensorrt_llm/runtime/transformerBuffers.cpp index f8a78f091..fead9addf 100644 --- a/cpp/tensorrt_llm/runtime/transformerBuffers.cpp +++ b/cpp/tensorrt_llm/runtime/transformerBuffers.cpp @@ -15,12 +15,15 @@ */ #include "tensorrt_llm/runtime/transformerBuffers.h" +#include "iTensor.h" #include "tensorrt_llm/batch_manager/kvCacheManager.h" +#include "tensorrt_llm/common/logger.h" #include "tensorrt_llm/common/stlUtils.h" #include "tensorrt_llm/runtime/runtimeBuffers.h" #include "tensorrt_llm/runtime/runtimeKernels.h" #include "tensorrt_llm/runtime/utils/sessionUtils.h" #include // std::getenv +#include using namespace tensorrt_llm::runtime; namespace tc = tensorrt_llm::common; @@ -34,6 +37,7 @@ TransformerBuffers::TransformerBuffers() presentKeysVals.clear(); presentKeysValsAlt.clear(); kvCacheBlockPoolPointers = nullptr; + kvCacheBlockPoolMapping = nullptr; kvCacheBlockOffsetsHost = nullptr; kvCacheBlockOffsetsDevice = nullptr; } @@ -101,15 +105,16 @@ void TransformerBuffers::reshape( auto const maxAttentionWindow = generationConfig.maxAttentionWindow; auto const kvCacheReserve = ITensor::makeShape( - {batchSize, 2, modelConfig.getNbKvHeads(), maxAttentionWindow, modelConfig.getSizePerHead()}); + {batchSize, 2, modelConfig.getNbKvHeads(0), maxAttentionWindow, modelConfig.getSizePerHead()}); auto const kvCacheShape - = ITensor::makeShape({batchSize, 2, modelConfig.getNbKvHeads(), maxInputLength, modelConfig.getSizePerHead()}); + = ITensor::makeShape({batchSize, 2, modelConfig.getNbKvHeads(0), maxInputLength, modelConfig.getSizePerHead()}); + if (modelConfig.isPagedKVCache()) { auto cacheBlockOffsetsShape = kvCacheBlockOffsetsHost->getShape(); if (cacheBlockOffsetsShape.nbDims > 0) { - cacheBlockOffsetsShape.d[0] = batchSize; + cacheBlockOffsetsShape.d[1] = batchSize; kvCacheBlockOffsetsHost->reshape(cacheBlockOffsetsShape); kvCacheBlockOffsetsDevice->reshape(cacheBlockOffsetsShape); } @@ -123,7 +128,8 @@ void TransformerBuffers::reshape( utils::reshapeBufferVector(presentKeysVals, kvCacheReserve); } - auto const localNbLayers = modelConfig.getNbAttentionLayers(worldConfig.getPipelineParallelism()); + auto const localNbLayers + = modelConfig.getNbAttentionLayers(worldConfig.getPipelineParallelism(), worldConfig.getPipelineParallelRank()); if (modelConfig.useGptAttentionPlugin()) { 
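The batch dimension of the block-offsets tensors moves from d[0] to d[1] because a leading pools dimension is added, giving [numPools, batchSize * beamWidth, 2, maxBlocksPerSeq] (see reshapeKvTensors and the header comments below). A standalone sketch of the flat-index arithmetic for that layout, where the dimension of size 2 is assumed to be the K/V split of the real buffers:

#include <cstddef>

// Flat offset into a [numPools, batchSize * beamWidth, 2, maxBlocksPerSeq]
// block-offsets tensor. `kvHalf` indexes the fixed dimension of size 2.
std::size_t blockOffsetIndex(std::size_t pool, std::size_t seq, std::size_t kvHalf, std::size_t block,
    std::size_t batchBeam, std::size_t maxBlocksPerSeq)
{
    return ((pool * batchBeam + seq) * 2 + kvHalf) * maxBlocksPerSeq + block;
}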
@@ -147,7 +153,7 @@ void TransformerBuffers::reshapeKvTensors( { auto const& manager = runtime.getBufferManager(); - auto const cacheBlockOffsetsShape = ITensor::makeShape({maxBatchSize * maxBeamWidth, 2, maxBlocksPerSeq}); + auto const cacheBlockOffsetsShape = ITensor::makeShape({1, maxBatchSize * maxBeamWidth, 2, maxBlocksPerSeq}); kvCacheBlockOffsetsHost->reshape(cacheBlockOffsetsShape); manager.setZero(*kvCacheBlockOffsetsHost); @@ -161,6 +167,11 @@ void TransformerBuffers::setKvPoolPointers(KvCacheManager const* kvCacheManager) kvCacheBlockPoolPointers = kvCacheManager->getBlockPoolPointers(); } +void TransformerBuffers::setKvPoolMapping(KvCacheManager const* kvCacheManager) +{ + kvCacheBlockPoolMapping = kvCacheManager->getLayerToPoolMapping(); +} + TransformerBuffers TransformerBuffers::sliceTo( GenerationConfig const& generationConfig, ModelConfig const& modelConfig, SizeType32 offset, SizeType32 batchSize) { @@ -169,8 +180,15 @@ TransformerBuffers TransformerBuffers::sliceTo( auto const generationBatchSize = generationConfig.batchSize; if (modelConfig.isPagedKVCache()) { + auto const& realCacheBlockOffsetsShape = kvCacheBlockOffsetsHost->getShape(); - auto const maxBlocksPerSeq = realCacheBlockOffsetsShape.d[2]; + auto const numPools = realCacheBlockOffsetsShape.d[0]; + // (oargov) with multiple pools, slicing the tensor along the batch*beam dimension would require us to support + // non-contiguous tensors. with a single pool, we can just ignore the pools dimension when slicing and restore + // it later. this is part of the deprecated GPTSession API, so not supporting VGQA here should be ok. + TLLM_CHECK_WITH_INFO(numPools == 1, + "Deprecated transformerBuffers API does not support multiple cache pools, use the newer API instead"); + auto const maxBlocksPerSeq = realCacheBlockOffsetsShape.d[3]; // enable slicing by moving generationBatchSize to first dim auto const fakeCacheBlockOffsetsShape = ITensor::makeShape({generationBatchSize, 2, maxBlocksPerSeq}); @@ -178,13 +196,14 @@ TransformerBuffers TransformerBuffers::sliceTo( TensorPtr kvCacheBlockOffsetsDeviceView{ITensor::view(kvCacheBlockOffsetsDevice, fakeCacheBlockOffsetsShape)}; // slice and reshape to correct shape - auto const cacheBlockOffsetsShape = ITensor::makeShape({batchSize, 2, maxBlocksPerSeq}); + auto const cacheBlockOffsetsShape = ITensor::makeShape({numPools, batchSize, 2, maxBlocksPerSeq}); buffers.kvCacheBlockOffsetsHost = ITensor::slice(kvCacheBlockOffsetsHostView, offset, batchSize); buffers.kvCacheBlockOffsetsHost->reshape(cacheBlockOffsetsShape); buffers.kvCacheBlockOffsetsDevice = ITensor::slice(kvCacheBlockOffsetsDeviceView, offset, batchSize); buffers.kvCacheBlockOffsetsDevice->reshape(cacheBlockOffsetsShape); buffers.kvCacheBlockPoolPointers = kvCacheBlockPoolPointers; + buffers.kvCacheBlockPoolMapping = kvCacheBlockPoolMapping; } else { @@ -529,7 +548,7 @@ void TransformerBuffers::postContextStep(RuntimeBuffers* runtimeBuffers, if (modelConfig.useGptAttentionPlugin() && modelConfig.isPagedKVCache()) { auto cacheBlockOffsetsShape = kvCacheBlockOffsetsHost->getShape(); - cacheBlockOffsetsShape.d[0] = batchSize * beamWidth; + cacheBlockOffsetsShape.d[1] = batchSize * beamWidth; kvCacheBlockOffsetsHost->reshape(cacheBlockOffsetsShape); kvCacheBlockOffsetsDevice->reshape(cacheBlockOffsetsShape); } @@ -720,6 +739,7 @@ void TransformerBuffers::getRuntimeBuffers(RuntimeBuffers const* runtimeBuffers, inputBuffers.insert_or_assign("kv_cache_block_offsets", kvCacheBlockOffsetsDevice); 
inputBuffers.insert_or_assign("host_kv_cache_block_offsets", kvCacheBlockOffsetsHost); inputBuffers.insert_or_assign("host_kv_cache_pool_pointers", kvCacheBlockPoolPointers); + inputBuffers.insert_or_assign("host_kv_cache_pool_mapping", kvCacheBlockPoolMapping); } else { diff --git a/cpp/tensorrt_llm/runtime/transformerBuffers.h b/cpp/tensorrt_llm/runtime/transformerBuffers.h index 5e4a6a847..4692e9b0e 100644 --- a/cpp/tensorrt_llm/runtime/transformerBuffers.h +++ b/cpp/tensorrt_llm/runtime/transformerBuffers.h @@ -53,6 +53,7 @@ class TransformerBuffers runtime::TllmRuntime const& runtime); void setKvPoolPointers(KvCacheManager const* kvCacheManager); + void setKvPoolMapping(KvCacheManager const* kvCacheManager); void reset(BufferManager& manager){}; @@ -92,9 +93,10 @@ class TransformerBuffers TensorPtr maxAttentionWindows; // with attention plugin, host tensor TensorPtr sinkTokenLengths; // with attention plugin, host tensor TensorPtr kvCacheBlockPoolPointers; - TensorPtr kvCacheBlockOffsetsHost; // [batchSize * beamWidth, 2, maxBlocksPerSeq * 2] - TensorPtr kvCacheBlockOffsetsDevice; // [batchSize * beamWidth, 2, maxBlocksPerSeq * 2] - TensorPtr runtimePerfKnobsHost; // can hold max 16 perf knobs + TensorPtr kvCacheBlockPoolMapping; + TensorPtr kvCacheBlockOffsetsHost; // [numPools, batchSize * beamWidth, 2, maxBlocksPerSeq * 2] + TensorPtr kvCacheBlockOffsetsDevice; // [numPools, batchSize * beamWidth, 2, maxBlocksPerSeq * 2] + TensorPtr runtimePerfKnobsHost; // can hold max 16 perf knobs }; } // namespace tensorrt_llm::runtime diff --git a/cpp/tensorrt_llm/runtime/utils/sessionUtils.cpp b/cpp/tensorrt_llm/runtime/utils/sessionUtils.cpp index a15cc1f0d..f324cf5f9 100644 --- a/cpp/tensorrt_llm/runtime/utils/sessionUtils.cpp +++ b/cpp/tensorrt_llm/runtime/utils/sessionUtils.cpp @@ -22,6 +22,7 @@ #include #include +#include using namespace tensorrt_llm::runtime; namespace tc = tensorrt_llm::common; @@ -89,6 +90,16 @@ void reshapeBufferVector(std::vector& vector, nvinfer1::Dims } } +void assertNoVGQA(ModelConfig const& modelConfig, WorldConfig const& worldConfig) +{ + auto [numKvHeadsPerLayerBegin, numKvHeadsPerLayerEnd] = modelConfig.getNumKvHeadsPerLayerLocalRange( + worldConfig.getPipelineParallelism(), worldConfig.getPipelineParallelRank()); + TLLM_CHECK_WITH_INFO(std::all_of(numKvHeadsPerLayerBegin, numKvHeadsPerLayerEnd, + [firstNumKvHeads = *numKvHeadsPerLayerBegin](SizeType32 numKvHeads) + { return numKvHeads == firstNumKvHeads; }), + "Deprecated session API does not support multiple cache pools, use the newer executor API instead"); +} + std::vector sliceBufferVector( std::vector const& vector, SizeType32 const offset, SizeType32 const size) { diff --git a/cpp/tensorrt_llm/runtime/utils/sessionUtils.h b/cpp/tensorrt_llm/runtime/utils/sessionUtils.h index 5fdd94f3e..4627cb369 100644 --- a/cpp/tensorrt_llm/runtime/utils/sessionUtils.h +++ b/cpp/tensorrt_llm/runtime/utils/sessionUtils.h @@ -56,6 +56,8 @@ std::vector createBufferVector( void reshapeBufferVector(std::vector& vector, nvinfer1::Dims const& shape); +void assertNoVGQA(ModelConfig const& modelConfig, WorldConfig const& worldConfig); + std::vector sliceBufferVector( std::vector const& vector, SizeType32 offset, SizeType32 size); diff --git a/cpp/tests/kernels/allReduce/allReduceKernelTest.cu b/cpp/tests/kernels/allReduce/allReduceKernelTest.cu index a0f9233df..b6fd6f3b9 100644 --- a/cpp/tests/kernels/allReduce/allReduceKernelTest.cu +++ b/cpp/tests/kernels/allReduce/allReduceKernelTest.cu @@ -51,6 +51,16 @@ void 
simple_assert(bool flag) } } +void check_last_cuda_error() +{ + auto err = cudaGetLastError(); + if (err != cudaSuccess) + { + printf("CUDA error: %s\n", cudaGetErrorString(err)); + exit(-1); + } +} + struct CudaBuffer { void* _data; @@ -85,8 +95,22 @@ struct CudaBuffer }; template -float compare(int rank, void* _pa, void* _pb, int size, float scale) +float compare( + int rank, void* _pa, void* _pb, int size, float scale, bool print_error = false, std::string const& cmp_info = "") { + if (print_error && rank == 0) + { + if (!cmp_info.empty()) + { + printf("compare %s\n", cmp_info.c_str()); + } + else + { + static int cnt = 0; + printf("unnamed compare %d\n", cnt++); + } + } + auto pa = reinterpret_cast(_pa); auto pb = reinterpret_cast(_pb); float max_diff = 0.f, tot_diff = 0.f; @@ -101,6 +125,10 @@ float compare(int rank, void* _pa, void* _pb, int size, float scale) float diff = std::abs(va - vb); if (diff > threshold) { + if (rank == 0 && print_error) + { + printf("err idx %d, value %f vs %f\n", n, va, vb); + } max_diff = std::max(max_diff, diff); tot_diff += diff; ++diff_cnt; @@ -130,7 +158,7 @@ float compare(int rank, void* _pa, void* _pb, int size, float scale) template void random_fill(std::vector& vec, T2 minv, T2 maxv) { - std::mt19937 gen(20240410); + std::mt19937 gen(20240725); std::uniform_real_distribution dis(static_cast(minv), static_cast(maxv)); for (auto& v : vec) { @@ -164,8 +192,64 @@ std::string ar_info(AllReduceStrategyType runtime_strategy, AllReduceStrategyCon return info; } -bool test(int token_num, int hidden_size, bool has_bias, bool has_affine, int warmup, int iter, - AllReduceStrategyType runtime_strategy = AllReduceStrategyType::ONESHOT, +struct SetDevice +{ + SetDevice(int device_id) + { + TLLM_CUDA_CHECK(cudaSetDevice(device_id)); + } +}; + +class Workspace +{ +public: + Workspace(int world_size, int rank, int max_token_num, int max_hidden_size) + : world_config(world_size, 1, rank, world_size) + , set_device(world_config.getDevice()) + , p_s(std::make_shared()) + , buf_mgr(p_s) + , buffers(1, 1, max_token_num, max_hidden_size, buf_mgr, world_config) + { + } + + void set_params(AllReduceParams& params) const + { + int world_size = world_config.getSize(); + for (int i = 0; i < world_size; ++i) + { + params.peer_comm_buffer_ptrs[i] = buffers.mIpcMemoryHandles[0].getCommPtrs()[i]; + params.fusion_params.lamport_peer_comm_buffer_ptrs[i] = buffers.mIpcMemoryHandles[4].getCommPtrs()[i]; + params.fusion_params.lamport_peer_comm_buffer_ptrs[i + MAX_RANKS_PER_NODE] + = buffers.mIpcMemoryHandles[5].getCommPtrs()[i]; + params.fusion_params.lamport_peer_comm_buffer_ptrs[i + MAX_RANKS_PER_NODE * 2] + = buffers.mIpcMemoryHandles[6].getCommPtrs()[i]; + } + for (int i = 0; i < world_size; ++i) + { + params.peer_barrier_ptrs_in[i] = reinterpret_cast(buffers.mIpcMemoryHandles[2].getCommPtrs()[i]); + } + for (int i = 0; i < world_size; ++i) + { + params.peer_barrier_ptrs_out[i] + = reinterpret_cast(buffers.mIpcMemoryHandles[3].getCommPtrs()[i]); + } + } + + cudaStream_t get_stream() const + { + return p_s->get(); + } + +protected: + tr::WorldConfig world_config; + SetDevice set_device; + std::shared_ptr p_s; + tr::BufferManager buf_mgr; + tr::AllReduceBuffers buffers; +}; + +bool test(Workspace const& workspace, int token_num, int hidden_size, bool has_bias, bool has_affine, int warmup, + int iter, AllReduceStrategyType runtime_strategy = AllReduceStrategyType::ONESHOT, AllReduceStrategyConfig config = AllReduceStrategyConfig(0), AllReduceFusionOp fusion_op = 
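The test's compare helper accumulates the maximum and total absolute difference over elements whose error exceeds a scaled threshold, and optionally prints the offending indices on rank 0. A simplified host-only version of that pattern, without the MPI rank handling or half-precision casts:

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

// Compares two float buffers against an absolute threshold; reports the max and
// mean difference of the elements that exceed it and returns the max difference.
float compareBuffers(std::vector<float> const& a, std::vector<float> const& b, float threshold, bool printErrors)
{
    float maxDiff = 0.f;
    float totDiff = 0.f;
    int diffCnt = 0;
    std::size_t const n = std::min(a.size(), b.size());
    for (std::size_t i = 0; i < n; ++i)
    {
        float const diff = std::abs(a[i] - b[i]);
        if (diff > threshold)
        {
            if (printErrors && diffCnt < 10)
            {
                std::printf("err idx %zu, value %f vs %f\n", i, a[i], b[i]);
            }
            maxDiff = std::max(maxDiff, diff);
            totDiff += diff;
            ++diffCnt;
        }
    }
    if (diffCnt > 0)
    {
        std::printf("max diff %f, mean diff %f over %d mismatches\n", maxDiff, totDiff / diffCnt, diffCnt);
    }
    return maxDiff;
}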
AllReduceFusionOp::NONE) { std::srand(20240603); @@ -183,9 +267,13 @@ bool test(int token_num, int hidden_size, bool has_bias, bool has_affine, int wa random_fill(residual_buffer, -1, 1); random_fill(weight_buffer, -1, 1); random_fill(bias_buffer, -1, 1); + random_fill(inter_buffer, 0, 0); + random_fill(output_buffer, 0, 0); residual.copy_from(residual_buffer.data()); weight.copy_from(weight_buffer.data()); bias.copy_from(bias_buffer.data()); + inter.copy_from(inter_buffer.data()); + out.copy_from(output_buffer.data()); auto& comm = mpi::MpiComm::world(); auto world_size = comm.getSize(); auto rank = comm.getRank(); @@ -195,40 +283,25 @@ bool test(int token_num, int hidden_size, bool has_bias, bool has_affine, int wa if (fusion_op == AllReduceFusionOp::RESIDUAL_RMS_NORM) { printf( - "Custom All Reduce with Residual Add and RMS Norm, %s, message size %d(token num %d, hidden size %d), " + "Custom All Reduce with Residual Add and RMS Norm, %s, message size %6d(token num %6d, hidden size " + "%6d), " "has bias %d, has affine %d\n", info.c_str(), message_size, token_num, hidden_size, static_cast(has_bias), static_cast(has_affine)); } else { - printf("Custom All Reduce, %s, message size %d(token num %d, hidden size %d), has bias %d, has affine %d\n", + printf( + "Custom All Reduce, %s, message size %d(token num %d, hidden size %6d), has bias %6d, has affine %6d\n", info.c_str(), message_size, token_num, hidden_size, static_cast(has_bias), static_cast(has_affine)); } } - random_fill(input_buffer, -1 / world_size, 1 / world_size); + random_fill(input_buffer, -1, 1); in.copy_from(input_buffer.data()); - cudaSetDevice(rank); - - tr::WorldConfig world_config(world_size, 1, rank, world_size); - auto p_s = std::make_shared(); - tr::BufferManager buf_mgr(p_s); - tr::AllReduceBuffers buffers(1, 1, token_num, hidden_size, buf_mgr, world_config); AllReduceParams params; - for (int i = 0; i < world_size; ++i) - { - params.peer_comm_buffer_ptrs[i] = buffers.mIpcMemoryHandles[0].getCommPtrs()[i]; - } - for (int i = 0; i < world_size; ++i) - { - params.peer_barrier_ptrs_in[i] = reinterpret_cast(buffers.mIpcMemoryHandles[2].getCommPtrs()[i]); - } - for (int i = 0; i < world_size; ++i) - { - params.peer_barrier_ptrs_out[i] = reinterpret_cast(buffers.mIpcMemoryHandles[3].getCommPtrs()[i]); - } + workspace.set_params(params); params.barrier_flag = 0; params.ranks_per_node = world_size; params.local_rank = rank; @@ -242,11 +315,18 @@ bool test(int token_num, int hidden_size, bool has_bias, bool has_affine, int wa params.fusion_params.eps = eps; params.fusion_params.intermediate_buffer = inter.data(); - cudaStream_t s; - cudaStreamCreate(&s); + cudaStream_t s = workspace.get_stream(); cudaEvent_t begin, end; cudaEventCreate(&begin); cudaEventCreate(&end); + lamportInitialize( + params.fusion_params.lamport_peer_comm_buffer_ptrs[rank], message_size, nvinfer1::DataType::kHALF, s); + lamportInitialize(params.fusion_params.lamport_peer_comm_buffer_ptrs[rank + MAX_RANKS_PER_NODE], message_size, + nvinfer1::DataType::kHALF, s); + lamportInitialize(params.fusion_params.lamport_peer_comm_buffer_ptrs[rank + MAX_RANKS_PER_NODE * 2], message_size, + nvinfer1::DataType::kHALF, s); + cudaDeviceSynchronize(); + comm.barrier(); for (int i = 0; i < warmup; ++i) { params.barrier_flag += 1; @@ -307,7 +387,7 @@ bool test(int token_num, int hidden_size, bool has_bias, bool has_affine, int wa { printf("\033[31mFAILED\033[0m\n"); } - cudaStreamDestroy(s); + comm.barrier(); return pass; } @@ -315,6 +395,7 @@ TEST(Kernel, AllReduce) { 
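The benchmark loop above follows the usual warmup-then-time pattern with CUDA events recorded on the shared workspace stream (the real loop also bumps params.barrier_flag every iteration). A generic sketch of that timing harness using only the CUDA runtime API; `launchWork` is a placeholder for the all-reduce enqueue, not a function from the test:

#include <cuda_runtime.h>
#include <functional>

// Times launchWork(stream) over `iter` iterations after `warmup` untimed runs
// and returns the average per-iteration latency in milliseconds.
float timeKernel(std::function<void(cudaStream_t)> const& launchWork, cudaStream_t stream, int warmup, int iter)
{
    cudaEvent_t begin{};
    cudaEvent_t end{};
    cudaEventCreate(&begin);
    cudaEventCreate(&end);

    for (int i = 0; i < warmup; ++i)
    {
        launchWork(stream);
    }
    cudaEventRecord(begin, stream);
    for (int i = 0; i < iter; ++i)
    {
        launchWork(stream);
    }
    cudaEventRecord(end, stream);
    cudaEventSynchronize(end);

    float ms = 0.f;
    cudaEventElapsedTime(&ms, begin, end);
    cudaEventDestroy(begin);
    cudaEventDestroy(end);
    return ms / static_cast<float>(iter);
}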
auto& comm = mpi::MpiComm::world(); auto world_size = comm.getSize(); + auto rank = comm.getRank(); if (world_size % 2) return; @@ -331,6 +412,8 @@ TEST(Kernel, AllReduce) }; // clang-format on bool pass = true; + int max_token_num = 1000, max_hidden_size = 8192; + Workspace workspace(world_size, rank, max_token_num, max_hidden_size); for (auto config : configs) { for (auto op : ops) @@ -340,23 +423,23 @@ TEST(Kernel, AllReduce) for (auto has_affine : {false, true}) { pass = pass - && test( - 1, 4096, has_bias, has_affine, warmup, iter, AllReduceStrategyType::ONESHOT, config, op); + && test(workspace, 1, 4096, has_bias, has_affine, warmup, iter, AllReduceStrategyType::ONESHOT, + config, op); pass = pass - && test( - 1, 8192, has_bias, has_affine, warmup, iter, AllReduceStrategyType::ONESHOT, config, op); + && test(workspace, 1, 8192, has_bias, has_affine, warmup, iter, AllReduceStrategyType::ONESHOT, + config, op); pass = pass - && test( - 10, 4096, has_bias, has_affine, warmup, iter, AllReduceStrategyType::ONESHOT, config, op); + && test(workspace, 10, 4096, has_bias, has_affine, warmup, iter, AllReduceStrategyType::ONESHOT, + config, op); pass = pass - && test( - 10, 8192, has_bias, has_affine, warmup, iter, AllReduceStrategyType::ONESHOT, config, op); + && test(workspace, 10, 8192, has_bias, has_affine, warmup, iter, AllReduceStrategyType::ONESHOT, + config, op); pass = pass - && test( - 1000, 4096, has_bias, has_affine, warmup, iter, AllReduceStrategyType::TWOSHOT, config, op); + && test(workspace, 1000, 4096, has_bias, has_affine, warmup, iter, + AllReduceStrategyType::TWOSHOT, config, op); pass = pass - && test( - 1000, 8192, has_bias, has_affine, warmup, iter, AllReduceStrategyType::TWOSHOT, config, op); + && test(workspace, 1000, 8192, has_bias, has_affine, warmup, iter, + AllReduceStrategyType::TWOSHOT, config, op); } } } @@ -368,28 +451,22 @@ TEST(Kernel, AllReduceOneShot) { auto& comm = mpi::MpiComm::world(); auto world_size = comm.getSize(); + auto rank = comm.getRank(); if (world_size % 2) return; int warmup = 100, iter = 100; - std::vector candidate_bs{1, 2, 4, 8, 16, 32, 64, 128}; - std::vector candidate_hidden{4096, 8192, 12288, 16384}; + std::vector candidate_bs{1, 2, 4, 8, 16}; + std::vector candidate_hidden{1024, 2048, 4096, 8192}; bool pass = true; + int max_token_num = 16, max_hidden_size = 8192; + Workspace workspace(world_size, rank, max_token_num, max_hidden_size); for (auto bs : candidate_bs) { for (auto hidden : candidate_hidden) { pass = pass - && test(bs, hidden, false, true, warmup, iter, AllReduceStrategyType::ONESHOT, - AllReduceStrategyConfig(0), AllReduceFusionOp::RESIDUAL_RMS_NORM); - pass = pass - && test(bs, hidden, true, true, warmup, iter, AllReduceStrategyType::ONESHOT, - AllReduceStrategyConfig(0), AllReduceFusionOp::RESIDUAL_RMS_NORM); - pass = pass - && test(bs, hidden, false, false, warmup, iter, AllReduceStrategyType::ONESHOT, - AllReduceStrategyConfig(0), AllReduceFusionOp::RESIDUAL_RMS_NORM); - pass = pass - && test(bs, hidden, true, false, warmup, iter, AllReduceStrategyType::ONESHOT, + && test(workspace, bs, hidden, false, true, warmup, iter, AllReduceStrategyType::ONESHOT, AllReduceStrategyConfig(0), AllReduceFusionOp::RESIDUAL_RMS_NORM); } } diff --git a/cpp/tests/kernels/decodingKernelTest.cpp b/cpp/tests/kernels/decodingKernelTest.cpp index 0860326ba..9b9a868b4 100644 --- a/cpp/tests/kernels/decodingKernelTest.cpp +++ b/cpp/tests/kernels/decodingKernelTest.cpp @@ -286,6 +286,438 @@ TEST_F(TestBeamHypothesesCopy, SingleBatchTest) 
checkAllEqual(); } +/** + * @brief Fills a slice of a tensor with data from a source array. + * + * This function writes to `tensor` from source array `src` at index `idx. + * It optionally flattens the tensor before performing the insertion. + * For example tensor if we wanted to write 5 values in the 3rd row of [1,10,100] + * We will use (tensor, 2, 5, src, true, mBufferManager) where src is a buffer with at least 5 elems. + * + * @tparam T The type of elements in the source array. + * @param tensor A shared pointer to the tensor to be modified. Also need to be of type T. + * @param idx The index at which to start inserting data into the tensor. + * @param insertLen The number of elements to insert from the source array into the tensor. + * @param src An array containing the data to be inserted into the tensor. + * @param flattenFirst A boolean flag indicating whether to flatten the first dimension of the tensor before insertion. + * @param bufferManager A shared pointer to a BufferManager responsible for managing memory operations. + */ +template +void fillTensorAtIndex(ITensor::SharedPtr tensor, SizeType32 idx, std::vector src, bool flattenFirst, + std::shared_ptr bufferManager) +{ + SizeType32 insertLen = src.size(); + ITensor::SharedPtr target = ITensor::view(tensor); + if (flattenFirst) + { + target->squeeze(0); + } + + target = ITensor::slice(target, idx, 1); + target->squeeze(0); + target = ITensor::slice(target, 0, insertLen); + bufferManager->copy(src.data(), *target); +} + +class TestGatherTree : public ::testing::Test +{ +public: + SizeType32 batchSize{1}; + SizeType32 beamWidth{5}; + SizeType32 maxSeqLen{20}; + + using TensorPtr = ITensor::SharedPtr; + + using DecodingOutputPtr = std::unique_ptr; + DecodingOutputPtr decodingOutput{nullptr}; + + SamplingConfig samplingConfig = SamplingConfig(); + + std::shared_ptr mStream{nullptr}; + std::shared_ptr mBufferManager{nullptr}; + + SamplingConfig mSamplingConfig; + + using DecodingInputPtr = std::unique_ptr; + DecodingInputPtr decodingInput{nullptr}; + + TensorPtr targetOut{nullptr}; + + void SetUp() override + { + mStream = std::make_shared(); + mBufferManager = std::make_shared(mStream); + } + + // create the empty buffers with the correct shapes and zero them + void createBuffers() + { + auto constexpr nvTokenIdType = TRTDataType::value; + auto constexpr nvSizeType = TRTDataType::value; + auto constexpr nvFloatType = TRTDataType::value; + + auto const maxBatchSizeShape = ITensor::makeShape({batchSize}); + auto const maxBatchSizeXmaxBeamWidth = ITensor::makeShape({batchSize, beamWidth}); + auto const jointOutputIdsShape = ITensor::makeShape({batchSize, beamWidth, maxSeqLen}); + + { // prevent reusing these vars after std::move + auto dummyLogits = mBufferManager->emptyTensor(MemoryType::kGPU, nvFloatType); + auto endIds = mBufferManager->emptyTensor(MemoryType::kGPU, nvTokenIdType); + auto batchSlots = mBufferManager->emptyTensor(MemoryType::kPINNED, nvSizeType); + decodingInput = std::make_unique( + 0, 0, 0, 0, std::move(dummyLogits), std::move(endIds), std::move(batchSlots)); + } + auto& dInput = *decodingInput; + + dInput.maxLength = maxSeqLen; + + const_cast(*dInput.endIds).reshape(maxBatchSizeShape); + const_cast(*dInput.batchSlots).reshape(maxBatchSizeShape); + const_cast(*dInput.endIds).reshape(maxBatchSizeShape); + const_cast(*dInput.batchSlots).reshape(maxBatchSizeShape); + auto& inputLengths = const_cast(*dInput.lengths); + dInput.lengths = mBufferManager->gpu(maxBatchSizeXmaxBeamWidth, nvSizeType); + 
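fillTensorAtIndex views the tensor, optionally drops the leading dimension, slices out row idx, and copies src into the first src.size() elements of that row. The same bookkeeping on a flat row-major buffer, as a standalone sketch with std::vector standing in for ITensor:

#include <algorithm>
#include <cstddef>
#include <vector>

// Writes `src` into the first src.size() entries of row `idx` of a row-major
// [numRows, rowLen] buffer, leaving the rest of the row untouched.
// Assumes src.size() <= rowLen and idx < numRows.
template <typename T>
void fillRowPrefix(std::vector<T>& buffer, std::size_t idx, std::size_t rowLen, std::vector<T> const& src)
{
    std::copy(src.begin(), src.end(), buffer.begin() + idx * rowLen);
}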
mBufferManager->setZero(const_cast(*dInput.lengths)); + + { // prevent reusing these vars after std::move + + auto ids = mBufferManager->gpu(jointOutputIdsShape, nvTokenIdType); + mBufferManager->setZero(*ids); + auto gatheredIds = mBufferManager->gpu(jointOutputIdsShape, nvTokenIdType); + mBufferManager->setZero(*gatheredIds); + + decodingOutput = std::make_unique(std::move(ids), std::move(gatheredIds)); + } + auto& dOutput = *decodingOutput; + + dOutput.logProbs = mBufferManager->gpu(jointOutputIdsShape, nvFloatType); + mBufferManager->setZero(*dOutput.logProbs); + dOutput.logProbsTiled = mBufferManager->gpu(ITensor::makeShape({maxSeqLen, batchSize, beamWidth}), nvFloatType); + mBufferManager->setZero(*dOutput.logProbsTiled); + dOutput.lengths = mBufferManager->gpu(ITensor::makeShape({batchSize, beamWidth}), nvSizeType); + mBufferManager->setZero(*dOutput.lengths); + dOutput.cumLogProbs = mBufferManager->gpu(maxBatchSizeXmaxBeamWidth, nvFloatType); + mBufferManager->setZero(*dOutput.cumLogProbs); + + dOutput.beamHypotheses.empty(*mBufferManager); + dOutput.beamHypotheses.reshape(batchSize, beamWidth, maxSeqLen); + + dOutput.finishReasons + = mBufferManager->gpu(maxBatchSizeXmaxBeamWidth, TRTDataType::value); + mBufferManager->setZero(*dOutput.finishReasons); + dOutput.parentIds = mBufferManager->gpu(jointOutputIdsShape, nvTokenIdType); + mBufferManager->setZero(*dOutput.parentIds); + + targetOut = mBufferManager->gpu(jointOutputIdsShape, nvTokenIdType); + mBufferManager->setZero(*targetOut); + } + + // clang-format off + + // hardcode the input data for the output_len = 10 case + // this should not cause any beam swapping from the CBAs, just reorder the beams + void hardcodeBuffersLen10() + { + auto constexpr nvTokenIdType = TRTDataType::value; + auto constexpr nvSizeType = TRTDataType::value; + auto constexpr nvFloatType = TRTDataType::value; + + std::vector len = {3, 3, 3, 3, 3}; + TensorPtr inputLengths{ITensor::slice(constPointerCast(decodingInput->lengths), 0, 1)}; + mBufferManager->copy(len.data(),*inputLengths); + + std::vector> logProbs = + { + {-2.96689, -1.63675, -2.31329, -0.0377979, -2.2442, -1.57552, -0.310524, -0.696636, -2.41985}, + {-2.96689, -1.63675, -2.31329, -0.0377979, -1.31451, -2.63339, -0.534199, -0.493615, -2.61479}, + {-2.96689, -1.63675, -2.31329, -0.0377979, -2.2442, -1.57552, -0.310524, -3.11851, -1.01671}, + {-2.96689, -1.63675, -2.31329, -0.0377979, -1.31451, -2.63339, -0.534199, 0, 0}, + {-2.96689, -1.63675, -2.31329, -0.0377979, -2.2442, -1.57552, -0.310524, -0.696636, -3.62298} + }; + for (SizeType32 it = 0; it < logProbs.size(); it++){ + fillTensorAtIndex(decodingOutput->logProbs, it, logProbs[it], true, mBufferManager); + } + + std::vector> logProbsTiled = + { + {-2.70907, -2.96689, -3.27157, -3.37314, -3.50595}, + {-1.84733, -1.8942, -1.63675, -1.9567, -1.47513}, + {-0.305059, -0.765237, -2.31329, -2.37162, -2.48475}, + {-1.97517, -0.0377979, -2.0169, -2.42439, -2.27471}, + {-1.31451, -2.2442, -1.5831, -2.44732, -2.02409}, + {-1.57552, -2.63339, -2.11286, -2.57304, -3.85214}, + {-0.310524, -0.534199, -0.74379, -2.86232, -1.72914}, + {-0.696636, -0.493615, -0.237725, -3.07164, -3.11851}, + {-2.41985, -2.61479, -1.01671, -3.62298, -1.26586}, + {-0.844337, -0.922832, -0.427682, -0.419985, -1.85996} + }; + TensorPtr logProbsTiledView = ITensor::view(decodingOutput->logProbsTiled,ITensor::makeShape({maxSeqLen*batchSize, beamWidth})); + for (SizeType32 it = 0; it < logProbsTiled.size(); it++){ + auto logProbsSlice = ITensor::slice(logProbsTiledView, 
it+3,1); + mBufferManager->copy(logProbsTiled[it].data(),*logProbsSlice); + } + + std::vector outputLenghts = {13, 13, 13, 13, 13}; + mBufferManager->copy(outputLenghts.data(),*decodingOutput->lengths); + + std::vector cumLogProbs = {-15.0458, -15.4681, -15.8323, -15.8424, -16.0614}; + mBufferManager->copy(cumLogProbs.data(),*decodingOutput->cumLogProbs); + + std::vector> outputIdsCBA = + { + {1, 864, 304, 367, 263, 760, 310, 278, 3815, 29973}, + {1, 864, 304, 367, 263, 760, 310, 1749, 3815, 29973} + }; + for(SizeType32 it = 0; it < outputIdsCBA.size(); it++) + { + fillTensorAtIndex(decodingOutput->beamHypotheses.outputIdsCBA, it, outputIdsCBA[it], true, mBufferManager); + } + + std::vector> logProbsCBA = + { + {0, 0, 0, -2.96689, -1.63675, -2.31329, -0.0377979, -1.31451, -2.63339, -0.534199, -2.19674}, + {0, 0, 0, -2.96689, -1.63675, -2.31329, -0.0377979, -2.2442, -1.57552, -0.310524, -2.81382,} + }; + for(SizeType32 it = 0; it < logProbsCBA.size(); it++) + { + fillTensorAtIndex(decodingOutput->beamHypotheses.logProbsCBA, it, logProbsCBA[it], true, mBufferManager); + } + + std::vector sequenceLengthsCBA = {10, 10, 0, 0, 0, 0, 0, 0, 0, 0}; + mBufferManager->copy(sequenceLengthsCBA.data(), *decodingOutput->beamHypotheses.sequenceLengthsCBA); + + std::vector cumLogProbsCBA = {-13.6336, -13.8988, 0, 0, 0, 0, 0, 0, 0, 0}; + mBufferManager->copy(cumLogProbsCBA.data(), *decodingOutput->beamHypotheses.cumLogProbsCBA); + + std::vector normedScoresCBA = {-1.7042, -1.73735, 0, 0, 0, 0, 0, 0, 0, 0}; + mBufferManager->copy(normedScoresCBA.data(), *decodingOutput->beamHypotheses.normedScoresCBA); + + std::vector numBeamsCBA = {2}; + mBufferManager->copy(numBeamsCBA.data(), *decodingOutput->beamHypotheses.numBeamsCBA); + + std::vector minNormedScoresCBA = {-1.73735}; + mBufferManager->copy(minNormedScoresCBA.data(), *decodingOutput->beamHypotheses.minNormedScoresCBA); + + std::vector batchDones = {0}; + mBufferManager->copy(batchDones.data(), *decodingOutput->beamHypotheses.batchDones); + + std::vector finishReasons = {4, 4, 4, 4, 4}; + mBufferManager->copy(finishReasons.data(), *decodingOutput->finishReasons); + + std::vector> ids = + { + {1, 864, 304, 1073, 825, 1048, 278, 278, 3815, 29973, 13, 4806, 526}, + {1, 864, 304, 367, 920, 304, 310, 1749, 3815, 29973, 13, 4806, 526}, + {1, 864, 304, 679, 263, 760, 679, 263, 29973, 13, 310, 526, 502}, + {1, 864, 304, 1207, 901, 278, 1749, 445, 3889, 393, 591, 13443, 276}, + {1, 864, 304, 1074, 263, 29973, 1207, 263, 2446, 12623, 1334, 29915, 30010} + }; + for(SizeType32 it = 0; it < ids.size(); it++) + { + fillTensorAtIndex(decodingOutput->ids, it, ids[it], true, mBufferManager); + } + + std::vector> parentIds = + { + {0, 0, 0, 0, 0, 3, 0, 1, 1, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 1, 2, 1, 0, 1, 1, 1, 1}, + {0, 0, 0, 0, 1, 2, 1, 4, 3, 2, 4, 4, 3}, + {0, 0, 0, 0, 0, 0, 0, 1, 4, 1, 0, 0, 4}, + {0, 0, 0, 0, 3, 3, 1, 2, 0, 4, 0, 3, 0} + }; + for(SizeType32 it = 0; it < parentIds.size(); it++) + { + fillTensorAtIndex(decodingOutput->parentIds, it, parentIds[it], true, mBufferManager); + } + + std::vector> targetOutput = + { + {1, 864, 304, 367, 263, 760, 310, 1749, 3815, 29973, 13, 4806, 526}, + {1, 864, 304, 367, 263, 760, 310, 278, 3815, 29973, 13, 4806, 526}, + {1, 864, 304, 367, 263, 760, 310, 1749, 3815, 29973, 13, 13443, 502}, + {1, 864, 304, 367, 263, 760, 310, 1749, 3815, 29973, 591, 29915, 276}, + {1, 864, 304, 367, 263, 760, 310, 1749, 3815, 29973, 13, 4806, 30010} + }; + for(SizeType32 it = 0; it < targetOutput.size(); it++) + { + fillTensorAtIndex(targetOut, 
it, targetOutput[it], true, mBufferManager); + } + } + + // this case has the output_len = 8, and tests that the beams from the CBAs are correctly swapped. + void hardcodeBuffersLen8() + { + auto constexpr nvTokenIdType = TRTDataType::value; + auto constexpr nvSizeType = TRTDataType::value; + auto constexpr nvFloatType = TRTDataType::value; + + std::vector len = {3, 3, 3, 3, 3}; + TensorPtr inputLengths{ITensor::slice(constPointerCast(decodingInput->lengths), 0, 1)}; + mBufferManager->copy(len.data(),*inputLengths); + + std::vector >logProbs = + { + {-2.96689, -1.63675, -2.31329, -0.0377979, -2.2442, -1.57552, -0.310524}, + {-2.96689, -1.63675, -2.31329, -0.0377979, -1.31451, -2.63339, -0.534199}, + {-2.96689, -1.63675, -2.31329, -0.0377979, -2.44732, -2.11286, -0.74379}, + {-2.96689, -1.63675, -2.31329, -0.0377979, -1.31451, -2.63339, -2.86232}, + {-2.96689, -1.63675, -2.31329, -0.0377979, -1.31451, -3.85214, -1.72914} + }; + for (SizeType32 it = 0; it < logProbs.size(); it++){ + fillTensorAtIndex(decodingOutput->logProbs, it, logProbs[it], true, mBufferManager); + } + + std::vector> logProbsTiled = + { + {-2.70907, -2.96689, -3.27157, -3.37314, -3.50595}, + {-1.84733, -1.8942, -1.63675, -1.9567, -1.47513}, + {-0.305059, -0.765237, -2.31329, -2.37162, -2.48475}, + {-1.97517, -0.0377979, -2.0169, -2.42439, -2.27471}, + {-1.31451, -2.2442, -1.5831, -2.44732, -2.02409}, + {-1.57552, -2.63339, -2.11286, -2.57304, -3.85214}, + {-0.310524, -0.534199, -0.74379, -2.86232, -1.72914}, + {-0.696636, -0.493615, -0.237725, -3.07164, -3.11851} + }; + TensorPtr logProbsTiledView = ITensor::view(decodingOutput->logProbsTiled,ITensor::makeShape({maxSeqLen*batchSize, beamWidth})); + for (SizeType32 it = 0; it < logProbsTiled.size(); it++){ + auto logProbsSlice = ITensor::slice(logProbsTiledView, it+3,1); + mBufferManager->copy(logProbsTiled[it].data(),*logProbsSlice); + } + std::vector outputLenghts = {11, 11, 11, 11, 11}; + mBufferManager->copy(outputLenghts.data(),*decodingOutput->lengths); + + std::vector cumLogProbs = {-11.7816, -11.9304, -14.0883, -14.1566, -14.2035}; + mBufferManager->copy(cumLogProbs.data(),*decodingOutput->cumLogProbs); + + std::vector> outputIdsCBA = + { + {1, 864, 304, 367, 263, 760, 310, 278, 3815, 29973}, + {1, 864, 304, 367, 263, 760, 310, 1749, 3815, 29973} + }; + for(SizeType32 it = 0; it < outputIdsCBA.size(); it++) + { + fillTensorAtIndex(decodingOutput->beamHypotheses.outputIdsCBA, it, outputIdsCBA[it], true, mBufferManager); + } + + std::vector> logProbsCBA = + { + {0, 0, 0, -2.96689, -1.63675, -2.31329, -0.0377979, -1.31451, -2.63339, -0.534199, -2.19674}, + {0, 0, 0, -2.96689, -1.63675, -2.31329, -0.0377979, -2.2442, -1.57552, -0.310524, -2.81382,} + }; + for(SizeType32 it = 0; it < logProbsCBA.size(); it++) + { + fillTensorAtIndex(decodingOutput->beamHypotheses.logProbsCBA, it, logProbsCBA[it], true, mBufferManager); + } + + std::vector sequenceLengthsCBA = {10, 10, 0, 0, 0, 0, 0, 0, 0, 0}; + mBufferManager->copy(sequenceLengthsCBA.data(), *decodingOutput->beamHypotheses.sequenceLengthsCBA); + + std::vector cumLogProbsCBA = {-13.6336, -13.8988, 0, 0, 0, 0, 0, 0, 0, 0}; + mBufferManager->copy(cumLogProbsCBA.data(), *decodingOutput->beamHypotheses.cumLogProbsCBA); + + std::vector normedScoresCBA = {-1.7042, -1.73735, 0, 0, 0, 0, 0, 0, 0, 0}; + mBufferManager->copy(normedScoresCBA.data(), *decodingOutput->beamHypotheses.normedScoresCBA); + + std::vector numBeamsCBA = {2}; + mBufferManager->copy(numBeamsCBA.data(), *decodingOutput->beamHypotheses.numBeamsCBA); + + 
std::vector minNormedScoresCBA = {-1.73735}; + mBufferManager->copy(minNormedScoresCBA.data(), *decodingOutput->beamHypotheses.minNormedScoresCBA); + + std::vector batchDones = {0}; + mBufferManager->copy(batchDones.data(), *decodingOutput->beamHypotheses.batchDones); + + std::vector finishReasons = {4, 4, 4, 4, 4}; + mBufferManager->copy(finishReasons.data(), *decodingOutput->finishReasons); + + std::vector> ids = + { + {1, 864, 304, 1073, 825, 1048, 278, 278, 3815, 29973, 13}, + {1, 864, 304, 367, 920, 304, 310, 1749, 3815, 29973, 13}, + {1, 864, 304, 679, 263, 760, 679, 263, 29973, 13, 310}, + {1, 864, 304, 1207, 901, 278, 1749, 445, 3889, 393, 591}, + {1, 864, 304, 1074, 263, 29973, 1207, 263, 2446, 12623, 1334} + }; + for(SizeType32 it = 0; it < ids.size(); it++) + { + fillTensorAtIndex(decodingOutput->ids, it, ids[it], true, mBufferManager); + } + + std::vector> parentIds = + { + {0, 0, 0, 0, 0, 3, 0, 1, 1, 0, 0}, + {0, 0, 0, 0, 0, 1, 2, 1, 0, 1, 1}, + {0, 0, 0, 0, 1, 2, 1, 4, 3, 2, 4}, + {0, 0, 0, 0, 0, 0, 0, 1, 4, 1, 0}, + {0, 0, 0, 0, 3, 3, 1, 2, 0, 4, 0} + }; + for(SizeType32 it = 0; it < parentIds.size(); it++) + { + fillTensorAtIndex(decodingOutput->parentIds, it, parentIds[it], true, mBufferManager); + } + + std::vector> targetOutput = + { + {1, 864, 304, 367, 263, 760, 310, 1749, 3815, 29973, 13}, + {1, 864, 304, 367, 263, 760, 310, 278, 3815, 29973, 13}, + {1, 864, 304, 367, 263, 760, 310, 278, 3815, 29973, 0}, + {1, 864, 304, 367, 263, 760, 310, 1749, 3815, 29973, 0}, + {1, 864, 304, 367, 263, 760, 310, 278, 2446, 12623, 310} + }; + for(SizeType32 it = 0; it < targetOutput.size(); it++) + { + fillTensorAtIndex(targetOut, it, targetOutput[it], true, mBufferManager); + } + } + + // clang-format on + + bool checkResult() + { + + TensorPtr reference = this->mBufferManager->copyFrom((*targetOut), tensorrt_llm::runtime::MemoryType::kCPU); + auto referencePtr = bufferCast(*reference); + + TensorPtr real + = this->mBufferManager->copyFrom((*decodingOutput->gatheredIds), tensorrt_llm::runtime::MemoryType::kCPU); + auto realPtr = bufferCast(*real); + + bool allEqual = true; + for (SizeType32 iAssert = 0; iAssert < batchSize * beamWidth * maxSeqLen; iAssert++) + { + if (referencePtr[iAssert] != realPtr[iAssert]) + { + TLLM_LOG_ERROR("Mismatch input value. 
Position of inputs: %d, expected value: %d, output value: %d", + iAssert, referencePtr[iAssert], realPtr[iAssert]); + allEqual = false; + } + } + return allEqual; + } +}; + +TEST_F(TestGatherTree, GatherTreeNoSwap) +{ + createBuffers(); + hardcodeBuffersLen10(); + cudaDeviceSynchronize(); + kernels::gatherTree(*decodingOutput, *decodingInput, *mBufferManager, mSamplingConfig); + cudaDeviceSynchronize(); + + EXPECT_TRUE(checkResult()); +} + +TEST_F(TestGatherTree, GatherTreeWithSwap) +{ + createBuffers(); + hardcodeBuffersLen8(); + cudaDeviceSynchronize(); + kernels::gatherTree(*decodingOutput, *decodingInput, *mBufferManager, mSamplingConfig); + cudaDeviceSynchronize(); + + EXPECT_TRUE(checkResult()); +} + enum AcceptKernelMode { BY_IDS, @@ -872,24 +1304,24 @@ class DecodingKernelsTest : public testing::Test void callAcceptByIds() { - tksp::invokeAcceptDraftTokensByIds(bufferCast(*mDraftTokens), - bufferCast(*mTargetTokens), bufferCast(*mContextLengths), - bufferCast(*mNumsDraftTokens), bufferCast(*mSequenceLengths), - reinterpret_cast(bufferCast(*mFinishedSteps)), - reinterpret_cast(bufferCast(*mFinishedFinal)), - bufferCast(*mFinishedSum), bufferCast(*mBatchSlots), mBatchSize, mMaxBatchSize, - mBeamWidth, mMaxSeqLen, mMaxDraftTokens, mStream->get()); + // tksp::invokeAcceptDraftTokensByIds(bufferCast(*mDraftTokens), + // bufferCast(*mTargetTokens), bufferCast(*mContextLengths), + // bufferCast(*mNumsDraftTokens), bufferCast(*mSequenceLengths), + // reinterpret_cast(bufferCast(*mFinishedSteps)), + // reinterpret_cast(bufferCast(*mFinishedFinal)), + // bufferCast(*mFinishedSum), bufferCast(*mBatchSlots), mBatchSize, mMaxBatchSize, + // mBeamWidth, mMaxSeqLen, mMaxDraftTokens, mStream->get()); } void callAcceptByLogits() { - tksp::acceptDraftTokensByLogits(bufferCast(*mDraftLogits), - reinterpret_cast(bufferCast(*mTargetLogitsPtrs)), bufferCast(*mDraftProbs), - bufferCast(*mTargetProbs), bufferCast(*mNumsDraftTokens), - reinterpret_cast(bufferCast(*mFinishedSteps)), - reinterpret_cast(bufferCast(*mCurandStates)), bufferCast(*mBatchSlots), - mBatchSize, mMaxBatchSize, mBeamWidth, mVocabSize, mVocabSize, mMaxDraftTokens, false, 0.9f, - mStream->get()); + // tksp::acceptDraftTokensByLogits(bufferCast(*mDraftLogits), + // reinterpret_cast(bufferCast(*mTargetLogitsPtrs)), bufferCast(*mDraftProbs), + // bufferCast(*mTargetProbs), bufferCast(*mNumsDraftTokens), + // reinterpret_cast(bufferCast(*mFinishedSteps)), + // reinterpret_cast(bufferCast(*mCurandStates)), + // bufferCast(*mBatchSlots), mBatchSize, mMaxBatchSize, mBeamWidth, mVocabSize, mVocabSize, + // mMaxDraftTokens, false, 0.9f, mStream->get()); } void callAcceptByIdsWithPaths() @@ -1165,7 +1597,7 @@ typedef testing::Types FloatAndHalfTypes; TYPED_TEST_SUITE(DecodingKernelsTest, FloatAndHalfTypes); -TYPED_TEST(DecodingKernelsTest, acceptDraftTokensByIdsKernelSmall) +TYPED_TEST(DecodingKernelsTest, DISABLED_acceptDraftTokensByIdsKernelSmall) { this->runTest(DecodingKernelTestParam() .setBatchSize(1) @@ -1176,7 +1608,7 @@ TYPED_TEST(DecodingKernelsTest, acceptDraftTokensByIdsKernelSmall) .setAcceptMode(AcceptKernelMode::BY_IDS)); } -TYPED_TEST(DecodingKernelsTest, acceptDraftTokensByIdsKernelLarge) +TYPED_TEST(DecodingKernelsTest, DISABLED_acceptDraftTokensByIdsKernelLarge) { this->runTest(DecodingKernelTestParam() .setBatchSize(128) @@ -1187,7 +1619,7 @@ TYPED_TEST(DecodingKernelsTest, acceptDraftTokensByIdsKernelLarge) .setAcceptMode(AcceptKernelMode::BY_IDS)); } -TYPED_TEST(DecodingKernelsTest, acceptDraftTokensByLogitsKernelSmall) 
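Renaming a test with the DISABLED_ prefix is the standard GoogleTest way to keep it compiled but skipped by default; it can still be forced with --gtest_also_run_disabled_tests. A minimal illustration of the convention, unrelated to the kernels above:

#include <gtest/gtest.h>

// Compiled but skipped unless --gtest_also_run_disabled_tests is passed.
TEST(ExampleSuite, DISABLED_UnportedCase)
{
    EXPECT_EQ(1 + 1, 2);
}

// Runs normally.
TEST(ExampleSuite, ActiveCase)
{
    EXPECT_TRUE(true);
}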
+TYPED_TEST(DecodingKernelsTest, DISABLED_acceptDraftTokensByLogitsKernelSmall) { this->runTest(DecodingKernelTestParam() .setBatchSize(1) @@ -1198,7 +1630,7 @@ TYPED_TEST(DecodingKernelsTest, acceptDraftTokensByLogitsKernelSmall) .setAcceptMode(AcceptKernelMode::BY_LOGITS)); } -TYPED_TEST(DecodingKernelsTest, acceptDraftTokensByLogitsKernelLarge) +TYPED_TEST(DecodingKernelsTest, DISABLED_acceptDraftTokensByLogitsKernelLarge) { this->runTest(DecodingKernelTestParam() .setBatchSize(64) diff --git a/cpp/tests/kernels/mixtureOfExpertsTest.cu b/cpp/tests/kernels/mixtureOfExpertsTest.cu index 7d00c5d27..894780b3a 100644 --- a/cpp/tests/kernels/mixtureOfExpertsTest.cu +++ b/cpp/tests/kernels/mixtureOfExpertsTest.cu @@ -1087,11 +1087,37 @@ protected: void BasicPermuteTest(int k = 1, int64_t hidden_size = DEFAULT_HIDDEN_SIZE, int64_t num_experts = 4); std::vector calcPermuteMapExpertParallel(std::vector const& expected_experts); - void ExpertParallelTest(int k = 1); - void TensorParallelTest(int k = 1); + void ExpertParallelTest(int k = 1, int64_t hidden_size = DEFAULT_HIDDEN_SIZE, int64_t num_experts = 4) + { + // 2 experts per rank + ParallelelismTest(k, 1, num_experts / 2, hidden_size, num_experts); + // 1 expert per rank + ParallelelismTest(k, 1, num_experts, hidden_size, num_experts); + } + + void TensorParallelTest(int k = 1, int64_t hidden_size = DEFAULT_HIDDEN_SIZE, int64_t num_experts = 4) + { + ParallelelismTest(k, 2, 1, hidden_size, num_experts); + ParallelelismTest(k, 4, 1, hidden_size, num_experts); + ParallelelismTest(k, 8, 1, hidden_size, num_experts); + } - void MixedParallelTest(int k = 1); + void MixedParallelTest(int k = 1, int64_t hidden_size = DEFAULT_HIDDEN_SIZE, int64_t num_experts = 4) + { + // 2 experts per rank + ParallelelismTest(k, 2, num_experts / 2, hidden_size, num_experts); + ParallelelismTest(k, 4, num_experts / 2, hidden_size, num_experts); + ParallelelismTest(k, 8, num_experts / 2, hidden_size, num_experts); + + // 1 expert per rank + ParallelelismTest(k, 2, num_experts, hidden_size, num_experts); + ParallelelismTest(k, 4, num_experts, hidden_size, num_experts); + ParallelelismTest(k, 8, num_experts, hidden_size, num_experts); + } + + void ParallelelismTest(int k = 1, int tp_size = 4, int ep_size = 2, int64_t hidden_size = DEFAULT_HIDDEN_SIZE, + int64_t num_experts = 4); }; template @@ -1276,6 +1302,7 @@ TYPED_TEST(MixtureOfExpertsTest, PermuteMixtral8x7b) { this->mUseBias = false; this->mActType = tensorrt_llm::ActivationType::Swiglu; + this->mNormMode = tensorrt_llm::kernels::MOEExpertScaleNormalizationMode::RENORMALIZE; this->BasicPermuteTest(2, 4096, 8); } @@ -1299,7 +1326,8 @@ std::vector MixtureOfExpertsTest::calcPermuteMapExpertParallel( } template -void MixtureOfExpertsTest::ExpertParallelTest(int k) +void MixtureOfExpertsTest::ParallelelismTest( + int k, int tp_size, int ep_size, int64_t hidden_size, int64_t num_experts) { if (FP8) { @@ -1307,121 +1335,12 @@ void MixtureOfExpertsTest::ExpertParallelTest(int k) mUseBias = false; } - auto test_archs = getAllTileConfigsToTest(); - for (auto [gemm1, gemm2] : test_archs) + ASSERT_LE(ep_size, num_experts); + if (tp_size == 1) { - mInternalSelectedConfig1 = gemm1; - mInternalSelectedConfig2 = gemm2; - - int64_t hidden_size = DEFAULT_HIDDEN_SIZE; - int parallelism = 2; - int64_t num_experts = 4; - int64_t num_tokens = 3; - - std::vector hidden_states(hidden_size * num_tokens); - auto raw_unquant_input = populateTokens(hidden_states); - - std::vector probs = { - 0.5, 0.1, 0.25, 0.15, // - 0.03, 0.2, 0.07, 0.7, // 
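The new ExpertParallelTest, TensorParallelTest, and MixedParallelTest wrappers express each legacy test as a sweep of ParallelelismTest over (tp_size, ep_size) pairs instead of duplicating the body. A plain sketch of that kind of parameter sweep; the runner callback and the exact combination list are illustrative, not the fixture's real members:

#include <cstdint>
#include <functional>
#include <utility>
#include <vector>

// Runs a single parallelism test body over a list of (tp_size, ep_size) combinations.
void sweepParallelism(std::function<void(int /*tp*/, int /*ep*/)> const& runOne, std::int64_t numExperts)
{
    std::vector<std::pair<int, int>> const combos = {
        {2, 1}, {4, 1}, {8, 1},                                                    // tensor parallel only
        {1, static_cast<int>(numExperts / 2)}, {1, static_cast<int>(numExperts)},  // expert parallel only
        {2, static_cast<int>(numExperts / 2)}, {4, static_cast<int>(numExperts)},  // mixed
    };
    for (auto const& [tp, ep] : combos)
    {
        runOne(tp, ep);
    }
}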
- 0.25, 0.21, 0.35, 0.19, // - }; - - std::vector expected_experts{0, 3, 2}; - if (k == 2) - expected_experts = {0, 2, 3, 1, 2, 0}; - else if (k == 3) - expected_experts = {0, 2, 3, 3, 1, 2, 2, 0, 1}; - std::vector results(hidden_states.size(), 0); - for (int i = 0; i < parallelism; i++) - { - if (i == 0) - { - // Only need to init the inputs on the first iteration - runMoEPermute({hidden_states}, {probs}, hidden_size, num_experts, k, {}, - MOEParallelismConfig{1, 0, parallelism, i}); - } - else - { - runMoEPermute(MOEParallelismConfig{1, 0, parallelism, i}); - } - - auto selected_expert = getDataFromDevice(mSelectedExpert, num_tokens * k); - // Experts should only be selected when we are on the right node - // Note the index is [0,num_experts_per_node), so we offset the experts by the start for this node - int const start_expert = i * (mNumExperts / parallelism); - std::transform(selected_expert.begin(), selected_expert.end(), selected_expert.begin(), - [&](int val) { return val >= mNumExperts ? val : val + start_expert; }); - auto masked_expected_experts = maskSelectedExpertsForTP(expected_experts, parallelism, i); - ASSERT_EQ(selected_expert, masked_expected_experts); - - auto proj_map = getDataFromDevice(mSourceToExpandedMap, num_tokens * k); - auto permute_map = calcPermuteMapExpertParallel(masked_expected_experts); - ASSERT_EQ(permute_map, proj_map) << "Iteration " << i; - compareSoftmax(expected_experts, probs); - - // Do the final reduce - auto iter_results = getDataFromDevice(mFinalOutput, num_tokens * hidden_size); - std::transform( - iter_results.cbegin(), iter_results.cend(), results.cbegin(), results.begin(), std::plus<>{}); - } - - compareFinal(expected_experts, probs, raw_unquant_input, results); - } -} - -TYPED_TEST(MixtureOfExpertsTest, ExpertParallel) -{ - this->ExpertParallelTest(); -} - -TYPED_TEST(MixtureOfExpertsTest, ExpertParallelK2) -{ - this->ExpertParallelTest(2); -} - -TYPED_TEST(MixtureOfExpertsTest, ExpertParallelNoBias) -{ - this->mUseBias = false; - this->ExpertParallelTest(); - this->ExpertParallelTest(2); -} - -TYPED_TEST(MixtureOfExpertsTest, ExpertParallelRenorm) -{ - this->mNormMode = MOEExpertScaleNormalizationMode::RENORMALIZE; - this->ExpertParallelTest(); - this->ExpertParallelTest(2); -} - -TYPED_TEST(MixtureOfExpertsTest, ExpertParallelSparseMixer) -{ - this->mNormMode = MOEExpertScaleNormalizationMode::SPARSE_MIXER; - this->ExpertParallelTest(); - this->ExpertParallelTest(2); -} - -TYPED_TEST(MixtureOfExpertsTest, ExpertParallelGeglu) -{ - this->mActType = tensorrt_llm::ActivationType::Geglu; - this->ExpertParallelTest(); - this->ExpertParallelTest(2); -} - -TYPED_TEST(MixtureOfExpertsTest, ExpertParallelSwiglu) -{ - this->mActType = tensorrt_llm::ActivationType::Swiglu; - this->ExpertParallelTest(); - this->ExpertParallelTest(2); -} - -template -void MixtureOfExpertsTest::TensorParallelTest(int k) -{ - if (FP8) - { - // TODO Remove this when bias + FP8 is supported - mUseBias = false; + // Only the first 4 experts are ever used. 
They should be split across at least 2 ranks + ASSERT_LT(num_experts / ep_size, 4) + << "Expert parallelism must have less than 4 experts per rank or the test is ineffective"; } auto test_archs = getAllTileConfigsToTest(); @@ -1430,9 +1349,6 @@ void MixtureOfExpertsTest::TensorParallelTest(int k) mInternalSelectedConfig1 = gemm1; mInternalSelectedConfig2 = gemm2; - int64_t hidden_size = DEFAULT_HIDDEN_SIZE; - int parallelism = 8; - int64_t num_experts = 4; int64_t num_tokens = 3; std::vector hidden_states(hidden_size * num_tokens); @@ -1444,130 +1360,9 @@ void MixtureOfExpertsTest::TensorParallelTest(int k) 0.25, 0.21, 0.35, 0.19, // }; - std::vector expected_experts{0, 3, 2}; - if (k == 2) - expected_experts = {0, 2, 3, 1, 2, 0}; - else if (k == 3) - expected_experts = {0, 2, 3, 3, 1, 2, 2, 0, 1}; - std::vector results(hidden_states.size(), 0); - for (int i = 0; i < parallelism; i++) - { - if (i == 0) - { - // Only need to init the inputs on the first iteration - runMoEPermute({hidden_states}, {probs}, hidden_size, num_experts, k, {}, - MOEParallelismConfig{parallelism, i, 1, 0}); - } - else - { - runMoEPermute(MOEParallelismConfig{parallelism, i, 1, 0}); - } - - auto selected_expert = getDataFromDevice(mSelectedExpert, num_tokens * k); - EXPECT_EQ(selected_expert, expected_experts); - - auto proj_map = getDataFromDevice(mSourceToExpandedMap, num_tokens * k); - std::vector permute_map{0, 2, 1}; - if (k == 2) - permute_map = {0, 5, 4, 3, 2, 1}; - if (k == 3) - permute_map = {0, 8, 6, 4, 2, 1, 7, 5, 3}; - - ASSERT_EQ(permute_map, proj_map) << "Iteration " << i; - - // Do the final reduce - auto iter_results = getDataFromDevice(mFinalOutput, num_tokens * hidden_size); - std::transform( - iter_results.cbegin(), iter_results.cend(), results.cbegin(), results.begin(), std::plus<>{}); - } - - compareFinal(expected_experts, probs, raw_unquant_input, results); - } -} - -TYPED_TEST(MixtureOfExpertsTest, TensorParallel) -{ - this->TensorParallelTest(); -} - -TYPED_TEST(MixtureOfExpertsTest, TensorParallelK2) -{ - this->TensorParallelTest(2); -} - -TYPED_TEST(MixtureOfExpertsTest, TensorParallelK3) -{ - this->TensorParallelTest(3); -} - -TYPED_TEST(MixtureOfExpertsTest, TensorParallelNoBias) -{ - this->mUseBias = false; - this->TensorParallelTest(); - this->TensorParallelTest(2); - this->TensorParallelTest(3); -} - -TYPED_TEST(MixtureOfExpertsTest, TensorParallelRenorm) -{ - this->mNormMode = MOEExpertScaleNormalizationMode::RENORMALIZE; - this->TensorParallelTest(); - this->TensorParallelTest(2); - this->TensorParallelTest(3); -} - -TYPED_TEST(MixtureOfExpertsTest, TensorParallelSparseMixer) -{ - this->mNormMode = MOEExpertScaleNormalizationMode::SPARSE_MIXER; - this->TensorParallelTest(); - this->TensorParallelTest(2); -} - -TYPED_TEST(MixtureOfExpertsTest, TensorParallelGeglu) -{ - this->mActType = tensorrt_llm::ActivationType::Geglu; - this->TensorParallelTest(); - this->TensorParallelTest(2); - this->TensorParallelTest(3); -} - -TYPED_TEST(MixtureOfExpertsTest, TensorParallelSwiglu) -{ - this->mActType = tensorrt_llm::ActivationType::Swiglu; - this->TensorParallelTest(); - this->TensorParallelTest(2); - this->TensorParallelTest(3); -} - -template -void MixtureOfExpertsTest::MixedParallelTest(int k) -{ - if (FP8) - { - // TODO Remove this when bias + FP8 is supported - mUseBias = false; - } - - auto test_archs = getAllTileConfigsToTest(); - for (auto [gemm1, gemm2] : test_archs) - { - mInternalSelectedConfig1 = gemm1; - mInternalSelectedConfig2 = gemm2; - - int64_t hidden_size = 
DEFAULT_HIDDEN_SIZE; - int tp_parallelism = 2; - int ep_parallelism = 2; - int64_t num_experts = 4; - int64_t num_tokens = 3; - - std::vector hidden_states(hidden_size * num_tokens); - auto raw_unquant_input = populateTokens(hidden_states); - - std::vector probs = { - 0.5, 0.1, 0.25, 0.15, // - 0.03, 0.2, 0.07, 0.7, // - 0.25, 0.21, 0.35, 0.19, // - }; + std::vector> hidden_input = {hidden_states}; + std::vector> router_input = {probs}; + resizeRouterInputs(router_input, num_experts, num_tokens); std::vector expected_experts{0, 3, 2}; if (k == 2) @@ -1575,34 +1370,34 @@ void MixtureOfExpertsTest::MixedParallelTest(int k) else if (k == 3) expected_experts = {0, 2, 3, 3, 1, 2, 2, 0, 1}; std::vector results(hidden_states.size(), 0); - for (int i = 0; i < tp_parallelism; i++) + for (int i = 0; i < tp_size; i++) { - for (int j = 0; j < ep_parallelism; j++) + for (int j = 0; j < ep_size; j++) { if (i == 0 && j == 0) { // Only need to init the inputs on the first iteration - runMoEPermute({hidden_states}, {probs}, hidden_size, num_experts, k, {}, - MOEParallelismConfig{tp_parallelism, i, ep_parallelism, j}); + runMoEPermute(hidden_input, router_input, hidden_size, num_experts, k, {}, + MOEParallelismConfig{tp_size, i, ep_size, j}); } else { - runMoEPermute(MOEParallelismConfig{tp_parallelism, i, ep_parallelism, j}); + runMoEPermute(MOEParallelismConfig{tp_size, i, ep_size, j}); } auto selected_expert = getDataFromDevice(mSelectedExpert, num_tokens * k); // Experts should only be selected when we are on the right node // Note the index is [0,num_experts_per_node), so we offset the experts by the start for this node - int const start_expert = j * (mNumExperts / ep_parallelism); + int const start_expert = j * (mNumExperts / ep_size); std::transform(selected_expert.begin(), selected_expert.end(), selected_expert.begin(), [&](int val) { return val >= mNumExperts ? 
val : val + start_expert; }); - auto masked_expected_experts = maskSelectedExpertsForTP(expected_experts, ep_parallelism, j); + auto masked_expected_experts = maskSelectedExpertsForTP(expected_experts, ep_size, j); ASSERT_EQ(selected_expert, masked_expected_experts); auto proj_map = getDataFromDevice(mSourceToExpandedMap, num_tokens * k); auto permute_map = calcPermuteMapExpertParallel(masked_expected_experts); ASSERT_EQ(permute_map, proj_map) << "Iteration " << i << " " << j; - compareSoftmax(expected_experts, probs); + compareSoftmax(expected_experts, router_input[0]); // Do the final reduce auto iter_results = getDataFromDevice(mFinalOutput, num_tokens * hidden_size); @@ -1611,54 +1406,76 @@ void MixtureOfExpertsTest::MixedParallelTest(int k) } } - compareFinal(expected_experts, probs, raw_unquant_input, results); + compareFinal(expected_experts, router_input[0], raw_unquant_input, results); } } -TYPED_TEST(MixtureOfExpertsTest, MixedParallel) -{ - this->MixedParallelTest(); -} - -TYPED_TEST(MixtureOfExpertsTest, MixedParallelK2) -{ - this->MixedParallelTest(2); -} - -TYPED_TEST(MixtureOfExpertsTest, MixedParallelNoBias) -{ - this->mUseBias = false; - this->MixedParallelTest(); - this->MixedParallelTest(2); -} - -TYPED_TEST(MixtureOfExpertsTest, MixedParallelRenorm) -{ - this->mNormMode = MOEExpertScaleNormalizationMode::RENORMALIZE; - this->MixedParallelTest(); - this->MixedParallelTest(2); -} - -TYPED_TEST(MixtureOfExpertsTest, MixedParallelSparseMixer) -{ - this->mNormMode = MOEExpertScaleNormalizationMode::SPARSE_MIXER; - this->MixedParallelTest(); - this->MixedParallelTest(2); -} - -TYPED_TEST(MixtureOfExpertsTest, MixedParallelGeglu) -{ - this->mActType = tensorrt_llm::ActivationType::Geglu; - this->MixedParallelTest(); - this->MixedParallelTest(2); -} +#define PARALLEL_TEST_SUITE(ParallelismType) \ + TYPED_TEST(MixtureOfExpertsTest, ParallelismType) \ + { \ + this->ParallelismType##Test(); \ + } \ + \ + TYPED_TEST(MixtureOfExpertsTest, ParallelismType##K2) \ + { \ + this->ParallelismType##Test(2); \ + } \ + \ + TYPED_TEST(MixtureOfExpertsTest, ParallelismType##K3) \ + { \ + this->ParallelismType##Test(3); \ + } \ + \ + TYPED_TEST(MixtureOfExpertsTest, ParallelismType##NoBias) \ + { \ + this->mUseBias = false; \ + this->ParallelismType##Test(); \ + this->ParallelismType##Test(2); \ + this->ParallelismType##Test(3); \ + } \ + \ + TYPED_TEST(MixtureOfExpertsTest, ParallelismType##Renorm) \ + { \ + this->mNormMode = MOEExpertScaleNormalizationMode::RENORMALIZE; \ + this->ParallelismType##Test(); \ + this->ParallelismType##Test(2); \ + this->ParallelismType##Test(3); \ + } \ + TYPED_TEST(MixtureOfExpertsTest, ParallelismType##SparseMixer) \ + { \ + this->mNormMode = MOEExpertScaleNormalizationMode::SPARSE_MIXER; \ + this->ParallelismType##Test(); \ + this->ParallelismType##Test(2); \ + /* k=3 is not supported for sparse mixer tests */ \ + } \ + \ + TYPED_TEST(MixtureOfExpertsTest, ParallelismType##Geglu) \ + { \ + this->mActType = tensorrt_llm::ActivationType::Geglu; \ + this->ParallelismType##Test(); \ + this->ParallelismType##Test(2); \ + this->ParallelismType##Test(3); \ + } \ + \ + TYPED_TEST(MixtureOfExpertsTest, ParallelismType##Swiglu) \ + { \ + this->mActType = tensorrt_llm::ActivationType::Swiglu; \ + this->ParallelismType##Test(); \ + this->ParallelismType##Test(2); \ + this->ParallelismType##Test(3); \ + } \ + \ + TYPED_TEST(MixtureOfExpertsTest, ParallelismType##Mixtral8x7b) \ + { \ + this->mUseBias = false; \ + this->mActType = tensorrt_llm::ActivationType::Swiglu; \ + 
this->mNormMode = tensorrt_llm::kernels::MOEExpertScaleNormalizationMode::RENORMALIZE; \ + this->ParallelismType##Test(2, 4096, 8); \ + } -TYPED_TEST(MixtureOfExpertsTest, MixedParallelSwiglu) -{ - this->mActType = tensorrt_llm::ActivationType::Swiglu; - this->MixedParallelTest(); - this->MixedParallelTest(2); -} +PARALLEL_TEST_SUITE(ExpertParallel) +PARALLEL_TEST_SUITE(TensorParallel) +PARALLEL_TEST_SUITE(MixedParallel) TYPED_TEST(MixtureOfExpertsTest, ConfigSweep) { diff --git a/cpp/tests/kernels/sampling/samplingAirTopPTest.cpp b/cpp/tests/kernels/sampling/samplingAirTopPTest.cpp index 399595583..e3c479ba4 100644 --- a/cpp/tests/kernels/sampling/samplingAirTopPTest.cpp +++ b/cpp/tests/kernels/sampling/samplingAirTopPTest.cpp @@ -62,7 +62,7 @@ class AirTopPSamplingKernelTest : public SamplingKernelTest tk::TopPSamplingKernelParams kernelParams; kernelParams.probs = bufferCast(*this->mProbsDevice); - kernelParams.outputIds = bufferCast(*this->mIdsPtrHost); + kernelParams.outputIdsPtrs = bufferCast(*this->mIdsPtrHost); kernelParams.workspace = workspaceDevice->data(); kernelParams.topPs = bufferCast(*this->mTopPsDevice); kernelParams.sequenceLength = bufferCast(*this->mSeqLengthsDevice); @@ -91,54 +91,59 @@ TYPED_TEST_SUITE(AirTopPSamplingKernelTest, FloatAndHalfTypes); TYPED_TEST(AirTopPSamplingKernelTest, NondeterministicCorrectnessSmallP) { - this->runTest(SamplingKernelTestParam().setBatchSize(6).setVocabSize(4).setTopP(0.2f)); + this->runTest(SamplingKernelTestParam().setBatchSize(6).setVocabSize(4).setTopK(0).setTopP(0.2f)); }; TYPED_TEST(AirTopPSamplingKernelTest, NondeterministicCorrectnessLargeP) { - this->runTest(SamplingKernelTestParam().setBatchSize(6).setVocabSize(4).setTopP(0.9f)); + this->runTest(SamplingKernelTestParam().setBatchSize(6).setVocabSize(4).setTopK(0).setTopP(0.9f)); }; TYPED_TEST(AirTopPSamplingKernelTest, NondeterministicCorrectnessAncestral) { - this->runTest(SamplingKernelTestParam().setBatchSize(6).setVocabSize(4).setTopP(1.0f)); + this->runTest(SamplingKernelTestParam().setBatchSize(6).setVocabSize(4).setTopK(0).setTopP(1.0f)); }; TYPED_TEST(AirTopPSamplingKernelTest, NondeterministicCorrectnessLargeVocabSmallP) { - this->runTest(SamplingKernelTestParam().setBatchSize(32).setVocabSize(51200).setTopP(0.2f)); + this->runTest(SamplingKernelTestParam().setBatchSize(32).setVocabSize(51200).setTopK(0).setTopP(0.2f)); }; TYPED_TEST(AirTopPSamplingKernelTest, NondeterministicCorrectnessLargeVocabLargeP) { - this->runTest(SamplingKernelTestParam().setBatchSize(32).setVocabSize(51200).setTopP(0.9f)); + this->runTest(SamplingKernelTestParam().setBatchSize(32).setVocabSize(51200).setTopK(0).setTopP(0.9f)); }; TYPED_TEST(AirTopPSamplingKernelTest, DeterministicCorrectnessSmallP) { - this->runTest(SamplingKernelTestParam().setBatchSize(6).setVocabSize(4).setTopP(0.2f).setDeterministicTopP(true)); + this->runTest( + SamplingKernelTestParam().setBatchSize(6).setVocabSize(4).setTopK(0).setTopP(0.2f).setDeterministicTopP(true)); }; TYPED_TEST(AirTopPSamplingKernelTest, DeterministicCorrectnessLargeP) { - this->runTest(SamplingKernelTestParam().setBatchSize(6).setVocabSize(4).setTopP(0.9f).setDeterministicTopP(true)); + this->runTest( + SamplingKernelTestParam().setBatchSize(6).setVocabSize(4).setTopK(0).setTopP(0.9f).setDeterministicTopP(true)); }; TYPED_TEST(AirTopPSamplingKernelTest, DeterministicCorrectnessAncestral) { - this->runTest(SamplingKernelTestParam().setBatchSize(6).setVocabSize(4).setTopP(1.0f).setDeterministicTopP(true)); + this->runTest( + 
SamplingKernelTestParam().setBatchSize(6).setVocabSize(4).setTopK(0).setTopP(1.0f).setDeterministicTopP(true)); }; TYPED_TEST(AirTopPSamplingKernelTest, DeterministicCorrectnessLargeVocabSmallP) { this->runTest( - SamplingKernelTestParam().setBatchSize(32).setVocabSize(51200).setTopP(0.2f).setDeterministicTopP(true)); + SamplingKernelTestParam().setBatchSize(32).setVocabSize(51200).setTopK(0).setTopP(0.2f).setDeterministicTopP( + true)); }; TYPED_TEST(AirTopPSamplingKernelTest, DeterministicCorrectnessLargeVocabLargeP) { this->runTest( - SamplingKernelTestParam().setBatchSize(32).setVocabSize(51200).setTopP(0.9f).setDeterministicTopP(true)); + SamplingKernelTestParam().setBatchSize(32).setVocabSize(51200).setTopK(0).setTopP(0.9f).setDeterministicTopP( + true)); }; class AirTopPSamplingKernelUtilsTest : public SamplingKernelTest diff --git a/cpp/tests/kernels/sampling/samplingTest.cpp b/cpp/tests/kernels/sampling/samplingTest.cpp index c7f9cd2b6..d5d900244 100644 --- a/cpp/tests/kernels/sampling/samplingTest.cpp +++ b/cpp/tests/kernels/sampling/samplingTest.cpp @@ -110,6 +110,8 @@ void SamplingKernelTest::setupBuffers(SamplingKernelTestParam const& param) auto const topK = param.topK; auto const topP = param.topP; + // TopK == 0 case (TopP kernel) + auto const topKDistUpperBound = std::max(topK, static_cast(1)); std::mt19937 gen(42); @@ -133,7 +135,7 @@ void SamplingKernelTest::setupBuffers(SamplingKernelTestParam const& param) 0, vocabSize - 1); // -1 because uniform_int_distribution generates closed interval std::uniform_real_distribution<> skipDecodeDist(0, 1); std::uniform_real_distribution<> topPDist(0, topP); - std::uniform_int_distribution<> topKDist(1, topK); + std::uniform_int_distribution<> topKDist(1, topKDistUpperBound); std::uniform_int_distribution<> tokensPerStepDist(1, maxTokensPerStep); std::uniform_int_distribution<> seqLenDist(0, mMaxSeqLen - maxTokensPerStep); std::uniform_real_distribution<> logProbDist(-3.f, 3.f); @@ -158,7 +160,7 @@ void SamplingKernelTest::setupBuffers(SamplingKernelTestParam const& param) endIdsHostPtr[bi] = endIdsDistr(gen); skipDecodeHostPtr[bi] = skipDecodeDist(gen) > 0.8; topPsHostPtr[bi] = topPDist(gen); - topKsHostPtr[bi] = topKDist(gen); + topKsHostPtr[bi] = topK == 0 ? 0 : topKDist(gen); tokensPerStepPtr[bi] = tokensPerStepDist(gen); finishedHostPtr[bi] = finishedDist(gen) > 0.8 ? 
tk::FinishedState::finished() : tk::FinishedState::empty(); } @@ -196,9 +198,9 @@ void SamplingKernelTest::setupBuffers(SamplingKernelTestParam const& param) // Init logits randomly auto logitsHostPtr = bufferCast(*mLogitsHost); initRandom(logitsHostPtr, batchSize * maxTokensPerStep * vocabSize, -3.0f, 3.0f); - // Only in greedy search we can guarantee the selected token and stop by condition - if (topK == 1) + // TopK == 1 for TopK kernel greedy, TopK == 0 for TopP kernels + if (topK <= 1) { for (SizeType32 bi = 0; bi < batchSize; ++bi) { @@ -231,13 +233,29 @@ std::vector SamplingKernelTest::computeTopKTopPVariants( auto topK = bufferCast(*mTopKsHost)[batchSlot]; auto topP = bufferCast(*mTopPsHost)[batchSlot]; - allowedTokens.insert(allowedTokens.begin(), indices.begin(), indices.begin() + topK); + if (topK > 0) // handling the top-K kernel: the top-P mass is taken over the top-K tokens + { + float sSum = 0.f; // sSum as in samplingTopKKernels.cu + for (auto ki = 0; ki < topK; ki++) + { + sSum += static_cast(probsPtr[indices[ki]]); + } + topP *= sSum; // the adjusted topP in the selected topK distribution + } + float totalProb = 0.f; SizeType32 idx = 0; while (totalProb < topP && idx < vocabSize) { allowedTokens.push_back(indices[idx]); totalProb += static_cast(probsPtr[indices[idx++]]); + // CUDA may select a different index with the same probability during the kernel reduction; the test accepts any of them + while (idx < vocabSize + && static_cast(probsPtr[indices[idx]]) == static_cast(probsPtr[indices[idx - 1]])) + { + allowedTokens.push_back(indices[idx]); + totalProb += static_cast(probsPtr[indices[idx++]]); + } } return allowedTokens; } @@ -284,12 +302,15 @@ void SamplingKernelTest::verifyResult(SamplingKernelTestParam const& param) auto const tokensPerStep = tokensPerStepPtr[batchSlot]; for (SizeType32 ti = 0; ti < tokensPerStep; ++ti) { - auto kResults = param.returnAllTopK ? bufferCast(*mTopKsHost)[batchSlot] : 1; - - for (SizeType32 ki = 0; ki < kResults; ++ki) + auto topK = bufferCast(*mTopKsHost)[batchSlot]; + auto kResults = param.returnAllSelectedTokens ? (topK == 0 ? vocabSize : topK) : 1; + auto topKTopPVariants = computeTopKTopPVariants(bi, batchSlot, ti, maxTokensPerStep, vocabSize); + SizeType32 ki; + for (ki = 0; ki < kResults && ki < topKTopPVariants.size(); ++ki) { // Set reference finished state to true if we finished before or at current step - auto const idsIdx = param.returnAllTopK ? ti * mMaxTopK + ki : seqLengthsOrigHostPtr[batchSlot] + ti; + auto const idsIdx + = param.returnAllSelectedTokens ?
ti * mMaxTopK + ki : seqLengthsOrigHostPtr[batchSlot] + ti; auto const outputId = outputIdsHostPtr[batchSlot * mMaxSeqLen + idsIdx]; // Check the range of the returned token ([0, vocabSize)) EXPECT_TRUE((outputId >= 0) && (outputId < vocabSize)); @@ -299,7 +320,7 @@ void SamplingKernelTest::verifyResult(SamplingKernelTestParam const& param) if (!skipDecodeHostPtr[batchSlot] && !finishedOrigHostPtr[batchSlot].isFinished() && !finishedOrigHostPtr[batchSlot].isSkipDecoding()) { - if (maxTokensPerStep == 1 && !param.returnAllTopK) + if (maxTokensPerStep == 1 && !param.returnAllSelectedTokens) { if (generatedEOS) { @@ -314,8 +335,6 @@ void SamplingKernelTest::verifyResult(SamplingKernelTestParam const& param) } } - auto topKTopPVariants = computeTopKTopPVariants(bi, batchSlot, ti, maxTokensPerStep, vocabSize); - bool found = false; for (auto const& var : topKTopPVariants) { @@ -340,11 +359,24 @@ void SamplingKernelTest::verifyResult(SamplingKernelTestParam const& param) EXPECT_EQ(finishedHostPtr[batchSlot].isFinished(), finishedOrigHostPtr[batchSlot].isFinished()); } } + + // a boundary check for returnAllSelectedTokens in topP kernel and when TopP selected indices < topK in topK + // kernel. + if (!skipDecodeHostPtr[batchSlot] && !finishedOrigHostPtr[batchSlot].isFinished() + && !finishedOrigHostPtr[batchSlot].isSkipDecoding()) + { + if (param.returnAllSelectedTokens && (topK == 0 || ki != topK)) + { + auto const idsIdx = ti * mMaxTopK + ki; + auto const outputId = outputIdsHostPtr[batchSlot * mMaxSeqLen + idsIdx]; + EXPECT_EQ(outputId, -1); + } + } } } // Cum log probs is not supported for multiple tokens per step or all top K return - if (maxTokensPerStep == 1 && !param.returnAllTopK) + if (maxTokensPerStep == 1 && !param.returnAllSelectedTokens) { for (int32_t bi = 0; bi < batchSize; ++bi) { diff --git a/cpp/tests/kernels/sampling/samplingTest.h b/cpp/tests/kernels/sampling/samplingTest.h index 33d4e46b0..10de1f059 100644 --- a/cpp/tests/kernels/sampling/samplingTest.h +++ b/cpp/tests/kernels/sampling/samplingTest.h @@ -194,7 +194,7 @@ struct SamplingKernelTestParam bool normalizeLogProbs{false}; bool logitsHasProbs{true}; int32_t maxTokensPerStep{1}; - bool returnAllTopK{false}; + bool returnAllSelectedTokens{false}; bool useLogitsPtrs{false}; bool isDeterministicTopP{false}; @@ -228,9 +228,9 @@ struct SamplingKernelTestParam return *this; } - SamplingKernelTestParam& setReturnAllTopK() + SamplingKernelTestParam& setReturnAllSelectedTokens() { - returnAllTopK = true; + returnAllSelectedTokens = true; return *this; } diff --git a/cpp/tests/kernels/sampling/samplingTopKTest.cpp b/cpp/tests/kernels/sampling/samplingTopKTest.cpp index 0d3ea5b78..2bb5763fc 100644 --- a/cpp/tests/kernels/sampling/samplingTopKTest.cpp +++ b/cpp/tests/kernels/sampling/samplingTopKTest.cpp @@ -70,10 +70,10 @@ class TopKSamplingKernelTest : public SamplingKernelTest kernelParams.finishedOutput = reinterpret_cast( bufferCast(*this->mFinishedDevice)); kernelParams.skipDecode = bufferCast(*this->mSkipDecodeDevice); - kernelParams.cumLogProbs = params.returnAllTopK || params.maxTokensPerStep > 1 + kernelParams.cumLogProbs = params.returnAllSelectedTokens || params.maxTokensPerStep > 1 ? nullptr : bufferCast(*this->mCumLogProbsDevice); - kernelParams.outputLogProbs = params.returnAllTopK || params.maxTokensPerStep > 1 + kernelParams.outputLogProbs = params.returnAllSelectedTokens || params.maxTokensPerStep > 1 ? 
nullptr : bufferCast(*this->mOutputLogProbsDevice); kernelParams.curandState = reinterpret_cast(bufferCast(*this->mCurandStatesDevice)); @@ -84,7 +84,7 @@ class TopKSamplingKernelTest : public SamplingKernelTest kernelParams.vocabSizePadded = params.vocabSize; kernelParams.normalizeLogProbs = params.normalizeLogProbs; kernelParams.logitsHasProbs = params.logitsHasProbs; - kernelParams.returnAllTopK = params.returnAllTopK; + kernelParams.returnAllSelectedTokens = params.returnAllSelectedTokens; // Perform batched TopK sampling tk::invokeBatchTopKSampling(kernelParams, this->mStream->get()); @@ -136,7 +136,7 @@ TYPED_TEST(TopKSamplingKernelTest, CorrectnessTopKMaxTokensPerStep) SamplingKernelTestParam().setBatchSize(16).setVocabSize(4000).setTopK(63).setTopP(1.0f).setMaxTokensPerStep(4)); }; -TYPED_TEST(TopKSamplingKernelTest, CorrectnessReturnAllTopK) +TYPED_TEST(TopKSamplingKernelTest, CorrectnessReturnAllSelectedTokens) { this->runTest(SamplingKernelTestParam() .setBatchSize(16) @@ -144,7 +144,18 @@ TYPED_TEST(TopKSamplingKernelTest, CorrectnessReturnAllTopK) .setTopK(10) .setTopP(1.0f) .setMaxTokensPerStep(4) - .setReturnAllTopK()); + .setReturnAllSelectedTokens()); +}; + +TYPED_TEST(TopKSamplingKernelTest, CorrectnessReturnAllSelectedTokensSmallP) +{ + this->runTest(SamplingKernelTestParam() + .setBatchSize(16) + .setVocabSize(50) + .setTopK(20) + .setTopP(0.3f) + .setMaxTokensPerStep(4) + .setReturnAllSelectedTokens()); }; TYPED_TEST(TopKSamplingKernelTest, CorrectnessLogitsPtrs) diff --git a/cpp/tests/kernels/sampling/samplingTopPTest.cpp b/cpp/tests/kernels/sampling/samplingTopPTest.cpp index c09133312..92fc81738 100644 --- a/cpp/tests/kernels/sampling/samplingTopPTest.cpp +++ b/cpp/tests/kernels/sampling/samplingTopPTest.cpp @@ -53,7 +53,7 @@ class TopPSamplingKernelTest : public SamplingKernelTest tk::TopPSamplingKernelParams kernelParams; kernelParams.probs = bufferCast(*this->mProbsDevice); - kernelParams.outputIds = bufferCast(*this->mIdsPtrHost); + kernelParams.outputIdsPtrs = bufferCast(*this->mIdsPtrHost); kernelParams.workspace = workspaceDevice->data(); kernelParams.topPs = bufferCast(*this->mTopPsDevice); kernelParams.sequenceLength = bufferCast(*this->mSeqLengthsDevice); @@ -64,12 +64,15 @@ class TopPSamplingKernelTest : public SamplingKernelTest kernelParams.finishedOutput = reinterpret_cast( bufferCast(*this->mFinishedDevice)); kernelParams.skipDecode = bufferCast(*this->mSkipDecodeDevice); - kernelParams.cumLogProbs = bufferCast(*this->mCumLogProbsDevice); - kernelParams.outputLogProbs = bufferCast(*this->mOutputLogProbsDevice); + kernelParams.cumLogProbs + = params.returnAllSelectedTokens ? nullptr : bufferCast(*this->mCumLogProbsDevice); + kernelParams.outputLogProbs + = params.returnAllSelectedTokens ? 
nullptr : bufferCast(*this->mOutputLogProbsDevice); kernelParams.curandState = reinterpret_cast(bufferCast(*this->mCurandStatesDevice)); kernelParams.batchSize = params.batchSize; kernelParams.maxBatchSize = maxBatchSize; kernelParams.vocabSizePadded = params.vocabSize; + kernelParams.returnAllSelectedTokens = params.returnAllSelectedTokens; // Perform batched TopP sampling tk::invokeBatchTopPSampling(kernelParams, this->mStream->get()); @@ -80,26 +83,36 @@ TYPED_TEST_SUITE(TopPSamplingKernelTest, FloatAndHalfTypes); TYPED_TEST(TopPSamplingKernelTest, CorrectnessSmallP) { - this->runTest(SamplingKernelTestParam().setBatchSize(6).setVocabSize(4).setTopP(0.2f)); + this->runTest(SamplingKernelTestParam().setBatchSize(6).setVocabSize(4).setTopK(0).setTopP(0.2f)); }; TYPED_TEST(TopPSamplingKernelTest, CorrectnessLargeP) { - this->runTest(SamplingKernelTestParam().setBatchSize(6).setVocabSize(4).setTopP(0.9f)); + this->runTest(SamplingKernelTestParam().setBatchSize(6).setVocabSize(4).setTopK(0).setTopP(0.9f)); }; TYPED_TEST(TopPSamplingKernelTest, CorrectnessAncestral) { - this->runTest(SamplingKernelTestParam().setBatchSize(6).setVocabSize(4).setTopP(1.0f)); + this->runTest(SamplingKernelTestParam().setBatchSize(6).setVocabSize(4).setTopK(0).setTopP(1.0f)); }; TYPED_TEST(TopPSamplingKernelTest, CorrectnessLargeVocabSmallP) { - this->runTest(SamplingKernelTestParam().setBatchSize(32).setVocabSize(51200).setTopP(0.2f)); + this->runTest(SamplingKernelTestParam().setBatchSize(32).setVocabSize(51200).setTopK(0).setTopP(0.2f)); }; TYPED_TEST(TopPSamplingKernelTest, CorrectnessLargeVocabLargeP) { - this->runTest(SamplingKernelTestParam().setBatchSize(32).setVocabSize(51200).setTopP(0.9f)); + this->runTest(SamplingKernelTestParam().setBatchSize(32).setVocabSize(51200).setTopK(0).setTopP(0.9f)); +}; + +TYPED_TEST(TopPSamplingKernelTest, CorrectnessReturnAllSelectedTokens) +{ + this->runTest(SamplingKernelTestParam() + .setBatchSize(16) + .setVocabSize(50) + .setTopK(0) + .setTopP(0.8f) + .setReturnAllSelectedTokens()); }; } // end of namespace diff --git a/cpp/tests/layers/baseSamplingLayerTest.cpp b/cpp/tests/layers/baseSamplingLayerTest.cpp index 7b286514d..3e0dd2b3b 100644 --- a/cpp/tests/layers/baseSamplingLayerTest.cpp +++ b/cpp/tests/layers/baseSamplingLayerTest.cpp @@ -48,21 +48,23 @@ void BaseSamplingLayerTest::setup(uint64_t seed, TestSamplingParams const& pa computeProb(mTestLogitsInit.data(), mTestLogitsInit.data(), 4, mVocabSize); } - mSeqLengthsDevice = mBufferManager->gpu(ITensor::makeShape({mMaxBatchSize}), nvinfer1::DataType::kINT32); - mContextLengthDevice = mBufferManager->gpu(ITensor::makeShape({mMaxBatchSize}), nvinfer1::DataType::kINT32); + mSeqLengthsDevice = mBufferManager->gpu(ITensor::makeShape({maxBatchSize()}), nvinfer1::DataType::kINT32); + mContextLengthDevice = mBufferManager->gpu(ITensor::makeShape({maxBatchSize()}), nvinfer1::DataType::kINT32); mFinishedDevice = mBufferManager->gpu( - ITensor::makeShape({mMaxBatchSize}), TRTDataType::value); - mOutputIdsDevice = mBufferManager->gpu(ITensor::makeShape({mMaxBatchSize, mMaxSeqLen}), nvinfer1::DataType::kINT32); - mEndIdsDevice = mBufferManager->gpu(ITensor::makeShape({mMaxBatchSize}), nvinfer1::DataType::kINT32); - mIdsPtrHost = mBufferManager->pinned(ITensor::makeShape({mMaxBatchSize}), ptrType); + ITensor::makeShape({maxBatchSize()}), TRTDataType::value); + mOutputIdsDevice + = mBufferManager->gpu(ITensor::makeShape({maxBatchSize(), mMaxSeqLen}), nvinfer1::DataType::kINT32); + mEndIdsDevice = 
mBufferManager->gpu(ITensor::makeShape({maxBatchSize()}), nvinfer1::DataType::kINT32); + mIdsPtrHost = mBufferManager->pinned(ITensor::makeShape({maxBatchSize()}), ptrType); - mCumLogProbsDevice = mBufferManager->gpu(ITensor::makeShape({mMaxBatchSize}), nvinfer1::DataType::kFLOAT); + mCumLogProbsDevice = mBufferManager->gpu(ITensor::makeShape({maxBatchSize()}), nvinfer1::DataType::kFLOAT); mOutputLogProbsDevice - = mBufferManager->gpu(ITensor::makeShape({mMaxBatchSize, mMaxSeqLen}), nvinfer1::DataType::kFLOAT); + = mBufferManager->gpu(ITensor::makeShape({maxBatchSize(), mMaxSeqLen}), nvinfer1::DataType::kFLOAT); - mBatchSlots = mBufferManager->pinned(ITensor::makeShape({mBatchSize}), nvinfer1::DataType::kINT32); + mBatchSlots + = mBufferManager->pinned(ITensor::makeShape({mBatchSize + mBatchSizeBadPad}), nvinfer1::DataType::kINT32); mCurandStatesDevice - = mBufferManager->gpu(ITensor::makeShape({mMaxBatchSize, sizeof(curandState_t)}), nvinfer1::DataType::kINT8); + = mBufferManager->gpu(ITensor::makeShape({maxBatchSize(), sizeof(curandState_t)}), nvinfer1::DataType::kINT8); auto const workspaceSize = mSamplingLayer->getWorkspaceSize(); @@ -79,10 +81,14 @@ void BaseSamplingLayerTest::setup(uint64_t seed, TestSamplingParams const& pa { batchSlotsPtr[bi] = 2 * bi; } + for (SizeType32 bi = 0; bi < mBatchSizeBadPad; ++bi) + { + batchSlotsPtr[mBatchSize + bi] = 0xbaadf00d; + } auto idsPtrHostPtr = BufferRange(*mIdsPtrHost); auto outputIdsDevicePtr = bufferCast(*mOutputIdsDevice); - for (SizeType32 bi = 0; bi < mMaxBatchSize; bi++) + for (SizeType32 bi = 0; bi < maxBatchSize(); bi++) { idsPtrHostPtr[bi] = outputIdsDevicePtr + bi * mMaxSeqLen; } @@ -158,13 +164,13 @@ void BaseSamplingLayerTest::batchCopy(int32_t step) template bool BaseSamplingLayerTest::checkResult(int32_t* outputIds, std::vector>& expectedIds) { - assert(expectedIds.size() == mMaxSeqLen * mBatchBeam); + assert(expectedIds.size() == mMaxSeqLen * batchBeam()); int failures = 0; auto* const batchSlotsPtr = bufferCast(*mBatchSlots); - for (int32_t i = 0; i < mMaxSeqLen * mBatchBeam; ++i) + for (int32_t i = 0; i < mMaxSeqLen * batchBeam(); ++i) { - int32_t s = i / mBatchBeam; - int32_t b = i % mBatchBeam; + int32_t s = i / batchBeam(); + int32_t b = i % batchBeam(); auto const batchSlot = batchSlotsPtr[b]; std::set expts = expectedIds.at(i); auto const outputId = outputIds[batchSlot * mMaxSeqLen + s]; @@ -186,7 +192,7 @@ bool BaseSamplingLayerTest::checkResult(int32_t* outputIds, std::vector void BaseSamplingLayerTest::runTest( std::vector> expectedOutputIds, TestSamplingParams const& params, int32_t endId) { + mBatchSize = params.batchSize; initLayer(params); auto const decoderDomain - = tensorrt_llm::layers::DecoderDomain(mMaxBatchSize, mBeamWidth, mVocabSize, mVocabSizePadded); + = tensorrt_llm::layers::DecoderDomain(maxBatchSize(), mBeamWidth, mVocabSize, mVocabSizePadded); mDecodingWorkspace = std::make_unique( mBufferManager, decoderDomain, TRTDataType::value, mSamplingLayer->getWorkspaceSize()); mEndId = endId; diff --git a/cpp/tests/layers/baseSamplingLayerTest.h b/cpp/tests/layers/baseSamplingLayerTest.h index 4ebb75447..f70dc3561 100644 --- a/cpp/tests/layers/baseSamplingLayerTest.h +++ b/cpp/tests/layers/baseSamplingLayerTest.h @@ -85,6 +85,7 @@ struct TestSamplingParams std::vector decay; std::vector minTopP; std::vector topPResetIds; + int32_t batchSize = 6; bool useBias = false; }; @@ -96,11 +97,10 @@ class BaseSamplingLayerTest : public testing::Test using BufferPtr = tensorrt_llm::runtime::IBuffer::SharedPtr; int32_t 
seed = 0; - static uint64_t const mMaxSeed = 32; - int32_t const mBatchSize = 6; - int32_t const mMaxBatchSize = 2 * mBatchSize; + int32_t mBatchSize = -1; // setup by runTest + static int32_t constexpr mBatchSizeBadPad = 512; + static uint64_t constexpr mMaxSeed = 32; int32_t const mBeamWidth = 1; - int32_t const mBatchBeam = mBatchSize * mBeamWidth; int32_t const mVocabSize = 8; int32_t const mVocabSizePadded = mVocabSize; @@ -135,6 +135,16 @@ class BaseSamplingLayerTest : public testing::Test std::vector mTestLogitsInit; + int32_t maxBatchSize() const + { + return 2 * mBatchSize; + } + + int32_t batchBeam() const + { + return mBatchSize * mBeamWidth; + } + void setup(uint64_t seed, TestSamplingParams const& params); virtual void initLayer(TestSamplingParams const& params) = 0; diff --git a/cpp/tests/layers/lookaheadAlgorithmTest.cpp b/cpp/tests/layers/lookaheadAlgorithmTest.cpp index fc70b2bff..68eb8c193 100644 --- a/cpp/tests/layers/lookaheadAlgorithmTest.cpp +++ b/cpp/tests/layers/lookaheadAlgorithmTest.cpp @@ -21,6 +21,7 @@ #include "tensorrt_llm/layers/lookaheadAlgorithm.h" #include "tensorrt_llm/layers/lookaheadDecodingUtils.h" #include "tensorrt_llm/runtime/common.h" +#include "tensorrt_llm/runtime/iTensor.h" #include "tensorrt_llm/runtime/lookaheadModule.h" #include "tests/layers/randomLlm.h" @@ -84,9 +85,10 @@ TEST_P(LookaheadAlgorithmTest, predict) std::tie(std::ignore, std::ignore, maxDraftLenRuntime, std::ignore) = executor::LookaheadDecodingConfig(w, n, g).calculateSpeculativeResource(); auto shape = ITensor::makeShape({maxTokensPerStep}); + auto shape2d = ITensor::makeShape({maxTokensPerStep, maxTokensPerStep}); auto shapeSingle = ITensor::makeShape({1}); TensorPtr posidMax = BufferManager::cpu(shape, nvinfer1::DataType::kINT32); - TensorPtr smaskMax = BufferManager::cpu(shape, nvinfer1::DataType::kBOOL); + TensorPtr attentionMaskMax = BufferManager::cpu(shape2d, nvinfer1::DataType::kBOOL); TensorPtr inputLengthPtr = BufferManager::cpu(shapeSingle, nvinfer1::DataType::kINT32); auto& inputLength(*BufferRange(*inputLengthPtr).begin()); @@ -123,26 +125,34 @@ TEST_P(LookaheadAlgorithmTest, predict) { TLLM_LOG_DEBUG("\noracle[%d] = '%c'", sequenceLength - 1, static_cast(sequenceRange[sequenceLength - 1])); bufferCast(*posidMax)[0] = sequenceLength - 1; - bufferCast(*smaskMax)[0] = true; + BufferLocation amaskLocation(*attentionMaskMax); + for (auto& item : amaskLocation) + { + item = false; + } + for (SizeType32 i = 0; i < maxTokensPerStep; i++) + { + amaskLocation.at(i, 0) = true; + } + algo.prepare( // ITensor::slice(sequence, sequenceLength, maxDraftLenRuntime), // ITensor::slice(posidMax, 1, maxDraftLenRuntime), // - ITensor::slice(smaskMax, 1, maxDraftLenRuntime), // inputLengthPtr, // + attentionMaskMax, 1, // sequenceLengthPtr, // ITensor::slice(sequence, sequenceLength - 1, 1)); TensorPtr input = ITensor::slice(sequence, sequenceLength - 1, inputLength + 1); TensorPtr posid = ITensor::slice(posidMax, 0, inputLength + 1); - TensorPtr smask = ITensor::slice(smaskMax, 0, inputLength + 1); + TensorPtr amask = ITensor::slice(attentionMaskMax, 0, inputLength + 1); PRINT_TOKENS(input); PRINT_VALUES(posid); - PRINT_VALUES(smask); + PRINT_VALUES(amask); TensorPtr output = ITensor::slice(outputMax, 0, inputLength + 1); - llm.foretell(output, input, posid); - llm.sampleByMask(output, smask); + llm.foretell(output, input, posid, amask); PRINT_TOKENS(output); // algo.update(acceptedMax, acceptedOffsetsMax, acceptedLengthPtr, output, endIdPtr); @@ -207,4 +217,46 @@ 
INSTANTIATE_TEST_CASE_P(CombineLookaheadAlgorithmTestSmall_222, LookaheadAlgorit testing::Combine(testing::Values(std::make_tuple(2, 2)), testing::Values(std::make_tuple(2, 2)), testing::Values(std::make_tuple(2, 2)))); +TEST(LookaheadAlgorithmTest, treeEncodeTest) +{ + auto testWithData = [](TensorPtr inputTokens, TensorPtr inputPosIds, SizeType32 lastPosId, SizeType32 gold_len) + { + auto shape = inputTokens->getShape(); + auto shape2d = ITensor::makeShape({shape.d[0], shape.d[0]}); + + TensorPtr inputMasks = BufferManager::cpu(shape2d, nvinfer1::DataType::kBOOL); + LookaheadAlgorithm::posIdsToMask(inputMasks, inputPosIds); + + TensorPtr outputTokens = BufferManager::cpu(shape, nvinfer1::DataType::kINT32); + TensorPtr outputPosIds = BufferManager::cpu(shape, nvinfer1::DataType::kINT32); + TensorPtr encodeMap = BufferManager::cpu(shape, nvinfer1::DataType::kINT32); + TensorPtr outputMasks = BufferManager::cpu(shape2d, nvinfer1::DataType::kBOOL); + + // auto len = LookaheadAlgorithm::treeEncode(outputTokens, outputPosIds, outputMasks, inputTokens, inputPosIds, + // inputMasks, '$', 9); + auto len = LookaheadAlgorithm::treeEncode(inputTokens, inputPosIds, inputMasks, encodeMap); + TLLM_LOG_DEBUG("len = %d", len); + + EXPECT_EQ(len, gold_len); + }; + + testWithData( // + initTensor(std::string("01234512345")), // + initTensor({10, 11, 12, 13, 14, 15, 11, 12, 13, 14, 15}), // + 9, 6); + + testWithData( // + initTensor(std::string("01234512abc")), // + initTensor({10, 11, 12, 13, 14, 15, 11, 12, 13, 14, 15}), // + 9, 9); + + testWithData( // + initTensor(std::string("01234512abc2aBCD")), // + initTensor({10, 11, 12, 13, 14, 15, 11, 12, 13, 14, 15, 12, 13, 14, 15, 16}), // + 9, 12); + + testWithData(initTensor(std::string("wmplhi folxamp")), + initTensor({21, 22, 23, 24, 25, 26, 27, 21, 22, 23, 24, 21, 22, 23, 24}), 20, 15); +} + } // namespace tensorrt_llm::tests::layers diff --git a/cpp/tests/layers/lookaheadDecodingLayerTest.cpp b/cpp/tests/layers/lookaheadDecodingLayerTest.cpp index e3460a52b..71c62bd1a 100644 --- a/cpp/tests/layers/lookaheadDecodingLayerTest.cpp +++ b/cpp/tests/layers/lookaheadDecodingLayerTest.cpp @@ -230,11 +230,11 @@ class LookaheadDecodingLayerTest : public testing::Test TensorPtr mNumNewTokensCumSum; TensorPtr mPathsOffsets; TensorPtr mDraftLengths; + TensorPtr mPrevDraftLengths; TensorPtr mDraftTokens; TensorPtr mPackedMasks; TensorPtr mPackedMasksBool; TensorPtr mGenerationLengths; - TensorPtr mGenerationLengthsMax; TensorPtr mPositionOffsets; TensorPtr mPositionIds; TensorPtr mAttentionPackedMask; @@ -371,6 +371,7 @@ void LookaheadDecodingLayerTest::allocateBuffers() ITensor::makeShape({mMaxTokensPerStep, maxBatchSize, 1}), nvinfer1::DataType::kINT32); mNumNewTokens = BufferManager::pinnedPool(maxBatchShape1D, nvinfer1::DataType::kINT32); mDraftLengths = BufferManager::pinnedPool(maxBatchShape1D, nvinfer1::DataType::kINT32); + mPrevDraftLengths = BufferManager::pinnedPool(maxBatchShape1D, nvinfer1::DataType::kINT32); mDraftTokens = BufferManager::pinnedPool(ITensor::makeShape({maxBatchSize, maxDraftLen}), nvinfer1::DataType::kINT32); auto packedMaskShape = ITensor::makeShape( @@ -382,7 +383,6 @@ void LookaheadDecodingLayerTest::allocateBuffers() mPathsOffsets = BufferManager::pinnedPool( ITensor::makeShape({maxBatchSize, maxAcceptedDraftLen}), nvinfer1::DataType::kINT32); mGenerationLengths = BufferManager::pinnedPool(maxBatchShape1D, nvinfer1::DataType::kINT32); - mGenerationLengthsMax = BufferManager::pinnedPool(maxBatchShape1D, nvinfer1::DataType::kINT32); 
mPositionOffsets = BufferManager::pinnedPool(ITensor::makeShape({maxBatchSize, mMaxTokensPerStep}), nvinfer1::DataType::kINT32); mPositionIds @@ -462,10 +462,8 @@ void LookaheadDecodingLayerTest::newRequests(std::vector requestIds) setupParams->prompt.emplace_back(mPrompt[gbi]); setupParams->algoConfigs.emplace_back(mTestParam.w, mTestParam.n, mTestParam.g); PRINT_TOKENS(setupParams->prompt[bi]); - setupParams->generationLengths = mGenerationLengthsMax; - setupParams->actualGenerationLengths = mGenerationLengths; + setupParams->generationLengths = mGenerationLengths; setupParams->positionOffsets = mPositionOffsets; - // setupParams->outputs.positionIds = mPositionIds; setupParams->attentionPackedMasks = mPackedMasks; } std::vector seed(requestIds.begin(), requestIds.end()); @@ -669,14 +667,14 @@ void LookaheadDecodingLayerTest::decodeForward() PRINT_VALUES(mSequenceLengths); outputParams->sequenceLength = mSequenceLengths; outputParams->nextDraftLengths = mDraftLengths; + outputParams->prevDraftLengths = mPrevDraftLengths; outputParams->nextDraftTokens = mDraftTokens; outputParams->packedMasks = mPackedMasks; outputParams->numNewTokens = mNumNewTokens; outputParams->newTokens = mNewTokens; outputParams->numNewTokensCumSum = mNumNewTokensCumSum; outputParams->pathsOffsets = mPathsOffsets; - outputParams->generationLengths = mGenerationLengthsMax; - outputParams->actualGenerationLengths = mGenerationLengths; + outputParams->generationLengths = mGenerationLengths; outputParams->positionOffsets = mPositionOffsets; outputParams->positionIds = mPositionIds; outputParams->packedMasks = mPackedMasks; @@ -722,17 +720,17 @@ void LookaheadDecodingLayerTest::verifyDecode() BufferRange cumSumRange(*mNumNewTokensCumSum); BufferRange pathOffsetsRange(*mPathsOffsets); PRINT_VALUES(mNumNewTokensCumSum); - for (SizeType32 gbi = 0; gbi < mTestParam.maxBatchSize; gbi++) + for (SizeType32 bi = 0; bi < batchSize; bi++) { - SizeType32 pathOffsetBegin = cumSumRange[gbi]; - SizeType32 pathOffsetEnd = cumSumRange[gbi + 1]; + auto gbi = BufferRange(*mBatchSlots)[bi]; + SizeType32 pathOffsetBegin = cumSumRange[bi]; + SizeType32 pathOffsetEnd = cumSumRange[bi + 1]; TensorPtr golden = ITensor::at(mGoldenSampledTokens, {gbi}); auto sequenceLength = BufferLocation(*mSequenceLengths).at(gbi); auto numNewTokens = BufferLocation(*mNumNewTokens).at(gbi); TensorPtr newTokens = ITensor::slice(mOutputIds, {gbi, 0, sequenceLength - numNewTokens}, numNewTokens); BufferRange goldenRange(*ITensor::at(mGoldenSampledTokens, {gbi})); - BufferRange newTokensRange( - *ITensor::slice(mOutputIds, {gbi, 0, sequenceLength - numNewTokens}, numNewTokens)); + BufferRange newTokensRange(*newTokens); SizeType32 ni = 1; for (SizeType32 poi = pathOffsetBegin; poi < pathOffsetEnd; poi++) diff --git a/cpp/tests/layers/lookaheadRandomLlmTest.cpp b/cpp/tests/layers/lookaheadRandomLlmTest.cpp index e4570b1ee..f8e8ff027 100644 --- a/cpp/tests/layers/lookaheadRandomLlmTest.cpp +++ b/cpp/tests/layers/lookaheadRandomLlmTest.cpp @@ -207,7 +207,7 @@ TEST(LookaheadRandomllm, gpuSampling) kernelParams.vocabSizePadded = vocabSize; kernelParams.normalizeLogProbs = false; kernelParams.logitsHasProbs = false; - kernelParams.returnAllTopK = false; + kernelParams.returnAllSelectedTokens = false; PRINT_TOKENS(mEndIds); PRINT_VALUES(mTokensPerStep); diff --git a/cpp/tests/layers/randomLlm.cpp b/cpp/tests/layers/randomLlm.cpp index 2116186a6..9746286d9 100644 --- a/cpp/tests/layers/randomLlm.cpp +++ b/cpp/tests/layers/randomLlm.cpp @@ -276,8 +276,8 @@ void 
LookaheadRandomLlm::foretell(TensorPtr const& output, TensorConstPtr const& { right &= maskLocation.at(i, j) ? oracleRange[positionRange[j]] == inputRange[j] : true; } - if (i < verifyStart) - { // lookahead might be right + if (i < verifyStart && false) + { // lookahead might be right. Since we verify lookahead branch, then must be right. outputRange[i] = ((right || rand() % 5) && legal) ? oracleRange[positionRange[i] + 1] : invalid; } else diff --git a/cpp/tests/layers/samplingLayerTest.cpp b/cpp/tests/layers/samplingLayerTest.cpp index 8efd52479..a6641378b 100644 --- a/cpp/tests/layers/samplingLayerTest.cpp +++ b/cpp/tests/layers/samplingLayerTest.cpp @@ -51,7 +51,7 @@ class SamplingLayerTest : public BaseSamplingLayerTest } auto const decodingDomain - = tensorrt_llm::layers::DecoderDomain(this->mMaxBatchSize, 1, this->mVocabSize, this->mVocabSizePadded); + = tensorrt_llm::layers::DecoderDomain(this->maxBatchSize(), 1, this->mVocabSize, this->mVocabSizePadded); this->mSamplingLayer = std::make_shared>( decodingMode, decodingDomain, this->mBufferManager); } diff --git a/cpp/tests/layers/topKSamplingLayerTest.cpp b/cpp/tests/layers/topKSamplingLayerTest.cpp index 1da62cc23..f6ddde52a 100644 --- a/cpp/tests/layers/topKSamplingLayerTest.cpp +++ b/cpp/tests/layers/topKSamplingLayerTest.cpp @@ -34,7 +34,7 @@ class TopKSamplingLayerTest : public BaseSamplingLayerTest void initLayer(TestSamplingParams const& params) override { auto const decodingDomain - = tensorrt_llm::layers::DecoderDomain(this->mMaxBatchSize, 1, this->mVocabSize, this->mVocabSizePadded); + = tensorrt_llm::layers::DecoderDomain(this->maxBatchSize(), 1, this->mVocabSize, this->mVocabSizePadded); this->mSamplingLayer = std::make_shared>(decodingDomain, this->mBufferManager); } diff --git a/cpp/tests/layers/topPSamplingLayerTest.cpp b/cpp/tests/layers/topPSamplingLayerTest.cpp index c1af229e1..8f65c312e 100644 --- a/cpp/tests/layers/topPSamplingLayerTest.cpp +++ b/cpp/tests/layers/topPSamplingLayerTest.cpp @@ -40,12 +40,13 @@ class TopPSamplingLayerTest : public BaseSamplingLayerTest void initLayer(TestSamplingParams const& params) override { auto const decodingDomain - = tensorrt_llm::layers::DecoderDomain(this->mMaxBatchSize, 1, this->mVocabSize, this->mVocabSizePadded); + = tensorrt_llm::layers::DecoderDomain(this->maxBatchSize(), 1, this->mVocabSize, this->mVocabSizePadded); this->mSamplingLayer = std::make_shared>( decodingDomain, this->mBufferManager, &mDeviceProp); } - struct cudaDeviceProp mDeviceProp; +protected: + cudaDeviceProp mDeviceProp{}; }; TYPED_TEST_SUITE(TopPSamplingLayerTest, FloatAndHalfTypes); @@ -165,4 +166,28 @@ TYPED_TEST(TopPSamplingLayerTest, TopPDecay) this->runTest(expectedOutputIds, params); } +TYPED_TEST(TopPSamplingLayerTest, LargeBatch) +{ + SizeType32 topK = 0; + float topP = 0.3f; + TestSamplingParams params; + params.topKs = {topK}; + params.topPs = {topP}; + + // Force to use more than 1 block + params.batchSize = this->mDeviceProp.maxThreadsPerBlock + 1; + std::vector> expectedOutputId{{4}, {0}, {2}, {0}}; + std::vector> expectedOutputIds; + expectedOutputIds.reserve(expectedOutputId.size() * params.batchSize); + + for (auto const& id : expectedOutputId) + { + for (int32_t i = 0; i < params.batchSize; ++i) + { + expectedOutputIds.emplace_back(id); + } + } + this->runTest(expectedOutputIds, params); +} + } // namespace diff --git a/cpp/tests/resources/data/test_model_lora_config.json b/cpp/tests/resources/data/test_model_lora_config.json index 73a598d01..ea6442186 100644 --- 
a/cpp/tests/resources/data/test_model_lora_config.json +++ b/cpp/tests/resources/data/test_model_lora_config.json @@ -63,7 +63,6 @@ "gather_context_logits": false, "gather_generation_logits": false, "strongly_typed": true, - "builder_opt": null, "profiling_verbosity": "layer_names_only", "enable_debug_output": false, "max_draft_len": 0, diff --git a/cpp/tests/resources/scripts/build_chatglm_engines.py b/cpp/tests/resources/scripts/build_chatglm_engines.py index e845a0365..530db1d8f 100644 --- a/cpp/tests/resources/scripts/build_chatglm_engines.py +++ b/cpp/tests/resources/scripts/build_chatglm_engines.py @@ -59,7 +59,6 @@ def build_engine(ckpt_dir: str, "--max_seq_len=384", "--gpt_attention_plugin=float16", "--gemm_plugin=float16", - "--builder_opt=0", ] if is_ifb: build_cmd.extend([ diff --git a/cpp/tests/resources/scripts/build_gpt_engines.py b/cpp/tests/resources/scripts/build_gpt_engines.py index 104879be4..7cbc8c382 100755 --- a/cpp/tests/resources/scripts/build_gpt_engines.py +++ b/cpp/tests/resources/scripts/build_gpt_engines.py @@ -63,7 +63,6 @@ def build_engine( f'--max_input_len={max_input_len}', f'--max_seq_len={max_seq_len}', '--max_beam_width=2', - '--builder_opt=0', '--kv_cache_type=continuous', ] legacy_args = [ diff --git a/cpp/tests/resources/scripts/build_llama_engines.py b/cpp/tests/resources/scripts/build_llama_engines.py index 425b636f4..12f56b364 100644 --- a/cpp/tests/resources/scripts/build_llama_engines.py +++ b/cpp/tests/resources/scripts/build_llama_engines.py @@ -90,7 +90,7 @@ def build_engines(model_cache: str, only_multi_gpu: bool): tp_pp_sizes = [(1, 1)] if only_multi_gpu: - tp_pp_sizes = [(1, 4), (4, 1), (1, 2), (2, 2)] + tp_pp_sizes = [(1, 4), (4, 1), (1, 2), (2, 2), (2, 1)] for tp_size, pp_size in tp_pp_sizes: tp_pp_dir = f"tp{tp_size}-pp{pp_size}-gpu" print(f"\nBuilding fp16 tp{tp_size} pp{pp_size} engine") diff --git a/cpp/tests/resources/scripts/generate_expected_gpt_output.py b/cpp/tests/resources/scripts/generate_expected_gpt_output.py index 4037a236f..69607af7c 100755 --- a/cpp/tests/resources/scripts/generate_expected_gpt_output.py +++ b/cpp/tests/resources/scripts/generate_expected_gpt_output.py @@ -151,7 +151,7 @@ def generate_outputs(num_beams): output_logits=True, output_log_probs=True, output_cum_log_probs=True) - # GptExecutorTest.GenerationLogitsEarlyStop requires to use context_fmha_fp32_acc flag in runtime + # GptExecutorTest.GenerationLogitsEarlyStop and several tests require to use context_fmha_fp32_acc flag in runtime model_spec_obj.enable_context_fmha_fp32_acc() generate_output(engine=model_spec_obj.get_model_path(), num_beams=num_beams, @@ -165,6 +165,14 @@ def generate_outputs(num_beams): model_spec_obj.use_gpt_plugin() model_spec_obj.set_kv_cache_type(_tb.KVCacheType.PAGED) model_spec_obj.use_packed_input() + generate_output(engine=model_spec_obj.get_model_path(), + num_beams=num_beams, + input_name=input_name, + model_spec_obj=model_spec_obj, + output_logits=False, + output_log_probs=True, + output_cum_log_probs=True) + model_spec_obj.enable_context_fmha_fp32_acc() generate_output(engine=model_spec_obj.get_model_path(), num_beams=num_beams, input_name=input_name, diff --git a/cpp/tests/resources/scripts/generate_expected_llama_output.py b/cpp/tests/resources/scripts/generate_expected_llama_output.py index 08d904201..cff87fbe0 100644 --- a/cpp/tests/resources/scripts/generate_expected_llama_output.py +++ b/cpp/tests/resources/scripts/generate_expected_llama_output.py @@ -72,7 +72,7 @@ def generate_outputs(num_beams, 
only_multi_gpu=False): elif COMM_WORLD.size == 4: tp_pp_sizes = [(4, 1), (2, 2), (1, 4)] elif COMM_WORLD.size == 2: - tp_pp_sizes = [(1, 2)] + tp_pp_sizes = [(1, 2), (2, 1)] else: raise RuntimeError( f"The world size of MPI {COMM_WORLD.size} is not equal to 1, 2, or 4." diff --git a/cpp/tests/resources/scripts/test_cpp.py b/cpp/tests/resources/scripts/test_cpp.py index a54ff4c30..ba0fe6d0d 100755 --- a/cpp/tests/resources/scripts/test_cpp.py +++ b/cpp/tests/resources/scripts/test_cpp.py @@ -16,6 +16,7 @@ import argparse as _arg import copy +import glob import logging as _log import os as _os import pathlib as _pl @@ -68,6 +69,98 @@ def run_command(command: _tp.Sequence[str], _sp.check_call(command, cwd=cwd, shell=shell, env=env, timeout=timeout) +def merge_report(parallel, retry, output): + import xml.etree.ElementTree as ElementTree + base = ElementTree.parse(parallel) + extra = ElementTree.parse(retry) + + base_suite = base.getroot() + extra_suite = extra.getroot() + + base_suite.attrib['failures'] = extra_suite.attrib['failures'] + base_suite.attrib['time'] = str( + int(base_suite.attrib['time']) + int(extra_suite.attrib['time'])) + + case_names = {element.attrib['name'] for element in extra_suite} + base_suite[:] = [ + element + for element in base_suite if element.attrib['name'] not in case_names + ] + list(extra_suite) + + base.write(output, encoding="UTF-8", xml_declaration=True) + + +def add_parallel_info(report, parallel): + import xml.etree.ElementTree as ElementTree + try: + document = ElementTree.parse(report) + except FileNotFoundError: + return + root = document.getroot() + root.attrib['parallel'] = str(parallel) + document.write(report, encoding="UTF-8", xml_declaration=True) + + +def parallel_run_ctest( + command: _tp.Sequence[str], + cwd: _pl.Path, + *, + shell=False, + env=None, + timeout=None, + parallel=2, +) -> None: + if parallel == 1: + return run_command(command, + cwd=cwd, + shell=shell, + env=env, + timeout=timeout) + + env = {} if env is None else env + env['CTEST_PARALLEL_LEVEL'] = str(parallel) + + def get_report(): + reports = glob.glob("results-*.xml", root_dir=cwd) + if not reports: + return '' + + return reports[0] + + report = None + try: + run_command(command, cwd=cwd, shell=shell, env=env, timeout=timeout) + except _sp.CalledProcessError: + report = get_report() + if report == '': + # Some catastrophic fail happened that there's no report generated + raise + + parallel_report = 'parallel-' + report + _os.rename(cwd / report, cwd / parallel_report) + + try: + _log.info("Parallel test failed, retry serial on failed tests") + del env['CTEST_PARALLEL_LEVEL'] + command = [*command, "--rerun-failed"] + run_command(command, cwd=cwd, shell=shell, env=env, timeout=timeout) + finally: + if not _os.path.exists(cwd / report): + # Some catastrophic fail happened that there's no report generated + # Use parallel result as final report + _os.rename(cwd / parallel_report, cwd / report) + else: + retry_report = 'retry-' + report + _os.rename(cwd / report, cwd / retry_report) + merge_report(cwd / parallel_report, cwd / retry_report, + cwd / report) + finally: + if report is None: + report = get_report() + if report: + add_parallel_info(cwd / report, parallel) + + def run_tests(build_dir: _pl.Path, model_cache: _tp.Optional[str] = None, skip_unit_tests=False, @@ -376,6 +469,12 @@ def prepare_multi_gpu_model_tests(python_exe: str, model_cache_arg=model_cache_arg, only_multi_gpu_arg=only_multi_gpu_arg) + prepare_model_tests(model_name="llama", + python_exe=python_exe, + 
root_dir=root_dir, + resources_dir=resources_dir, + model_cache_arg=model_cache_arg) + prepare_model_tests(model_name="t5", python_exe=python_exe, root_dir=root_dir, @@ -483,7 +582,7 @@ def run_unit_tests(build_dir: _pl.Path, timeout=1800): excluded_tests.append("Encoder") excluded_tests.append("EncDec") ctest.extend(["-E", "|".join(excluded_tests)]) - run_command(ctest, cwd=build_dir, env=cpp_env, timeout=timeout) + parallel_run_ctest(ctest, cwd=build_dir, env=cpp_env, timeout=timeout) def run_single_gpu_tests(build_dir: _pl.Path, @@ -541,7 +640,18 @@ def run_single_gpu_tests(build_dir: _pl.Path, ctest.extend(["-R", "|".join(included_tests)]) if excluded_tests: ctest.extend(["-E", "|".join(excluded_tests)]) - run_command(ctest, cwd=build_dir, env=cpp_env, timeout=timeout) + parallel_run_ctest(ctest, cwd=build_dir, env=cpp_env, timeout=timeout) + if run_gpt: + xml_output_file = build_dir / "results-single-gpu-disagg-executor_gpt.xml" + trt_model_test = produce_mpirun_command( + global_commands=["mpirun", "--allow-run-as-root"], + nranks=2, + local_commands=[ + "tests/executor/executorTest", + "--gtest_filter=*GptSingleDeviceDisaggSymmetricExecutorTest*" + ], + leader_commands=[f"--gtest_output=xml:{xml_output_file}"]) + run_command(trt_model_test, cwd=build_dir, env=cpp_env, timeout=timeout) def produce_mpirun_command(*, global_commands, nranks, local_commands, @@ -574,6 +684,7 @@ def run_multi_gpu_tests(build_dir: _pl.Path, timeout=1500): "-n", "2", "--allow-run-as-root", + "--oversubscribe", "batch_manager/cacheTransceiverTest", ] run_command(cache_trans_test, cwd=tests_dir, env=cpp_env, timeout=300) @@ -654,25 +765,78 @@ def run_multi_gpu_tests(build_dir: _pl.Path, timeout=1500): run_command(trt_model_test, cwd=tests_dir, env=new_env, timeout=1500) new_env = copy.copy(cpp_env) - xml_output_file = build_dir / "results-multi-gpu-dist-executor_gpt.xml" + xml_output_file = build_dir / "results-multi-gpu-disagg-executor-2-process.xml" trt_model_test = produce_mpirun_command( global_commands=["mpirun", "--allow-run-as-root"], nranks=2, local_commands=[ "executor/executorTest", - "--gtest_filter=DistExecutorTest.GPTTokenComparison" + "--gtest_filter=*DisaggSymmetricExecutorTest*" ], leader_commands=[f"--gtest_output=xml:{xml_output_file}"]) run_command(trt_model_test, cwd=tests_dir, env=new_env, timeout=1500) new_env = copy.copy(cpp_env) - xml_output_file = build_dir / "results-multi-gpu-dist-executor_chatglm.xml" + new_env["RUN_LLAMA_MULTI_GPU"] = "true" + xml_output_file = build_dir / "results-multi-gpu-disagg-executor-4-process.xml" trt_model_test = produce_mpirun_command( global_commands=["mpirun", "--allow-run-as-root"], - nranks=2, + nranks=4, + local_commands=[ + "executor/executorTest", + "--gtest_filter=*DisaggSymmetricExecutorTest*" + ], + leader_commands=[f"--gtest_output=xml:{xml_output_file}"]) + run_command(trt_model_test, cwd=tests_dir, env=new_env, timeout=1500) + + new_env = copy.copy(cpp_env) + new_env["RUN_LLAMA_MULTI_GPU"] = "true" + xml_output_file = build_dir / "results-multi-gpu-disagg-executor-8-process.xml" + trt_model_test = produce_mpirun_command( + global_commands=["mpirun", "--allow-run-as-root"], + nranks=8, + local_commands=[ + "executor/executorTest", + "--gtest_filter=*LlamaTP2PP2DisaggSymmetricExecutorTest*" + ], + leader_commands=[f"--gtest_output=xml:{xml_output_file}"]) + run_command(trt_model_test, cwd=tests_dir, env=new_env, timeout=1500) + + new_env = copy.copy(cpp_env) + new_env["RUN_LLAMA_MULTI_GPU"] = "true" + xml_output_file = build_dir / 
"results-multi-gpu-disagg-asymmetric-executor-4-process.xml" + trt_model_test = produce_mpirun_command( + global_commands=["mpirun", "--allow-run-as-root"], + nranks=4, + local_commands=[ + "executor/executorTest", + "--gtest_filter=*DisaggAsymmetricExecutorTest*" + ], + leader_commands=[f"--gtest_output=xml:{xml_output_file}"]) + run_command(trt_model_test, cwd=tests_dir, env=new_env, timeout=1500) + + new_env = copy.copy(cpp_env) + new_env["RUN_LLAMA_MULTI_GPU"] = "true" + xml_output_file = build_dir / "results-multi-gpu-disagg-asymmetric-executor-6-process.xml" + trt_model_test = produce_mpirun_command( + global_commands=["mpirun", "--allow-run-as-root"], + nranks=6, + local_commands=[ + "executor/executorTest", + "--gtest_filter=*DisaggAsymmetricExecutorTest*" + ], + leader_commands=[f"--gtest_output=xml:{xml_output_file}"]) + run_command(trt_model_test, cwd=tests_dir, env=new_env, timeout=1500) + + new_env = copy.copy(cpp_env) + new_env["RUN_LLAMA_MULTI_GPU"] = "true" + xml_output_file = build_dir / "results-multi-gpu-disagg-asymmetric-executor-8-process.xml" + trt_model_test = produce_mpirun_command( + global_commands=["mpirun", "--allow-run-as-root"], + nranks=8, local_commands=[ "executor/executorTest", - "--gtest_filter=DistExecutorTest.ChatGLMTokenComparison" + "--gtest_filter=*DisaggAsymmetricExecutorTest*" ], leader_commands=[f"--gtest_output=xml:{xml_output_file}"]) run_command(trt_model_test, cwd=tests_dir, env=new_env, timeout=1500) diff --git a/cpp/tests/runtime/gptDecoderBatchedTest.cpp b/cpp/tests/runtime/gptDecoderBatchedTest.cpp index 152a060f0..42a2fecd0 100644 --- a/cpp/tests/runtime/gptDecoderBatchedTest.cpp +++ b/cpp/tests/runtime/gptDecoderBatchedTest.cpp @@ -195,7 +195,8 @@ void testDecoder(nvinfer1::DataType const dtype, std::vector& sa SizeType32 constexpr nbRnnLayers{0}; SizeType32 constexpr nbHeads{16}; SizeType32 constexpr hiddenSize{1024}; - ModelConfig modelConfig{vocabSize, nbAttentionLayers, nbRnnLayers, nbHeads, hiddenSize, dtype}; + ModelConfig modelConfig{ + vocabSize, nbAttentionLayers + nbRnnLayers, nbAttentionLayers, nbRnnLayers, nbHeads, hiddenSize, dtype}; modelConfig.useGptAttentionPlugin(false); auto streamPtr = std::make_shared(); @@ -315,7 +316,8 @@ void testDecoderWavefront(nvinfer1::DataType const dtype, std::vector(); @@ -440,7 +442,8 @@ void testDecoderDraft(nvinfer1::DataType const dtype, std::vector(0, 2, 0, 1, 16, nvinfer1::DataType::kFLOAT); + mModelConfig = std::make_unique(0, 2, 2, 0, 1, 16, nvinfer1::DataType::kFLOAT); mModelConfig->setMlpHiddenSize(32); mWorldConfig = std::make_unique(2, 1, 0); std::vector modules{ @@ -166,7 +166,7 @@ TEST_F(LoraCacheTest, LoraCachePageManagerTest) TEST_F(LoraCacheTest, determineNumPages) { - ModelConfig modelConfig(0, 2, 0, 1, 4, nvinfer1::DataType::kFLOAT); + ModelConfig modelConfig(0, 2, 2, 0, 1, 4, nvinfer1::DataType::kFLOAT); modelConfig.setLoraModules(LoraModule::createLoraModules({"attn_dense", "attn_qkv"}, 4, 4, 1, 1, 2, 2, 0)); WorldConfig worldConfig(1, 1, 0); @@ -358,7 +358,7 @@ TEST_F(LoraCacheTest, basicPutGet) TEST_F(LoraCacheTest, splitTransposeCpu) { - auto modelConfig = ModelConfig(0, 2, 0, 1, 16, nvinfer1::DataType::kFLOAT); + auto modelConfig = ModelConfig(0, 2, 2, 0, 1, 16, nvinfer1::DataType::kFLOAT); auto worldConfig = WorldConfig(2, 1, 0); SizeType32 const split{2}; @@ -421,7 +421,7 @@ TEST_F(LoraCacheTest, splitTransposeCpu) TEST_F(LoraCacheTest, copyToPages_tp1) { - auto modelConfig = ModelConfig(0, 2, 0, 1, 16, nvinfer1::DataType::kFLOAT); + auto modelConfig = 
ModelConfig(0, 2, 2, 0, 1, 16, nvinfer1::DataType::kFLOAT); modelConfig.setMlpHiddenSize(32); auto worldConfig = WorldConfig(1, 1, 0); std::vector modules{ @@ -479,7 +479,7 @@ TEST_F(LoraCacheTest, copyToPages_tp1) TEST_F(LoraCacheTest, copyToPages_tp2_rank0) { - auto modelConfig = ModelConfig(0, 2, 0, 1, 16, nvinfer1::DataType::kFLOAT); + auto modelConfig = ModelConfig(0, 2, 2, 0, 1, 16, nvinfer1::DataType::kFLOAT); modelConfig.setMlpHiddenSize(32); auto worldConfig = WorldConfig(2, 1, 0); std::vector modules{ @@ -536,7 +536,7 @@ TEST_F(LoraCacheTest, copyToPages_tp2_rank0) TEST_F(LoraCacheTest, copyToPages_tp2_rank1) { - auto modelConfig = ModelConfig(0, 2, 0, 1, 16, nvinfer1::DataType::kFLOAT); + auto modelConfig = ModelConfig(0, 2, 2, 0, 1, 16, nvinfer1::DataType::kFLOAT); modelConfig.setMlpHiddenSize(32); auto worldConfig = WorldConfig(2, 1, 1); std::vector modules{ diff --git a/cpp/tests/runtime/loraManagerTest.cpp b/cpp/tests/runtime/loraManagerTest.cpp index b7cfd987e..0718bb316 100644 --- a/cpp/tests/runtime/loraManagerTest.cpp +++ b/cpp/tests/runtime/loraManagerTest.cpp @@ -59,7 +59,7 @@ class LoraManagerTest : public ::testing::Test // NOLINT(cppcoreguidelines-pro-t { protected: LoraManagerTest() - : mModelConfig(1, 2, 0, 1, 4, nvinfer1::DataType::kFLOAT) + : mModelConfig(1, 2, 2, 0, 1, 4, nvinfer1::DataType::kFLOAT) { } @@ -80,7 +80,7 @@ class LoraManagerTest : public ::testing::Test // NOLINT(cppcoreguidelines-pro-t PeftTable getPeftTable(SizeType32 tpRank = 0) { - auto modelConfig = ModelConfig(0, 2, 0, 1, 16, nvinfer1::DataType::kFLOAT); + auto modelConfig = ModelConfig(0, 2, 2, 0, 1, 16, nvinfer1::DataType::kFLOAT); modelConfig.setMlpHiddenSize(32); auto worldConfig = WorldConfig(2, 2, 3); std::vector modules{ @@ -292,7 +292,7 @@ static std::tuple, std::vector, PeftTable> createF TEST_F(LoraManagerTest, fillInputTensors) { LoraManager loraManager; - auto modelConfig = ModelConfig(0, 2, 0, 1, 16, nvinfer1::DataType::kFLOAT); + auto modelConfig = ModelConfig(0, 2, 2, 0, 1, 16, nvinfer1::DataType::kFLOAT); modelConfig.setMlpHiddenSize(32); auto worldConfig = WorldConfig(1, 1, 0); std::vector modules{ diff --git a/cpp/tests/runtime/loraUtilsTest.cpp b/cpp/tests/runtime/loraUtilsTest.cpp index b6cdd15f8..b44303346 100644 --- a/cpp/tests/runtime/loraUtilsTest.cpp +++ b/cpp/tests/runtime/loraUtilsTest.cpp @@ -86,7 +86,7 @@ TEST_F(LoraUtilsTest, dims_mem_type) TEST_F(LoraUtilsTest, loraValidateRequestTensors) { - auto modelConfig = ModelConfig(0, 2, 0, 1, 4, nvinfer1::DataType::kFLOAT); + auto modelConfig = ModelConfig(0, 2, 2, 0, 1, 4, nvinfer1::DataType::kFLOAT); auto worldConfig = WorldConfig(); std::optional optReqLoraWeights @@ -114,6 +114,11 @@ TEST_F(LoraUtilsTest, loraValidateRequestTensors) LoraModule(LoraModule::ModuleType::kATTN_Q, 4, 4, false, true, -1, 0), }; modelConfig.setLoraModules(modules); + EXPECT_THAT([&]() + { loraValidateRequestTensors(12345, optReqLoraWeights, optReqLoraConfig, modelConfig, worldConfig); }, + testing::Throws()); + + modelConfig.setMaxLoraRank(4); loraValidateRequestTensors(12345, optReqLoraWeights, optReqLoraConfig, modelConfig, worldConfig); diff --git a/docker/Dockerfile.multi b/docker/Dockerfile.multi index abf1e9d22..e9a7506ce 100644 --- a/docker/Dockerfile.multi +++ b/docker/Dockerfile.multi @@ -62,7 +62,7 @@ COPY benchmarks benchmarks COPY scripts scripts COPY tensorrt_llm tensorrt_llm COPY 3rdparty 3rdparty -COPY setup.py requirements.txt requirements-dev.txt ./ +COPY .gitmodules setup.py requirements.txt requirements-dev.txt ./ # 
Create cache directories for pip and ccache RUN mkdir -p /root/.cache/pip /root/.cache/ccache @@ -89,9 +89,11 @@ RUN ln -sv $(python3 -c 'import site; print(f"{site.getsitepackages()[0]}/tensor test -f bin/executorWorker && \ ln -sv $(python3 -c 'import site; print(f"{site.getsitepackages()[0]}/tensorrt_llm/libs")') lib && \ test -f lib/libnvinfer_plugin_tensorrt_llm.so && \ - ln -sv lib/libnvinfer_plugin_tensorrt_llm.so lib/libnvinfer_plugin_tensorrt_llm.so.9 && \ echo "/app/tensorrt_llm/lib" > /etc/ld.so.conf.d/tensorrt_llm.conf && \ ldconfig +# Test LD configuration +RUN ! ( ldd -v bin/executorWorker | grep tensorrt_llm | grep -q "not found" ) + ARG SRC_DIR=/src/tensorrt_llm COPY --from=wheel ${SRC_DIR}/benchmarks benchmarks ARG CPP_BUILD_DIR=${SRC_DIR}/cpp/build diff --git a/docker/common/install_pytorch.sh b/docker/common/install_pytorch.sh index ebd93e81f..70c01917a 100644 --- a/docker/common/install_pytorch.sh +++ b/docker/common/install_pytorch.sh @@ -6,6 +6,10 @@ set -ex # and closest to the version specified in # https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-07.html#rel-24-07 TORCH_VERSION="2.4.0" +# Check the compatible torchvision from +# https://github.com/pytorch/vision/tree/main?tab=readme-ov-file#installation +# and also confirm with https://pypi.org/pypi/torchvision/0.19.0/json +TORCHVISION_VERSION="0.19.0" SYSTEM_ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') prepare_environment() { @@ -35,29 +39,44 @@ restore_environment() { install_from_source() { if [[ $SYSTEM_ID == *"centos"* ]]; then - VERSION_ID=$(grep -oP '(?<=^VERSION_ID=).+' /etc/os-release | tr -d '"') - if [[ $VERSION_ID == "7" ]]; then - echo "Installation from PyTorch source codes cannot be supported..." - exit 1 - fi + VERSION_ID=$(grep -oP '(?<=^VERSION_ID=).+' /etc/os-release | tr -d '"') + if [[ $VERSION_ID == "7" ]]; then + echo "Installation from PyTorch source codes cannot be supported..." 
+ exit 1 + fi fi prepare_environment $1 + export _GLIBCXX_USE_CXX11_ABI=$1 - export TORCH_CUDA_ARCH_LIST="8.0;9.0" + export TORCH_CUDA_ARCH_LIST="8.0;9.0" + export PYTORCH_BUILD_VERSION=${TORCH_VERSION} + export PYTORCH_BUILD_NUMBER=0 pip3 uninstall -y torch cd /tmp - git clone --depth 1 --branch v$TORCH_VERSION https://github.com/pytorch/pytorch + git clone --depth 1 --branch v${TORCH_VERSION} https://github.com/pytorch/pytorch cd pytorch git submodule sync && git submodule update --init --recursive pip3 install -r requirements.txt python3 setup.py install cd /tmp && rm -rf /tmp/pytorch + + export PYTORCH_VERSION=${PYTORCH_BUILD_VERSION} + export FORCE_CUDA=1 + export BUILD_VERSION=${TORCHVISION_VERSION} + pip3 uninstall -y torchvision + cd /tmp + git clone --depth 1 --branch v${TORCHVISION_VERSION} https://github.com/pytorch/vision + cd vision + python3 setup.py install + cd /tmp && rm -rf /tmp/vision + restore_environment $1 } install_from_pypi() { - pip3 install torch==${TORCH_VERSION} + pip3 uninstall -y torch torchvision + pip3 install torch==${TORCH_VERSION} torchvision==${TORCHVISION_VERSION} } case "$1" in diff --git a/docs/requirements.txt b/docs/requirements.txt index a0616a0e8..f32e828e0 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -4,3 +4,4 @@ sphinx-rtd-theme myst_parser breathe pygit2 +sphinx_copybutton diff --git a/docs/source/advanced/batch-manager.md b/docs/source/advanced/batch-manager.md index d46e05d6d..4a6d8650a 100644 --- a/docs/source/advanced/batch-manager.md +++ b/docs/source/advanced/batch-manager.md @@ -147,6 +147,7 @@ Note: this feature isn't supported with the `V1` batching scheme for the moment. * `capacitySchedulerPolicy`, policy used to select the subset available requests in each iteration of the InflightBatching generation loop. - `MAX_UTILIZATION` packs as many requests as the underlying TRT engine can support in any iteration of the InflightBatching generation loop. While this is expected to maximize GPU throughput, it might require that some requests be paused and restarted depending on peak KV cache memory availability. - `GUARANTEED_NO_EVICT` uses KV cache more conservatively guaranteeing that a request, once started, will run to completion without eviction. + - `STATIC_BATCH` similarly to `GUARANTEED_NO_EVICT` schedules the maximum possible batch size without eviction. New requests are scheduled only after all requests in the previous batch have finished. ### Optional GptManager parameters * `TrtGptModelOptionalParams` class encapsulates the following fields: @@ -227,6 +228,9 @@ It can also adopt a more conservative approach and schedule requests only when i knows that the memory allocation will be sufficient to process all active requests even in the worst case of KV cache consumption. That mode corresponds to a `SchedulerConfig::capacitySchedulerPolicy` set to `kGUARANTEED_NO_EVICT`. +Another traditional batching scheme with a batch of requests running in lockstep +until generation for all of them is completed corresponds to +`SchedulerConfig::capacitySchedulerPolicy` set to `kSTATIC_BATCH`. The `GptManager`'s worker thread terminates when the `GptManager` destructor is called and there are no more active requests. 
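As a rough illustration of how these policies are selected programmatically, the sketch below sets `kSTATIC_BATCH` through the Executor API. It assumes the `SchedulerConfig`, `CapacitySchedulerPolicy`, and `ExecutorConfig::setSchedulerConfig` names from `cpp/include/tensorrt_llm/executor/executor.h`; consult that header for the authoritative signatures.

```cpp
#include "tensorrt_llm/executor/executor.h"

namespace tle = tensorrt_llm::executor;

int main()
{
    // kSTATIC_BATCH runs a fixed batch to completion before admitting new requests;
    // kMAX_UTILIZATION and kGUARANTEED_NO_EVICT remain available as before.
    tle::SchedulerConfig schedulerConfig{tle::CapacitySchedulerPolicy::kSTATIC_BATCH};

    tle::ExecutorConfig executorConfig;
    executorConfig.setSchedulerConfig(schedulerConfig);
    // executorConfig can now be passed to the tle::Executor constructor.
    return 0;
}
```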
diff --git a/docs/source/executor.md b/docs/source/advanced/executor.md similarity index 81% rename from docs/source/executor.md rename to docs/source/advanced/executor.md index 8955c6bae..500e7cab3 100644 --- a/docs/source/executor.md +++ b/docs/source/advanced/executor.md @@ -15,37 +15,6 @@ The following sections provide an overview of the main classes defined in the Ex The `Executor` class is responsible for receiving requests from the client, and providing responses for those requests. The executor is constructed by providing a path to a directory containing the TensorRT-LLM engine or buffers containing the engine and the model JSON configuration. The client can create requests and enqueue those requests for execution using the `enqueueRequest` or `enqueueRequests` methods of the `Executor` class. Enqueued requests will be scheduled for execution by the executor, and multiple independent requests can be batched together at every iteration of the main execution loop (a process often referred to as continuous batching or iteration-level batching). Responses for a particular request can be awaited for by calling the `awaitResponses` method, and by providing the request id. Alternatively, responses for any requests can be awaited for by omitting to provide the request id when calling `awaitResponses`. The `Executor` class also allows to cancel requests using the `cancelRequest` method and to obtain per-iteration and per-request statistics using the `getLatestIterationStats`. -#### Logits Post-Processor (optional) - -Users can alter the logits produced by the network, by providing a map of named callbacks of the form: - -``` -std::unordered_map)>> -``` -to an instance of `LogitsPostProcessorConfig`. The map key is the name associated with that logits post-processing callback. Each request can then specify the name of the logits post-processor to use for that particular request, if any. - -The first argument to the callback is the request id, second is the logits tensor, third are the tokens produced by the request so far, fourth is the operation stream used by the logits tensor, and last one is an optional client id. The callback returns a modified tensor of logits. - -Users *must* use the stream to access the logits tensor. For example, performing a addition with a bias tensor should be enqueued on that stream. -Alternatively, users may call `stream->synchronize()`, however, that will slow down the entire execution pipeline. - -Multiple requests can share same client id and callback can use different logic based on client id. - -We also provide a batched version that allows altering logits of multiple requests in a batch. This allows further optimizations and reduces callback overheads. - -``` -std::function const&, std::vector&, std::vector> const&, StreamPtr const&, std::vector> const&)> -``` - -A single batched callback can be specified in `LogitsPostProcessorConfig`. Each request can opt to apply this callback by specifying the name of the logits -post-processor as `Request::kBatchedPostProcessorName`. - -Note: Neither callback variant is supported with the `STATIC` batching type for the moment. - -In a multi-GPU run, callback is invoked on all tensor parallel ranks (in last pipeline rank) by default. -For correct execution, user should replicate client-side state accessed by callback on all tensor parallel ranks. -If replication is expensive or infeasible, use `LogitsPostProcessorConfig::setReplicate(false)` to invoke callback only on first tensor parallel rank. 
- ### The Request Class The `Request` class is used to define properties of the request, such as the input token ids and the maximum number of tokens to generate. The `streaming` parameter can be used to indicate if the request should generate a response for each new generated tokens (`streaming = true`) or only after all tokens have been generated (`streaming = false`). Other mandatory parameters of the request include the sampling configuration (defined by the `SamplingConfig` class) which contains parameters controlling the decoding process and the output configuration (defined by the `OutputConfig` class) which controls what information should be included in the `Result` for a particular response. @@ -83,6 +52,32 @@ The executor can process requests with different beam widths if the following co The request queue of the executor must be empty to allow it to reconfigure itself for a new beam width. This reconfiguration will happen automatically when requests with a new beam width are enqueued. If requests with different beam widths are enqueued at the same time, the executor will encounter an error and terminate all requests prematurely. +### Controlling output with Logits Post-Processor + +Optionally, you can alter the logits produced by the network by providing an instance of `Executor::LogitsPostProcessorConfig`. For instance, this feature can be used to generate JSON formatted output. {cpp:class}`Executor::LogitsPostProcessorConfig ` specifies a map of named callbacks in the following form + +```cpp +std::unordered_map)>> +``` + +The map key is the name associated with that logits post-processing callback. Each request can then specify the name of the logits post-processor to use for that particular request, if any. + +The first argument to the callback is the request id, second is the logits tensor, third are the tokens produced by the request so far, fourth is the operation stream used by the logits tensor, and last one is an optional client id. The callback returns a modified tensor of logits. Multiple requests can share same client id and callback can use different logic based on client id. + +You must use the stream to access the logits tensor. For example, to perform an addition with a bias tensor, the addition operation is enqueued on that stream. Alternatively, you can call `stream->synchronize()`, however, that will slow down the entire execution pipeline. + +The executor also includes a {cpp:class}`LogitsPostProcessorBatched ` method that enables altering logits of multiple requests in a batch. The batched method allows further optimizations and reduces callback overheads. + +```cpp +std::function const&, std::vector&, std::vector> const&, StreamPtr const&, std::vector> const&)> +``` + +A single batched callback can be specified in `LogitsPostProcessorConfig`. Each request can opt to apply this callback by specifying the name of the logits post-processor as `Request::kBatchedPostProcessorName`. + +Note: Neither callback variant is supported with the `STATIC` batching type for the moment. + +In a multi-GPU run, the callback is invoked on all ranks in the first tensor-parallel group, by default. To ensure correct execution, replicate the client-side state that is accessed by the callback on these ranks. If replication is expensive or infeasible, use `LogitsPostProcessorConfig::setReplicate(false)` to invoke the callback only on rank 0. The executor broadcasts the sampled tokens internally to ensure correct execution. 
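As a minimal sketch of wiring a named callback into the executor, the example below registers one post-processor and shows how a request would select it. It assumes the in-place, void-returning callback form and the `LogitsPostProcessorMap`, `LogitsPostProcessorConfig`, and `ExecutorConfig::setLogitsPostProcessorConfig` names from `executor.h`; the header remains the authoritative reference for the exact signatures.

```cpp
#include "tensorrt_llm/executor/executor.h"

#include <optional>

namespace tle = tensorrt_llm::executor;

int main()
{
    // The map key is the name a request later refers to.
    tle::LogitsPostProcessorMap callbacks;
    callbacks["my_post_processor"] = [](tle::IdType requestId, tle::Tensor& logits,
                                         tle::BeamTokens const& tokens, tle::StreamPtr const& stream,
                                         std::optional<tle::IdType> clientId)
    {
        // Modify `logits` here; any work (for example adding a bias tensor) must be
        // enqueued on `stream`, as explained above.
    };

    tle::ExecutorConfig executorConfig;
    executorConfig.setLogitsPostProcessorConfig(tle::LogitsPostProcessorConfig(callbacks));

    // A request opts in by naming the callback, e.g.:
    //   request.setLogitsPostProcessorName("my_post_processor");
    return 0;
}
```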
+ ## C++ Executor API Example Two C++ examples are provided that shows how to use the Executor API and can be found in the [`examples/cpp/executor`](source:examples/cpp/executor/) folder. diff --git a/docs/source/advanced/gpt-runtime.md b/docs/source/advanced/gpt-runtime.md index 60a881e02..2e8ce590c 100644 --- a/docs/source/advanced/gpt-runtime.md +++ b/docs/source/advanced/gpt-runtime.md @@ -133,14 +133,14 @@ value for a given parameter, the vector can be limited to a single element ***General*** -| Name in TRT-LLM | Description | Data type | Range of value | Default value | Name in HF | -| :-----------------: | :----------------------------------------------------------: | :-----------: | :----------------------------------------------------------: | :---------------------------------------------------: | :--------------------: | -| `temperature` | modulation of logits in sampling workflow | List\[Float\] | \[0.0f, $+\infty$\) | `1.0f` (no modulation) | `temperature` | -| `minLength` | lower-bound on the number of tokens generated | List\[Int\] | \[0, $+\infty$\) | `0` (no effect (the first generated token can be EOS) | `min_length` | -| `repetitionPenalty` | penalize repetitive tokens
multiplicative, irrespective of appearances count | List\[Float\] | \[0.0f, $+\infty$\)
`< 1.0f` encourages repetition
`> 1.0f` discourages it | `1.0f` (no effect) | `repetition_penalty` | -| `presencePenalty` | penalize existed tokens
additive, irrespective of appearances count | List\[Float\] | \($-\infty$, $+\infty$\)
`< 0.0f` encourages repetition
`> 0.0f` discourages it | `0.0f` (no effect) | no | -| `frequencyPenalty` | penalize existed tokens
additive, dependent on appearances count | List\[Float\] | \($-\infty$, $+\infty$\)
`< 0.0f` encourages repetition
`> 0.0f` discourages it | `0.0f` (no effect) | no | -| `noRepeatNgramSize` | | List\[Int\] | \[0, $+\infty$\)
`> 0` all ngrams of that size can only occur once | `0` (no effect) | `no_repeat_ngram_size` | +| Name in TRT-LLM | Description | Data type | Range of value | Default value | Name in HF | +| :-----------------: | :-------------------------------------------------------------------------------: | :-----------: | :---------------------------------------------------------------------------------------: | :---------------------------------------------------: | :--------------------: | +| `temperature` | modulation of logits in sampling workflow | List\[Float\] | \[0.0f, $+\infty$\) | `1.0f` (no modulation) | `temperature` | +| `minLength` | lower-bound on the number of tokens generated | List\[Int\] | \[0, $+\infty$\) | `0` (no effect (the first generated token can be EOS) | `min_length` | +| `repetitionPenalty` | penalize repetitive tokens
multiplicative, irrespective of appearances count | List\[Float\] | \[0.0f, $+\infty$\)
`< 1.0f` encourages repetition
`> 1.0f` discourages it      | `1.0f` (no effect)                                     | `repetition_penalty`   | +| `presencePenalty`   | penalize existing tokens
additive, irrespective of appearances count | List\[Float\] | \($-\infty$, $+\infty$\)
`< 0.0f` encourages repetition
`> 0.0f` discourages it | `0.0f` (no effect)                                     | no                     | +| `frequencyPenalty`  | penalize existing tokens
additive, dependent on appearances count | List\[Float\] | \($-\infty$, $+\infty$\)
`< 0.0f` encourages repetition
`> 0.0f` discourages it | `0.0f` (no effect) | no | +| `noRepeatNgramSize` | | List\[Int\] | \[0, $+\infty$\)
`> 0` all ngrams of that size can only occur once | `0` (no effect) | `no_repeat_ngram_size` | * The tokens of input prompt are included during adopting `repetitionPenalty`, `presencePenalty`, and `frequencyPenalty` onto logits. @@ -158,9 +158,9 @@ value for a given parameter, the vector can be limited to a single element | `topPResetIds` | the decay in the `topP` algorithm | List\[Int\] | \[-1, $+\infty$\) | `-1` (no effect) | no | * If setting `topK = 0` and `topP = 0.0f`, greedy search is performed. - * If setting `topK > 0` and `topP = 0.0f`, `topK` tokens of highest probilities will become the candidates of sampling (named `TopK sampling` in TRT-LLM). - * If setting `topK = 0` and `topP > 0.0f`, tokens will be sorted with probility descendly, then the tokens with highest probilities which the accumulated probility larger than `topP` will become the candidates of sampling (named `TopP sampling` in TRT-LLM). - * If setting `topK > 0` and `topP > 0.0f`, `topK` tokens of highest probilities will be selected, then those selected tokens will be sorted with probility descendly and their probility will be normalized, then the tokens with highest normalized probilities which the accumulated probility larger than `topP` will become the candidates of sampling (named `TopKTopP sampling` in TRT-LLM) + * If setting `topK > 0` and `topP = 0.0f`, `topK` tokens of highest probabilities will become the candidates of sampling (named `TopK sampling` in TRT-LLM). + * If setting `topK = 0` and `topP > 0.0f`, tokens will be sorted with probability descendly, then the tokens with highest probabilities which the accumulated probability larger than `topP` will become the candidates of sampling (named `TopP sampling` in TRT-LLM). + * If setting `topK > 0` and `topP > 0.0f`, `topK` tokens of highest probabilities will be selected, then those selected tokens will be sorted with probability descendly and their probability will be normalized, then the tokens with highest normalized probabilities which the accumulated probability larger than `topP` will become the candidates of sampling (named `TopKTopP sampling` in TRT-LLM) * If different `topK` values are provided for the different sequences in the batch, the performance of the implementation will depend on the largest value. For efficiency reasons, we recommend to batch requests with similar `topK` values together. diff --git a/docs/source/kv_cache_reuse.md b/docs/source/advanced/kv-cache-reuse.md similarity index 99% rename from docs/source/kv_cache_reuse.md rename to docs/source/advanced/kv-cache-reuse.md index 266ca7bc5..4d7fcfc81 100644 --- a/docs/source/kv_cache_reuse.md +++ b/docs/source/advanced/kv-cache-reuse.md @@ -1,3 +1,5 @@ +(kv-cache-reuse)= + # KV cache reuse This document describes how kv cache pages can be shared and reused by requests that start with the same prompt. This can greatly lower first token latency, the time it takes before the first output token is generated. Many use cases can benefit from this, including multi-turn requests and system prompts. 
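As a minimal sketch of turning the feature on through the Executor API, the snippet below assumes `KvCacheConfig`'s `enableBlockReuse` flag and `ExecutorConfig::setKvCacheConfig` from `executor.h`, plus an engine built with `--use_paged_context_fmha=enable`; the rest of this document and the header describe the authoritative options.

```cpp
#include "tensorrt_llm/executor/executor.h"

namespace tle = tensorrt_llm::executor;

int main()
{
    // Allow KV cache blocks to be shared by requests that start with the same prompt.
    // The engine must be built with --use_paged_context_fmha=enable for reuse to apply.
    tle::KvCacheConfig kvCacheConfig(/*enableBlockReuse=*/true);

    tle::ExecutorConfig executorConfig;
    executorConfig.setKvCacheConfig(kvCacheConfig);
    return 0;
}
```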
diff --git a/docs/source/speculative_decoding.md b/docs/source/advanced/speculative-decoding.md similarity index 91% rename from docs/source/speculative_decoding.md rename to docs/source/advanced/speculative-decoding.md index 00d8bfb68..9fb771a15 100644 --- a/docs/source/speculative_decoding.md +++ b/docs/source/advanced/speculative-decoding.md @@ -1,3 +1,5 @@ +(speculative-decoding)= + # Speculative Sampling Speculative Sampling (also referred to as Speculative Decoding) is a set of techniques designed to allow generation of more than one token per forward pass iteration. This can lead to a reduction in the average per-token latency **in situations where the GPU @@ -30,11 +32,11 @@ may prove simpler than generating a summary for an article. Furthermore, when integrating Medusa with a standard PyTorch model implementation which may not be as finely tuned as TensorRT-LLM, the potential time savings are more pronounced. -# Draft Model Approach +## Draft-Target-Model Approach + +The Draft-Target-Model involves the use of two distinct models trained independently but sharing the same vocabulary: a smaller Draft model and a larger Target model. For example, GPT 125M / 6.7B models can serve as the Draft / Target model. -The Draft model approach involves the use of two distinct models trained independently -but sharing the same vocabulary: a smaller Draft model and a larger Target model. -For example, a GPT 125M model can serve as the Draft model, while a GPT 6.7B model acts as the Target model. +There are two styles of using Draft-Target-Model in TensorRT-LLM now. The first one is using TensorRT-LLM-BLS in Triton, which more information and detailed steps can be found in this document. The second one is using it directly in TensorRT-LLM, which steps can be found in [examples/draft_target_model/README.md](../../../examples/draft_target_model/README.md) and the code can be found in [examples/run.py](../../../examples/run.py). The management of Draft and Target models is facilitated through two separate `GptManager` instances. It is essential that you to coordinate the interactions between the Draft and Target models effectively. @@ -58,7 +60,7 @@ it is advisable to enable KV cache reuse for both models. This can be achieved by adding the `--use_paged_context_fmha=enable` flag to the `trtllm-build` command and setting `enableBlockReuse=true` in the `KVCacheConfig`. -## Using Draft model approach with Triton Inference Server +### Using Draft model approach with Triton Inference Server + Draft model approach is supported since TensorRT-LLM-0.7.0 (using two separate Tritonserver to maintain draft and target model respectively), but has significant optimization in TensorRT-LLM-0.10.0 (using one Tritonserver with [Business Logic Scripting](https://github.com/triton-inference-server/python_backend?tab=readme-ov-file#business-logic-scripting), BLS). + The source file of Draft model with BLS can be found [here](https://github.com/triton-inference-server/tensorrtllm_backend/blob/main/all_models/inflight_batcher_llm/tensorrt_llm_bls/1/lib/decode.py). @@ -218,7 +220,7 @@ and setting `enableBlockReuse=true` in the `KVCacheConfig`. pkill -9 -f tritonserver ``` -# Medusa +## Medusa This approach leverages a single model to both generate and verify draft tokens. It enhances the existing model by adding multiple extra language model heads, known as Medusa heads. 
@@ -249,7 +251,7 @@ In the TensorRT-LLM implementation of Medusa, the configuration of the tree is a This flexibility allows you to experiment and identify the optimal tree structure for your use case, which can then be utilized in a production environment. -## Medusa Tree +### Medusa Tree Consider the following diagram, which illustrates how the hidden states from the last layer of the base model are passed to the base model's language model (LM) head and to four Medusa heads (MHs). @@ -294,11 +296,11 @@ So, only `9` candidates are specified. **Specifying paths-only instead of all choices is currently supported only in the Python runtime.** -## Using Medusa with TensorRT-LLM +### Using Medusa with TensorRT-LLM For guidance on constructing and executing Medusa with the Python runtime, consult the [Medusa README](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/medusa/README.md). When utilizing the Inflight Fused Batching (IFB) with the C++ API, it is necessary to define the `medusa_choices` explicitly within the model configuration. For detailed instructions, refer to the [model configuration in TensorRT-LLM backend](https://github.com/triton-inference-server/tensorrtllm_backend?tab=readme-ov-file#modify-the-model-configuration) for more details. -### Limitations +#### Limitations - TensorRT-LLM supports Medusa only for Vicuna (fine tuned LLaMA). However, similar to any new model, you can follow the same approach to define your own Medusa model and deploy with TensorRT-LLM. @@ -306,7 +308,7 @@ However, similar to any new model, you can follow the same approach to define yo - Beam search is **not** compatible with Medusa. -# ReDrafter +## ReDrafter This approach enhances the single-model Medusa method by predicting and verifying tokens using the same model. However, unlike Medusa, it predicts draft tokens using a recurrent predictor, where each draft token depends on the previous one. This method also allows the use of beam search to identify more prominent draft tokens. For more details, please read [the ReDrafter paper](https://arxiv.org/html/2403.09919v1). @@ -339,7 +341,7 @@ Each request can be assigned a specific lookahead configuration when input to th ## Build and execute an engine from a model -Vicuna models re-use Llmama Python scripts located in [examples/llama](../../examples/llama). +Vicuna models reuse Llmama Python scripts located in [examples/llama](../../examples/llama). 
### Convert a model to checkpoint ```bash @@ -347,49 +349,50 @@ MODEL_DIR=/path/to/vicuna-7b-v1.3 ENGINE_DIR=tmp/engine CKPT_DIR=tmp/engine/ckpt -python3 examples/llama/convert_checkpoint.py \ ---model_dir=$MODEL_DIR \ ---output_dir=$CKPT_DIR \ ---dtype=float16 \ ---tp_size=1 \ ---pp_size=1 +python3 examples/llama/convert_checkpoint.py \ + --model_dir=$MODEL_DIR \ + --output_dir=$CKPT_DIR \ + --dtype=float16 \ + --tp_size=1 \ + --pp_size=1 ``` ### Build checkpoints for an engine ```bash -trtllm-build \ ---checkpoint_dir=$CKPT_DIR \ ---output_dir=$ENGINE_DIR \ ---gpt_attention_plugin=float16 \ ---gemm_plugin=float16 \ ---max_batch_size=32 \ ---max_input_len=1024 \ ---max_seq_len=2048 \ ---max_beam_width=1 \ ---log_level=error \ ---max_draft_len=83 \ ---speculative_decoding_mode=lookahead_decoding +trtllm-build \ + --checkpoint_dir=$CKPT_DIR \ + --output_dir=$ENGINE_DIR \ + --gpt_attention_plugin=float16 \ + --gemm_plugin=float16 \ + --max_batch_size=32 \ + --max_input_len=1024 \ + --max_seq_len=2048 \ + --max_beam_width=1 \ + --log_level=error \ + --max_draft_len=83 \ + --speculative_decoding_mode=lookahead_decoding ``` ### Execute an engine Run `examples/run.py` to generate sequences. ```bash -python examples/run.py \ ---max_output_len=32 \ ---lookahead_config=[7,7,7] \ ---tokenizer_dir=$MODEL_DIR \ ---engine_dir= $ENGINE_DIR \ ---log_levelverbose--input_text 'Once upon' 'To be, or not' 'Be not afraid of greatness' +python examples/run.py \ + --tokenizer_dir=$MODEL_DIR \ + --engine_dir=$ENGINE_DIR \ + --max_output_len=32 \ + --lookahead_config=[7,7,7] \ + --log_level=verbose \ + --input_text 'Once upon' 'To be, or not' 'Be not afraid of greatness' ``` Run `examples/summarize.py` to summarize the CNN daily dataset. ```bash -python examples/summarize.py \ ---test_trt_llm \ ---hf_model_dir$MODEL_DIR \ ---data_type fp16 \ ---engine_dir$ENGINE_DIR \ ---lookahead_config= [7,7,7] \ ---test_hf +python examples/summarize.py \ + --test_hf \ + --test_trt_llm \ + --hf_model_dir=$MODEL_DIR \ + --engine_dir=$ENGINE_DIR \ + --data_type=fp16 \ + --lookahead_config=[7,7,7] ``` diff --git a/docs/source/architecture/core-concepts.md b/docs/source/architecture/core-concepts.md index 4bfabbf2f..d2e638cdf 100644 --- a/docs/source/architecture/core-concepts.md +++ b/docs/source/architecture/core-concepts.md @@ -205,7 +205,7 @@ void invokeQuantization(...) { ``` For more details on how TensorRT-LLM implements the GPT Attention operator, see -the [Multi-head, Multi-query and Group-query Attention](gpt_attention.md) document. +the [Multi-head, Multi-query and Group-query Attention](../advanced/gpt-attention.md) document. # Runtime @@ -214,7 +214,7 @@ the runtime components is to load the TensorRT engines and drive their execution. Typically, for an auto-regressive model like GPT, the runtime is in charge of loading the engine that implements both the processing of the input sequence as well as the body of the generation loop. See the [GPT C++ -Runtime](gpt_runtime.md) document for details on the C++ Runtime. +Runtime](../advanced/gpt-runtime.md) document for details on the C++ Runtime. (multi-gpu-multi-node)= @@ -254,3 +254,131 @@ subsets of layers. Tensor Parallelism usually leads to more balanced executions but requires more memory bandwidth between the GPUs. Pipeline Parallelism reduces the need for high-bandwidth communication but may incur load-balancing issues and may be less efficient in terms of GPU utilization. 
+ +## Examples + +Here are examples of Llama 3.1 70B and Llama 3.1 405B showing how to perform multi-GPU and multi-node inference in TensorRT-LLM. The example of Llama 3.1 70B performs multi-GPU inference on a single node, while the example of Llama 3.1 405B performs multi-node inference. + +### Llama 3.1 70B + +The following sample commands build an engine for running the Llama 3.1 70B model with tensor parallelism (TP=4) using 4 GPUs on a single node. + +```bash +folder_trt_llm=../TensorRT-LLM +model_dir=Llama-3.1-70B +ckpt_dir=ckpt_llama_3.1_70b +engine_dir=engine_llama_3.1_70b +dtype=bfloat16 +tp_size=4 +pp_size=1 +kv_cache_type=paged +max_input_len=128 +max_output_len=128 +max_batch_size=4 +workers=$(( tp_size * pp_size )) + +python ${folder_trt_llm}/examples/llama/convert_checkpoint.py \ + --output_dir ${ckpt_dir} \ + --model_dir ${model_dir} \ + --dtype ${dtype} \ + --tp_size ${tp_size} \ + --pp_size ${pp_size} \ + --workers ${workers} \ + --use_parallel_embedding + +trtllm-build \ + --output_dir ${engine_dir} \ + --checkpoint_dir ${ckpt_dir} \ + --gemm_plugin ${dtype} \ + --gpt_attention_plugin ${dtype} \ + --kv_cache_type ${kv_cache_type} \ + --max_input_len ${max_input_len} \ + --max_seq_len $(( max_input_len + max_output_len )) \ + --max_batch_size ${max_batch_size} \ + --workers ${workers} +``` + +The following sample commands perform inference using 4 GPUs on a single node by running `examples/run.py`. + +```bash +input_text="Born in north-east France, Soyer trained as a" + +mpirun -n $(( tp_size * pp_size )) \ + python ${folder_trt_llm}/examples/run.py \ + --engine_dir ${engine_dir} \ + --tokenizer_dir ${model_dir} \ + --input_text "${input_text}" \ + --max_output_len ${max_output_len} +``` + +### Llama 3.1 405B + +The following sample commands build an engine for running the Llama 3.1 405B model with tensor parallelism (TP=16) on 2 nodes that each have 8 GPUs. Although the model runs on multiple nodes, you can build the engine on a single node. + +```bash +folder_trt_llm=../TensorRT-LLM +model_dir=Llama-3.1-405B +ckpt_dir=ckpt_llama_3.1_405b +engine_dir=engine_llama_3.1_405b +dtype=bfloat16 +tp_size=16 +pp_size=1 +kv_cache_type=paged +max_input_len=128 +max_output_len=128 +max_batch_size=4 +workers=8 + +python ${folder_trt_llm}/examples/llama/convert_checkpoint.py \ + --output_dir ${ckpt_dir} \ + --model_dir ${model_dir} \ + --dtype ${dtype} \ + --tp_size ${tp_size} \ + --pp_size ${pp_size} \ + --workers ${workers} \ + --use_parallel_embedding + +trtllm-build \ + --output_dir ${engine_dir} \ + --checkpoint_dir ${ckpt_dir} \ + --gemm_plugin ${dtype} \ + --gpt_attention_plugin ${dtype} \ + --kv_cache_type ${kv_cache_type} \ + --max_input_len ${max_input_len} \ + --max_seq_len $(( max_input_len + max_output_len )) \ + --max_batch_size ${max_batch_size} \ + --workers ${workers} +``` + +The following sample script, `launch_llama_3.1_405b.sh`, shows how to perform inference with Slurm on 2 nodes that each have 8 GPUs. If you use a different workload management software, the key concern is to run the `examples/run.py` command. 
+ +```bash +#!/bin/bash +#SBATCH --account account +#SBATCH --partition partition +#SBATCH --job-name job-name +#SBATCH --time 1:00:00 +#SBATCH --nodes 2 + +folder_trt_llm=../TensorRT-LLM +engine_dir=engine_llama_3.1_405b +model_dir=Llama-3.1-405B +max_output_len=128 + +input_text="Born in north-east France, Soyer trained as a" + +srun \ + --ntasks-per-node 8 \ + --mpi pmix \ + python ${folder_trt_llm}/examples/run.py \ + --engine_dir ${engine_dir} \ + --tokenizer_dir ${model_dir} \ + --input_text "${input_text}" \ + --max_output_len ${max_output_len} +``` + +You can perform inference by running the script on the Slurm cluster. + +```bash +sbatch launch_llama_3.1_405b.sh +``` diff --git a/docs/source/architecture/workflow.md b/docs/source/architecture/workflow.md index 1d366dc5f..6d02e406b 100644 --- a/docs/source/architecture/workflow.md +++ b/docs/source/architecture/workflow.md @@ -10,7 +10,7 @@ The build workflow contains two major steps. To generalize the TensorRT-LLM optimization features to all models, and to share the same workflow between different models for TensorRT-LLM users, TensorRT-LLM has conventions about how the models shall be defined and how the models shall be imported. -TensorRT-LLM checkpoint convention is documented in [checkpoint doc](/docs/source/architecture/checkpoint.md), and all decoder-only models had been migrated to adopt the convention. Model-specific convert_checkpoint.py scripts are shipped as source code in example directories, and a trtllm-build CLI tool had been added. However, there are some disadvantages of providing convert checkpoint scripts outside the core TensorRT-LLM lib as example: +TensorRT-LLM checkpoint convention is documented in [](checkpoint.md) and all decoder-only models had been migrated to adopt the convention. Model-specific convert_checkpoint.py scripts are shipped as source code in example directories, and a trtllm-build CLI tool had been added. However, there are some disadvantages of providing convert checkpoint scripts outside the core TensorRT-LLM lib as example: 1. TensorRT-LLM evolves so quickly that the model's definition code might have changed for better performance; which means the `convert_checkpoint.py` is out of date. @@ -47,7 +47,9 @@ class LLaMAForCausalLM (DecoderModelForCausalLM): ``` -Then, in the [convert_checkpoint.py](../../../../examples/llama/convert_checkpoint.py) script, the logic can be greatly simplified. Even if the model definition code of TensorRT-LLM LLaMA class is changed due to some reason, the `from_hugging_face` API will keep the same, thus the existing workflow using this interface will not be affected. +Then, in the convert_checkpoint.py script in the +[`examples/llama/`](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/llama/) directory of the GitHub repo, +the logic can be greatly simplified. Even if the model definition code of TensorRT-LLM LLaMA class is changed due to some reason, the `from_hugging_face` API will keep the same, thus the existing workflow using this interface will not be affected. ```python @@ -65,7 +67,9 @@ Since LLaMA models were also released with different formats, such as the Meta c In the 0.9 release, only LLaMA is refactored. Since popular LLaMA (and its variants) models are released by Hugging Face and Meta checkpoint formats, only these two functions are implemented. 
-In future releases, there might be `from_jax`, `from_nemo`, `from_keras` or other factory methods for different training checkpoints added, for example the TensorRT-LLM [GEMMA](../../../../examples/gemma/README.md) model supports JAX/Keras formats in addition to huggingface. The model developers can choose to implement **any subset** of these factory methods for the models they contributed to TensorRT-LLM. +In future releases, there might be `from_jax`, `from_nemo`, `from_keras` or other factory methods for different training checkpoints added. +For example, the Gemma 2B model and the convert_checkpoint.py file in the [`examples/gemma`](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/gemma/) +directory support JAX and Keras formats in addition to Hugging Face. The model developers can choose to implement **any subset** of these factory methods for the models they contributed to TensorRT-LLM. For some formats which are not supported by TensorRT-LLM model developers, you still have the freedom to implement your own weights conversion outside the core lib; the flow will look like this: @@ -96,7 +100,9 @@ TensorRT-LLM relies on NVIDIA Modelopt toolkit to support some of the quantizati In TensorRT-LLM 0.8 version: -* For Modelopt-supported quantization algorithms, a standalone script in the example folder [quantize.py](../../../../examples/quantization/quantize.py) shall be executed to export TensorRT-LLM checkpoints, and the trtllm-build command needs to be executed to build the checkpoints to engines. +* For Modelopt-supported quantization algorithms, a standalone script, + [example/quantization/quantize.py](https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/quantization/quantize.py) + can export TensorRT-LLM checkpoints, and the trtllm-build command needs to be executed to build the checkpoints to engines. * For the non-Modelopt quantization algorithms, users need to use the per-model convert_checkpoint.py scripts to export TensorRT-LLM checkpoints. @@ -116,8 +122,6 @@ class PretrainedModel: # and save the checkpoint to output_dir ``` -```{note} - * The default implementation only handles the Modelopt supported quantization. The LLaMA class then inherits this `PretrainedModel` and dispatches the Modelopt quantization to the super class's default implementation. * The model developer raises errors in the sub-class implementation if the new model is not supported by Modelopt yet. @@ -145,7 +149,7 @@ class LLaMAForCausalLM: The `quantize` API is designed to take multi-GPU resources internally to make quantization. For example, a LLaMA 70B BF16 takes 140G memory, if we make FP8 quantization, then, another 70G is needed. So, we need at least 210G, 4 * A100(H100) is needed to quantize the LLaMA 70B model. If you want to call `quantize` API inside a MPI program, be cautious and ensure the quantize API is only called by rank 0. -Usage of the `quantize` API in an MPI program looks like this, only rank 0 calls it. In an non-MPI program, and `if rank == 0` and the `mpi_barrier()` is not needed. +Usage of the `quantize` API in an MPI program looks like this, only rank 0 calls it. In an non-MPI program, the `if rank == 0` and the `mpi_barrier()` are not needed. ```python quant_config = QuantConfig() @@ -179,7 +183,7 @@ engine.save(engine_dir) ``` -The Llama object can be created by any method mentioned in the [conversion APIs](#conversion-apis) and the [quantization APIs](#quantization-apis) section. 
+The Llama object can be created by any method mentioned in the [](#conversion-apis) or [](#quantization-apis) sections. The `trtllm-build` CLI tool is a thin wrapper around this `tensorrt_llm.build` API. The flags of the CLI tool are kept close to the fields of the `BuildConfig` class. @@ -216,8 +220,7 @@ All the weights conversion, quantization, and build APIs mentioned above have co * A unified quantization script is inside the `examples/quantization/quantize.py` and can be shared by all **supported** models. * A `trtllm-build` CLI tool builds all models from TensorRT-LLM checkpoint. - -```{note} +Refer to the following considerations for the CLI tools: * These scripts and tools should be used for scripting. Do not import the Python functions/class defined in these tools. TensorRT-LLM does not promise the content of these scripts can be compatible with previous versions. The options of these tools may also be changed when it’s not avoidable. diff --git a/docs/source/blogs/quantization-in-TRT-LLM.md b/docs/source/blogs/quantization-in-TRT-LLM.md index bf0cfb1bc..73f18b8f4 100644 --- a/docs/source/blogs/quantization-in-TRT-LLM.md +++ b/docs/source/blogs/quantization-in-TRT-LLM.md @@ -12,31 +12,31 @@ TensorRT-LLM offers a best-in-class unified quantization toolkit to significantl ### Performance In the following benchmark, we highlight the acceleration of a few popular models at a small batch size without imposing latency constraints. It's important to note that in scenarios where there's a latency constraint in your application, TRT-LLM can achieve an even greater performance improvement. Using LLaMA-v2-7B as an example, when the first token latency is constrained to be under 500ms, quantization with FP8 and a batch size of 16 achieves a notable **2.3x inference speedup** compared to FP16 on a H100. -| Model | Batch Size | Speedup (FP8 v.s. FP16) | Speedup (INT8 SQ v.s. FP16) | -|-------------|:----------:|:------------------------:|:---------------------------:| -| GPT-J | 1 | 1.40x | 1.40x | -| GPT-J | 8 | 1.44x | 1.30x | -| LLaMA-v2-7B | 1 | 1.51x | 1.47x | -| LLaMA-v2-7B | 8 | 1.40x | 1.32x | +| Model | Batch Size | Speedup (FP8 v.s. FP16) | Speedup (INT8 SQ v.s. FP16) | +| ----------- | :--------: | :---------------------: | :-------------------------: | +| GPT-J | 1 | 1.40x | 1.40x | +| GPT-J | 8 | 1.44x | 1.30x | +| LLaMA-v2-7B | 1 | 1.51x | 1.47x | +| LLaMA-v2-7B | 8 | 1.40x | 1.32x | *The above benchmarks were run with Input Length=1024, Output Length=128, and TP=1 on H100 80GB. 
### Accuracy -| Model | Quantization Methods | MMLU Baseline (FP16) | MMLU Post-quantization | MMLU Loss | -|--------------|:--------------------:|:--------------------:|:----------------------:|:-------------:| -| Falcon-180B | FP8 | 70.4 | 70.3 | 0.14% | -| | INT8-SQ | 70.4 | 68.6 | 2.56% | -| | INT4-AWQ | 70.4 | 69.8 | 0.85% | -| Falcon-40B | FP8 | 56.1 | 55.6 | 0.89% | -| | INT8-SQ | 56.1 | 54.7 | 2.50% | -| | INT4-AWQ | 56.1 | 55.5 | 1.07% | -| LLaMA-v2-70B | FP8 | 69.1 | 68.5 | 0.87% | -| | INT8-SQ | 69.1 | 67.2 | 2.75% | -| | INT4-AWQ | 69.1 | 68.4 | 1.01% | -| MPT-30B | FP8 | 47.5 | 47.4 | 0.21% | -| | INT8-SQ | 47.5 | 46.8 | 1.47% | -| | INT4-AWQ | 47.5 | 46.5 | 2.11% | +| Model | Quantization Methods | MMLU Baseline (FP16) | MMLU Post-quantization | MMLU Loss | +| ------------ | :------------------: | :------------------: | :--------------------: | :-------: | +| Falcon-180B | FP8 | 70.4 | 70.3 | 0.14% | +| | INT8-SQ | 70.4 | 68.6 | 2.56% | +| | INT4-AWQ | 70.4 | 69.8 | 0.85% | +| Falcon-40B | FP8 | 56.1 | 55.6 | 0.89% | +| | INT8-SQ | 56.1 | 54.7 | 2.50% | +| | INT4-AWQ | 56.1 | 55.5 | 1.07% | +| LLaMA-v2-70B | FP8 | 69.1 | 68.5 | 0.87% | +| | INT8-SQ | 69.1 | 67.2 | 2.75% | +| | INT4-AWQ | 69.1 | 68.4 | 1.01% | +| MPT-30B | FP8 | 47.5 | 47.4 | 0.21% | +| | INT8-SQ | 47.5 | 46.8 | 1.47% | +| | INT4-AWQ | 47.5 | 46.5 | 2.11% | @@ -46,19 +46,19 @@ A quantization method comprises three primary components: 2. Activation precision format 3. Calibration algorithms -Typically, in the context of small-batch inference scenarios (batch size ≤ 4), the key consideration is memory bandwidth, making weight-only quantization methods the preferred choice. Conversely, for large-batch inference scenarios, such as serving scenarios (batch size ≥ 16), both memory bandwidth and computation density become crucial factors. Consequently, it's recommended to opt for a quantization method that has both weight and activation quantized. For batch size ≥ 16, the choice of quantization method can be model speicfic. We suggest to prioritize using FP8 first, as we typically see it offers the best performance and accuracy. If the results do not meet your specific use case, you can further experiment with Int8 SmoothQuant (Int8 SQ) followed by AWQ and/or GPTQ. +Typically, in the context of small-batch inference scenarios (batch size ≤ 4), the key consideration is memory bandwidth, making weight-only quantization methods the preferred choice. Conversely, for large-batch inference scenarios, such as serving scenarios (batch size ≥ 16), both memory bandwidth and computation density become crucial factors. Consequently, it's recommended to opt for a quantization method that has both weight and activation quantized. For batch size ≥ 16, the choice of quantization method can be model specific. We suggest to prioritize using FP8 first, as we typically see it offers the best performance and accuracy. If the results do not meet your specific use case, you can further experiment with Int8 SmoothQuant (Int8 SQ) followed by AWQ and/or GPTQ. Based on specific use cases, users might have different tolerances on accuracy impact and calibration time. The table below summarizes the tradeoffs* to consider when choosing a quantization method. You can also learn more about precision formats in our [documentation](https://nvidia.github.io/TensorRT-LLM/precision.html). 
-| Quantization Methods | Performance Improvement (batch size <= 4) | Performance Improvement (batch size >= 16) | Accuracy Impact | Calibration Time** | -| :--------------------------- | :--------: | :--------------: | :----------: | :--------------: | -| FP8 (W8A8) | Medium | Medium | Very Low | Minutes | -| Int8 SQ (W8A8) | Medium | Medium | Medium | Minutes | -| Int8 weight-only (W8A16) | Medium | Low | Low | Not Required | -| Int4 weight-only (W4A16) | High | Low | High | Not Required | -| Int4 AWQ (W4A16) | High | Low | Low | Tens of Minutes | -| Int4 GPTQ | High | Low | Low | Tens of Minutes | -| Int4-FP8 AWQ (W4A8) | High | Medium | Low | Tens of Minutes | +| Quantization Methods | Performance Improvement (batch size <= 4) | Performance Improvement (batch size >= 16) | Accuracy Impact | Calibration Time** | +| :----------------------- | :---------------------------------------: | :----------------------------------------: | :-------------: | :----------------: | +| FP8 (W8A8) | Medium | Medium | Very Low | Minutes | +| Int8 SQ (W8A8) | Medium | Medium | Medium | Minutes | +| Int8 weight-only (W8A16) | Medium | Low | Low | Not Required | +| Int4 weight-only (W4A16) | High | Low | High | Not Required | +| Int4 AWQ (W4A16) | High | Low | Low | Tens of Minutes | +| Int4 GPTQ | High | Low | Low | Tens of Minutes | +| Int4-FP8 AWQ (W4A8) | High | Medium | Low | Tens of Minutes | \* The performance and impact are measured on 10+ popular LLMs. We'll follow up with more data points. ** Calibration time is subject to the actual model size. diff --git a/docs/source/conf.py b/docs/source/conf.py index d07365555..db68ed16b 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -34,6 +34,7 @@ "breathe", 'sphinx.ext.todo', 'sphinxarg.ext', + 'sphinx_copybutton' ] myst_url_schemes = { @@ -45,6 +46,12 @@ "https://github.com/NVIDIA/TensorRT-LLM/tree/" + branch_name + "/{{path}}", } +myst_heading_anchors = 4 + +myst_enable_extensions = [ + "deflist", +] + autosummary_generate = True # -- Options for HTML output ------------------------------------------------- diff --git a/docs/source/helper.py b/docs/source/helper.py index d14fcb963..03cdf42a0 100644 --- a/docs/source/helper.py +++ b/docs/source/helper.py @@ -91,7 +91,7 @@ def generate_llmapi(): # Destination paths doc_dir = root_dir / "docs/source/llm-api" doc_dir.mkdir(exist_ok=True) - doc_path = doc_dir / "index.rst" + doc_path = doc_dir / "reference.rst" hlapi_all_file = root_dir / "tensorrt_llm/hlapi/__init__.py" public_classes_names = extract_all_and_eval(hlapi_all_file)['__all__'] diff --git a/docs/source/index.rst b/docs/source/index.rst index dd8ae88c5..142454d03 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -29,23 +29,24 @@ Welcome to TensorRT-LLM's Documentation! installation/windows.md installation/build-from-source-windows.md + .. toctree:: :maxdepth: 2 - :caption: LLM API Examples + :caption: LLM API :hidden: + :glob: - llm-api-examples/index.md - llm-api-examples/customization.md - llm-api-examples/llm_api_examples + llm-api/* .. toctree:: :maxdepth: 2 - :caption: LLM API + :caption: LLM API Examples :hidden: - :glob: - llm-api/* + llm-api-examples/index.md + llm-api-examples/customization.md + llm-api-examples/llm_api_examples .. toctree:: @@ -96,11 +97,14 @@ Welcome to TensorRT-LLM's Documentation! 
advanced/gpt-attention.md advanced/gpt-runtime.md + advanced/executor.md advanced/graph-rewriting.md advanced/batch-manager.md advanced/inference-request.md advanced/lora.md advanced/expert-parallelism.md + advanced/kv-cache-reuse.md + advanced/speculative-decoding.md .. toctree:: :maxdepth: 2 @@ -108,7 +112,8 @@ Welcome to TensorRT-LLM's Documentation! :name: Performance performance/perf-overview.md - performance/perf-best-practices.md + Benchmarking + Best Practices performance/perf-analysis.md diff --git a/docs/source/installation/build-from-source-windows.md b/docs/source/installation/build-from-source-windows.md index 9dcb3e1b2..e99540814 100644 --- a/docs/source/installation/build-from-source-windows.md +++ b/docs/source/installation/build-from-source-windows.md @@ -185,12 +185,15 @@ Building from source produces the following library files. - `th_common.exp` - `th_common.lib` -The locations of the DLLs, in addition to some `torch` DLLs, must be added to the Windows `Path` in order to use the TensorRT-LLM C++ runtime. Append the locations of these libraries to your `Path`. When complete, your `Path` should include lines similar to these: +The locations of the DLLs, in addition to some `torch` DLLs and `TensorRT` DLLs, must be added to the Windows `Path` in order to use the TensorRT-LLM C++ runtime. Append the locations of these libraries to your `Path`. When complete, your `Path` should include lines similar to these: ```bash +%USERPROFILE%\inference\TensorRT\lib %USERPROFILE%\inference\TensorRT-LLM\cpp\build\tensorrt_llm %USERPROFILE%\AppData\Local\Programs\Python\Python310\Lib\site-packages\tensorrt_llm\libs %USERPROFILE%\AppData\Local\Programs\Python\Python310\Lib\site-packages\torch\lib ``` Your `Path` additions may differ, particularly if you used the Docker method and copied all the relevant DLLs into a single folder. + +Again, close and re-open any existing PowerShell or Git Bash windows so they pick up the new `Path`. diff --git a/docs/source/installation/windows.md b/docs/source/installation/windows.md index b3d9a660c..33cd5c658 100644 --- a/docs/source/installation/windows.md +++ b/docs/source/installation/windows.md @@ -4,7 +4,7 @@ ```{note} The Windows release of TensorRT-LLM is currently in beta. -We recommend checking out the [v0.13.0 tag](https://github.com/NVIDIA/TensorRT-LLM/releases/tag/v0.13.0) for the most stable experience. +We recommend checking out the [v0.14.0 tag](https://github.com/NVIDIA/TensorRT-LLM/releases/tag/v0.14.0) for the most stable experience. ``` **Prerequisites** @@ -52,7 +52,7 @@ We recommend checking out the [v0.13.0 tag](https://github.com/NVIDIA/TensorRT-L before installing TensorRT-LLM with the following command. ```bash - pip install tensorrt_llm==0.13.0 --extra-index-url https://pypi.nvidia.com --extra-index-url https://download.pytorch.org/whl/ + pip install tensorrt_llm==0.14.0 --extra-index-url https://pypi.nvidia.com --extra-index-url https://download.pytorch.org/whl/ ``` Run the following command to verify that your TensorRT-LLM installation is working properly. @@ -70,8 +70,4 @@ We recommend checking out the [v0.13.0 tag](https://github.com/NVIDIA/TensorRT-L This may be caused by an outdated Microsoft Visual C++ Redistributable Version. Please install [the latest MSVC](https://learn.microsoft.com/en-us/cpp/windows/latest-supported-vc-redist?view=msvc-170#latest-microsoft-visual-c-redistributable-version) -and retry. 
Check the system path to make sure the latest version installed in `System32` is searched first. Check dependencies to make sure no other packages are using an outdated version (e.g. package `pyarrow` might contain an outdated MSCV DLL).
-
-2. OSError: [WinError 126] The specified module could not be found. Error loading “...\Lib\site-packages\torch\lib\fbgemm.dll” or one of its dependencies.
-
-Installing the latest [Build Tools for Visual Studio 2022](https://visualstudio.microsoft.com/downloads/#build-tools-for-visual-studio-2022) will resolve the issue.
+and retry. Check the system path to make sure the latest version installed in `System32` is searched first. Check dependencies to make sure no other packages are using an outdated version (e.g. package `pyarrow` might contain an outdated MSVC DLL).
diff --git a/docs/source/llm-api-examples/index.md b/docs/source/llm-api-examples/index.md
index 9018aa9fa..a4817c09d 100644
--- a/docs/source/llm-api-examples/index.md
+++ b/docs/source/llm-api-examples/index.md
@@ -7,13 +7,16 @@ Here is a simple example to show how to use the LLM with TinyLlama.
 ```
 The LLM API can be used for both offline or online usage.
 See more examples of the LLM API here:
 
-* [LLM Generate](https://nvidia.github.io/TensorRT-LLM/llm-api-examples/llm_generate.html)
-* [LLM Generate Distributed](https://nvidia.github.io/TensorRT-LLM/llm-api-examples/llm_generate_distributed.html)
-* [LLM Generate Async](https://nvidia.github.io/TensorRT-LLM/llm-api-examples/llm_generate_async.html)
-* [LLM Generate Async Streaming](https://nvidia.github.io/TensorRT-LLM/llm-api-examples/llm_generate_async_streaming.html)
+* [LLM Inference](https://nvidia.github.io/TensorRT-LLM/llm-api-examples/llm_inference.html)
+* [LLM Inference Distributed](https://nvidia.github.io/TensorRT-LLM/llm-api-examples/llm_inference_distributed.html)
+* [LLM Inference Async](https://nvidia.github.io/TensorRT-LLM/llm-api-examples/llm_inference_async.html)
+* [LLM Inference Async Streaming](https://nvidia.github.io/TensorRT-LLM/llm-api-examples/llm_inference_async_streaming.html)
 * [LLM Quantization](https://nvidia.github.io/TensorRT-LLM/llm-api-examples/llm_quantization.html)
 * [LLM Auto Parallel](https://nvidia.github.io/TensorRT-LLM/llm-api-examples/llm_auto_parallel.html)
-
+* [LLM Logits Processor](https://nvidia.github.io/TensorRT-LLM/llm-api-examples/llm_logits_processor.html)
+* [Automatic Parallelism](https://nvidia.github.io/TensorRT-LLM/llm-api-examples/llm_auto_parallel.html) (in preview)
+* [Generation Async](https://nvidia.github.io/TensorRT-LLM/llm-api-examples/llm_inference_async.html)
+* [Generation Async Streaming](https://nvidia.github.io/TensorRT-LLM/llm-api-examples/llm_inference_async_streaming.html)
 For more details on how to fully utilize this API, check out:
 * [Common customizations](https://nvidia.github.io/TensorRT-LLM/llm-api-examples/customization.html)
@@ -31,6 +34,7 @@ For more details on how to fully utilize this API, check out:
 * Falcon
 * Baichuan-1/2
 * GPT-J
+* Mamba-1/2
 
 ## Model Preparation
diff --git a/docs/source/llm-api/index.md b/docs/source/llm-api/index.md
new file mode 100644
index 000000000..27a7780c2
--- /dev/null
+++ b/docs/source/llm-api/index.md
@@ -0,0 +1,101 @@
+# API Introduction
+
+The LLM API is a high-level Python API designed for LLM workflows.
+This API is under development and might have breaking changes in the future.
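+
+As a quick orientation, the snippet below sketches the basic flow: construct an `LLM` from a model source and call `generate` on a list of prompts. It is a minimal sketch that assumes the `tensorrt_llm.LLM` entry point used by the examples in this page; the exact fields of the returned results (`outputs[0].text`) are shown for illustration rather than as a normative reference.
+
+```python
+from tensorrt_llm import LLM
+
+# Any of the model sources described under "Model Preparation" below works here:
+# a Hugging Face Hub ID, a local Hugging Face model, or a built engine directory.
+llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
+
+prompts = ["What is the capital of France?"]
+
+# generate() returns one result per prompt; each result carries the generated text.
+for output in llm.generate(prompts):
+    print(output.outputs[0].text)
+```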
+
+## Supported Models
+
+* Llama (including variants Mistral, Mixtral, InternLM)
+* GPT (including variants Starcoder-1/2, Santacoder)
+* Gemma-1/2
+* Phi-1/2/3
+* ChatGLM (including variants glm-10b, chatglm, chatglm2, chatglm3, glm4)
+* QWen-1/1.5/2
+* Falcon
+* Baichuan-1/2
+* GPT-J
+* Mamba-1/2
+
+## Model Preparation
+
+The `LLM` class supports input from any of the following:
+
+1. **Hugging Face Hub**: Triggers a download from the Hugging Face model hub, such as `TinyLlama/TinyLlama-1.1B-Chat-v1.0`.
+2. **Local Hugging Face models**: Uses a locally stored Hugging Face model.
+3. **Local TensorRT-LLM engine**: Built by the `trtllm-build` tool or saved by the Python LLM API.
+
+You can use any of these formats interchangeably with the `LLM(model=)` constructor.
+The following sections describe how to use these different formats for the LLM API.
+
+### Hugging Face Hub
+
+Using the Hugging Face Hub is as simple as specifying the repo name in the LLM constructor:
+
+```python
+llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
+```
+
+### Local Hugging Face Models
+
+Given the popularity of the Hugging Face model hub, the API supports the Hugging Face format as one of the starting points.
+To use the API with Llama 3.1 models, download the model from the [Meta Llama 3.1 8B model page](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B) by using the following command:
+
+```console
+git lfs install
+git clone https://huggingface.co/meta-llama/Meta-Llama-3.1-8B
+```
+
+After the model download is complete, you can load the model:
+
+```python
+llm = LLM(model=)
+```
+
+Using this model is subject to a [particular](https://ai.meta.com/resources/models-and-libraries/llama-downloads/) license. Agree to the terms and [authenticate with Hugging Face](https://huggingface.co/meta-llama/Meta-Llama-3-8B?clone=true) to begin the download.
+
+### Local TensorRT-LLM Engine
+
+The LLM API can use a TensorRT-LLM engine.
+There are two ways to build a TensorRT-LLM engine:
+
+1. You can build the TensorRT-LLM engine from the Hugging Face model directly with the [`trtllm-build`](../commands/trtllm-build.rst) tool and then save the engine to disk for later use.
+Refer to the [README](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/llama) in the [`examples/llama`](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/llama) directory of the GitHub repository.
+
+   After the engine building is finished, you can load the model:
+
+   ```python
+   llm = LLM(model=)
+   ```
+
+2. Alternatively, you can use an `LLM` instance to create the engine and persist it to local disk:
+
+   ```python
+   llm = LLM()
+
+   # Save engine to local disk
+   llm.save()
+   ```
+
+   The engine can be loaded using the `model` argument as shown in the first approach.
+
+## Tips and Troubleshooting
+
+The following tips typically assist new LLM API users who are familiar with other APIs that are part of TensorRT-LLM:
+
+- RuntimeError: only rank 0 can start multi-node session, got 1
+
+  There is no need to add an `mpirun` prefix for launching single node multi-GPU inference with the LLM API.
+
+  For example, you can run `python llm_inference_distributed.py` to perform multi-GPU inference on a single node.
+
+- Hang issue on Slurm Node
+
+  If you experience a hang or other issue on a node managed with Slurm, add the prefix `mpirun -n 1 --oversubscribe --allow-run-as-root` to your launch script.
+
+  For example, try `mpirun -n 1 --oversubscribe --allow-run-as-root python llm_inference_distributed.py`.
+
+- MPI_ABORT was invoked on rank 1 in communicator MPI_COMM_WORLD with errorcode 1.
+
+  Because the LLM API relies on the `mpi4py` library, put the LLM class in a function and protect the main entrypoint to the program under the `__main__` namespace to avoid a [recursive spawn](https://mpi4py.readthedocs.io/en/stable/mpi4py.futures.html#mpipoolexecutor) of processes in `mpi4py`.
+
+  This limitation applies to multi-GPU inference only.
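+
+To make the preceding tip concrete, here is a minimal sketch of a multi-GPU launch script that keeps the entry point guarded for `mpi4py`. The `tensor_parallel_size` argument and the output fields are assumptions used for illustration, not a normative reference:
+
+```python
+from tensorrt_llm import LLM
+
+
+def main():
+    # Constructing the LLM inside a function (instead of at module scope) means
+    # worker processes spawned by mpi4py can import this file without re-running
+    # the whole script and spawning recursively.
+    llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0", tensor_parallel_size=2)
+    for output in llm.generate(["Hello, my name is"]):
+        print(output.outputs[0].text)
+
+
+if __name__ == "__main__":
+    main()
+```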
diff --git a/docs/source/media/image-09-29-2024.png b/docs/source/media/image-09-29-2024.png
new file mode 100644
index 0000000000000000000000000000000000000000..840c76907b7524f03ddabf0864f72b95db9aeb2b
GIT binary patch
literal 178826
(base85-encoded binary image data omitted)
zem(S0so=v&ALm_XH)@`#fo6p^onDjiE-s!kkqM%{Ebh;$ghDcTxxnyWMPW8*pho-2 zke`4t_gSqIdp!|aot@IVWO!HqY@T_(zStdCWYqVXi4Hoh#A<}i^RW2I+l`H>PGR|| z5bZIoMzPIwqx5m6*Yx_S^F|p3QH2ve9VQ}RcHQutcDwS+v30dmB+%z&3)=O0r$mm|5EVxSw$TQI6gPI z(MjiIg~@FPUlLn=pdF8jkqQ$}W`Qv?h0THG+s&uvlIr>$c+*h{Y5ngjb2WyRYmenk z2r?c+7oV2RjyR|q#$LSiOwc4fy&2}h-G4vGq-#zNQ#gbwVt~E6vOLC&7`r@wGV)c- zyK~;F&5dPNC69~Jz*FlzgI(Soe3bi^{FUUxva6EKf-WmeiDr!Suzk^VAg+DiEmAIG z9*2pCqmpko)MKFSPu&S(q9Uj>$~my!bc_dl{1H&v{ygTd|IW;EXW1Bxn-2Jbm$zgR zkpR^ZP{bNC4H~+JUx<1&Y}&zINBL8hJ5N&;!HKZu2$S8GZx7vWxZMJYU>%_@6n+)` zMNJ7QRNf!k{iF|-zKtr0GPuJ(;2N&GbEwy~1I|^Zhg*R?=L?-_Ro4DmxiLloA&qOT zkeHaH#k}&1Nu=L&Rn!mpeDgcq@osW=Makz@c}>y;)VI4t%F28(%%a2T!^8WWIR zHx8zLIC5vQ%yZALlJ(bt_+@^|?)d26Tpl1x7coM}VhPY9nwa#~j>=ubs^fpy_SmZq;10q8si272~&a3H$r66NVb^UBQRT#1OO+d;c6Us)O1sOSzESFR(9CT09j@ELB~D zFW}+0A?Fh`?7%ejP%H&Y!wpqN!Yl0}*v|!{mYI-=6EFZRwXKXiVc@gjG=`af-QGV= zs7U9T4i;`q)!0NiPmRLed8q!3LnF?L^>-2FFA_Np{ab;8A*30BV{|fO7Dy2C-R7g zzKT&!yBE7C0FDc-gviW#roI{pZZ)Z7%Qy*z@qNFZ^x6kdF|)3U4LTL+qB3(x9^2+R zQn>n0csN8}wsCcHilVaFfXGYO`j%upDM~k%vC*y^E~xGmP4;T=VV%=ymT?~L z`1aF|kiwibx#7%9MNk+r#PjOy8JdA#M_0*CcXuB@@*7xURHzVmX}cSWh^m^}Q`WRE zopYvG3Ow`P`0H8s0{Il>2$TNQCVqK(rk;mRqFIO20(RVOYOHDd-aYW1s2z# z?$va}p(C^KjGWpEM2_`L(NO3Q2eX?3@12wM9L8S7(&B3&1I}kZi#4=$WOVP}MK?6SC#Hy*R7 zeGQ-X&CQud{M_+qX8BbFs2$(awBDa3ZeBygC4mD_+`j-D_1Ri=$8{KKLxC(|jpol? z-B4Ty393CH*>k9hc@n1BeO)6~YO(iqzbk64t~wJrP+0<;eo7w!E6ai=hH1KGUht6= zR1 z@^W$w7d|Vl7P?4cnB=<;tWEB#w>@)=lWiNmE;)OXdD4owY@god5Tp#wn^F-CK{NE2 z|M4{~E7ii_ellX0&(TdbYjGpds&A~s9#np=p<;?}78P3_ptBhWh zL>)pEYHf{ny*Hv%mv>Jjt3Gc&&!}1yR8BJWa!)Cbqo7~WD_QK48ZAQW{My|ye>C(1 zpoISUeJ#<`8rHop12RzOwR=6i2*@*xs|mv{=CtX6vgdqX$8lalTH4%)Dv#~wmEDcW z2^~_mu*1}$uu!k5pCsaFW=fMd_q(GH71oSA$b!e=)Hs+)&ic6C7u`1fhgsR_4qxv# ze?sp6IZOpAq&j}ambFRrZB+%p~pIP$x|H-if{1dFH zdJSL1wW@ZGBsJw<=+{c#pRR11J1s#gN9aD% z>m0g|4`5dVj)ovY^c1z)1*>4{zpGQb z)J1QN&SyocjhyvQ% z2nBT|2*M~KOLYd8zQIBF>*3XPBjgWVJK0+1BT2umngvUoi0M-e+=7%$D{4LrMBM(Z zQVh&a)E3h{&rRp|i}MXJjhF!97((g7?#W${9WM{*r|zqs1wh`N$z5GNxApiam_qNV z+UKuv?gll+W7OyUUTb6>cl);94a~iM7Y`KhJ@D%q0|5M!Bz)wtVAy^lyqqo-Ra_89 zZG7G>)^n^H=^CWuvHzi>l8~haD%jF~x$JDTsHSmGg=&v=MCpA{o3cv$LW(tK$sV#Y z5NA0V8osSlvV0K`-EzVr467HkuaS5k{(xjBeDmi!Z}!0pa*A?L*Oby~LAj=`3Nwwk zLjha*MI%1haa?Ac5psH-lqaNuLj#%*1GeX)L?FArn9R=}sO86hXs_LW#)jdj5FM-D~BW^Ufxt6TQsv*Q}W$ zMty!m3w0WX(;ZyyOCONl6Uf*UfIinQx#L=LII(Q2w;4k*;jF77@Rln#SxYo7w)KqD z>Gzf(^gce=M1avt2_WI?K_|Mp-fVY(Q6xC5Bi-ve}0?2`6s!X*CPF4x6MLMxr6Ww})7_IL2- zEF;=v67v)e#|6JO53QMb~HvXS{AMtPZBZUvX4hII%;-94f4AFwXU&p?CCf zV~7+SeSTj`u`k0-JoS$}DnBtNdhphlXNUI|HtW7GN@%2=nu5)~SzQ8SJp8T4gH^`! 
z8PP__(5#3mB!!}FKX_h_u)+Z4x4){jPVjNQ{sRK$Z=pQfd-ephibX?*&KSVQao6{4NnIFWxJ- z%pR*$OIzO}jvuSh1x{BMX-j!MPK@EbVYrTH{x!-VPQ&F~WJ`6P3?7PzV75yLx+BYF znz2!Slv?&2@#v*Jn%Fbbd>cy2hE<)+AMDK#(NzqOtHbe7QtnzW{FCPTxD-k7v@Kqa zgQBc?290qRtq+A{N=xa-^hU2lboVMzlQh>z-6lr0Bfn!|T&de)+=`tysO`i2rWft+ z#FKb$lnAPX8&+SI>;PWuT(p;h{>j_Nk<;~xKf%Gue!PkU6GnTUHtncz^6J}31B>W{ zp0;eLQo~#iq*bO!>9$}VJ@SsK1H#`dkxeH2=jxA+wAX@&pMKp3YJ}7GPCH{ByU#<` zj(wMa=c{&2eB7i^c-iIGts8T>Xt?c{k1BcVV--%MKSPrlrFIsVX@_#nn-p8>n}qK* z#hYRK-wNEeFZBwEe%qfdO$;*;uF{jkRZj;HmfO_q0PpnUr!UdA+^Z9S6C5UT9^ixM z4c6_sqS<|_o;^PXraykXjH?4kS0z#v=h4wov!2EBHg(RjW{F=#a>P6eY;**aIpwo( znA@<>g4&aTDr-XXgHyT&OVqiOZ5OlA#^*1#t8!Muy5yH=UM?)-qp6|dpW;0?#2mkV zj;lvWc0mI+zypHYy)jTBm++_?6^DyTx?9P&u_tb@Jc4^a#Nrm<6}GYX#1QHl8G-W$ z$1{PUq&XhSu!7mfl4h@J7yHTMko@*+N*EC6Lj8%X+lNI3ak8>bw1sbNzl;0p9>ln=o6DqAPTepr`2@S2=xL&|SUTJF%JqlJ4jrP-J_q$ml!%WUFw_iFuU~Cyl5>FdNcSzguF0 zBjboaM#!`F!Fb1%j`<}G92LI$8`m*O$;&XlfIJ`p$D;rHKM-)M^tBn2H6`?7MO_(4BJDm`@ zR0DTS$|pSIMCmtokj-{Lv^bqFwemR5RI6)9=N~hFk8b*A>u@OL^wCqh%j;fgp~bj3 zY12YaC-NRt>9~(uY1CA$^ZNMlkR@H*uOgXH8~^yMdj#!TjB}sN^sHT+tieFiZ^G4QOeH#we5o5#0_!lc6?6OGRuKi!)v6+4;^{$z}AO1eq z;df53P33He2R|M%Wi;70@U8x?w+FwI{Mp|xyU>xMe5(T}*hq&L>Q?PajeRrMvn~&U zP_f;Gi9b3P52TtiKmRdkEH=g`c{o-~_pw3LKD>kPlw={PUWfIOSho?Uc1@|gpU>*Y zWN*UQ#3<9~xp)@?MZLhnjj*xFnNYUO*>M(nA2d(L#Qgfzyz!?U+j>>#DDoXwZ{RDe8&^<`B?@E7@ z$i!LUEObLDNLTzxXLGX_09YP0yp$@>&C2pw%3geXUK$T1kNQX4ceYb!G_d3B2j8_- z8meB2-=DOf0Xg==sbg({X_IozH`66?ynYG>TsghHUot1!0TpOZMJ?Ms>F7WR1>1N` z)}oq#U%1z6jlme1xn<+Pe4uNd?x{ZbmB+6bBDLpnu>;!vT6F3PR#G?Tqu(2ZGh<^)Vw zGU?+bjV>=mJ_DBQ53Hl?m8?&)`vZ@#t#_3EjI6&OUhPoYf%ENE;~u5RG8IETa75!M zUrouqxwDH%0+B#FNR=tA;aRgk!AiCVylkw_8yTW=Qxr#a`Mp6^DpZ(MRvQ@oyB(Fd zp?0`OZhut|$*bw!reG7v*tyn3HiJ%EY|G70``p3M_UFjE%X+_~$jHb! z$ZbKKSe;lRK0EEo2uAJ}`kl|brtzOtwR9qQQ0P++#PzXRrIiX6-C-U|NTA1*FHRhr zth42bRw-TI;PJ`q>t$lec?L93A`!_g40u$aZz`LPeVo8A4*@R*>lF|di*c^#vIZ(z z7!U9HeX(Q(XT}H9g&Y0H1{5-PLewH1A53>$hrud~iK{sgP_U*c_-Qg{gOg6}7l2;W zd-bX`^t)NE=UuN?-m{<)sZ0T{A2d$jg+rLFvzD{UH|zh=>}FHwi(U?KebK%C)iN(6xNjnK}F{ zw9f9qWq7e+Zh*A+-Msfg8{THw4@c~T>9)?0!MDK)bCdIAO!!Y3Dxx5$;g|1)LeFsC z;KD{{k4?KS#U{lnGSjywtiaw1$5SOxwU;ZJEJK-72^PO*j~Pt+zA-`bLGxu)ZP4ys zsE|x+I)B{DYEJlS#jOi2ySEB;u631deD6blmrQo7X0C_gk5yzXM`ukPR z6=lmeGj(e2*uM&(0g0W`6#l9ZSGSqjMtGodA+2-xK?XPVPAXJHRA`0Tf?z>Lm*a)meiJ`OXB(Fr!N8VK9h?}@ z3~_eGJ94D);_#Xe?5Ec`oPT*U*cBI>Iwx1@Dw?6ARn6Y2cjz5RYZ>fWve)Ep|7Wbt zhCj~Q?&&M$a<CA^i84gJTN^nX0cp-4^-2+q z?}aL0apjl=B-U|rI@^N#P(<%N_cpaIma&RS66~WMS%R_s%v`QyFk3gTNiQ1im9en( z(p+0EN9Y~tel6B`?$NHOc*nrlhgl2ja%o}LCM*CK-5E`}pfy7pTc60ZacxNzvW|Ds zFSPu&UvcFr zmX^7gL~o@6QA$6~?hr^x=hBDa%}*B)T7cduX25kh{Niw;)gs-eP;ml9??F_%2LHJ0he zDkG#b-Wn>{xrft!tA$+q+uAE6@zVrSnTcM^pHt@xKfW=+TzY5vC}7t*&co-MySP!0 zSNH}_eL(pU(sAwyxB$*dyDEGxQU||yyN~LrrsV&2h%K`B)G>k%HdTwBpMKrKzX~Q~ zm3BeUO}e|C%CWXRT)z8y=es0Ur~&hRn(bUcw`5Si+I3!UD^dQT;l-r6S5svzqM?i~ zB5*Anx6<_Gvt36ziOp>-k2?)D0|pc1Ssa2 zht@Px#_F-EqNMOQtSHLhe4##T;L3Ym@}$)rS;#$#YLRo$DxHNifFJ`lHmUt9CIAN4 z;a`Gt%Y##QlT|=ilCueP8=4_gZGfLUWd`brP|&kKft_7fLkzp&}qesvO~$$lAlYwXu`VqIkdYK*|c8uuus+6qPPE+r4A%M|osya^Hw8`ilsvN8K|PH2-DOW54-OQn|pL zj$^vF{sZI|LyCecW}bTqj0$Wm5fUdoq$)PiNQwhG`h4Bi(D4|-Rm;jjaKEE?CN+x&FFY7^WkhibPdRZdsPE;2%6Wj&QI<4tvg;Fm^PY!3a<1WBz8c9L5-dM-9qzW>Y)YZ+%0w|xF+xdSaP1oGhKy7+-@ge5X zG~(G-<=AU2zcctp2XuS8`V+mmp9sKZ8k*z7CKZYLB51+u7_0+BIK|Fx5K%91M8tj= zF0l)2vCXwy)E5QS|L-wCUtFMyXv>iB7WbmE_2y<4rpBk7>cQQ|a znsz|O8(_QJ=#y;6K{(#7lJSufqW2_1xkw2x%w?>93>2cHc_$<$T2IgQhT?94-B)W* zbD?);$7>$Be`bpnGZ_Q~G@I;Uc7wQqeEJpV%S8j-LQyGS$Sx>$M92NJ>OUIAkV|bE z|FYA6{~nX>g;Z%bFbFcB4szR|r^op%sO);Kdw1zLs*Yy*^7A-GZ$qO!oi?1hy!fr* 
z`!1_tMbn-Tet-J>;CQ{8+_fHX`JJZaN#$gXig!qZ;*S`L$>a!TGCEF~?C%{U)j#H^ zs}sEsD0Kh(ch>1C5shWBpdZIV6w#F@(3q%*UE0v^`NDr-N|6!YlY0+*7FrwvD~!0* z01>;nvf!6ri4mo>o~-Bv#Y(t}qT-8G*PQ%~#>#3312XSlm`x_*#+_n_9Px?p;BX$; z)Bn40{QmnhslHbX^zY`1Pf9H@ydwTxe&%GrTK(^ZLyvbwMEu_iofyOWKQ{nhg0O-` z-~aED4C$f%-|hLdy~c$9yIqyYq$d1-_cKRuL4g(ZpXl6XdIlO>o~eE@Kl$0!tzLFFIPWM;v;&V@ z>fe}=|DFg`Qj5*@{wniVQc_Y3@3?g`Lc`Vx%a}P5rt?v}A$G!yJOYxv1z!&MgGB!+ zYU?v3bBrkT*ZKzrgM+!nsGQng2kG;2Sien4cK#}0?!7YRV28ZAtAh-p0cXxM4Ao$>J!JXhv2n2!? z+}+&L(eT1TE-d?D}?Oc^g-cJ!Pt@k+J%TFfjan8F+GWb!B$6+xPqD?Ac%U zR7UchV5Dbc5o7Va6tZ4uE#X45eh)B>6}$XTVNv3uZvB{Q%cETQ_8iAQy|P=%xtk~B zTUZV)dFn9=%(8ffNMBb3MW+KcFs)}@k3~wk!i4+Y>%mB7s~z6BgtO;vLD5Brk~mQk?%&${y2z(Ui4 zcQk@}G^R9k!t7%qT;6<8RRax&1A?X4Yn`(#brtiqLBIa%wfiDoA$pqP#-f>6qS(N% z6s*Tv7WuwwL4->oZX5oJ+*~!zaZo@Hn59rEW{OLlBLGLjBp$~ux;K^pms%PU3Sd0q3TRNyjAyQ+iOd9E6 zvAwXU+0ovxH+=q@(sE?}uB!p<&M5dfY+dRsc!jRYUe0=rPJhuuzFAbs>tmWdQgjDu zCN(>qLEvzFFkxU~SUQ1QA-dJI)BAQ!GOsYLYMJdwFC(uQe?H`7y=(2SVxU2tzSLmW z`-hI>anHNneG4Ohgs;svmpo;j!>6!Qqp95YKe?^*v6{bTD?9lO2rFabl1ChP$EPT#h^LkzXBIKNr6vWl(>|XtN@`Q8KA>UaBy&d9sgZPXXH1&xI0m=RP)m-AwqzZ zYlq*}wf_y)t>9C{HQ(;X&ODr>zMC~&?Bnzr0y9M401QSZrec6vaydjB_vr}W4J`Go zc&;(arGOL8&mW3m_0}b&q&~eGcb?GHL>sKr@hHV>b;7_mUc%}HJ)aM}gRTV%&!Dwi z@GmSy3ci7#5@<|p&uLM?P295cbygt@5sh^ya>Z$%W)aU86))Z1+a77oeSWtVAUGM@ zn@=X{)O@gMJDyBUi=-8g=48B9sr`k%HWKaGCi9#}M<1NstSamUTa1`HgnbvNu6jmuJUcRP`eB; z6KUJxpncn~$I0{AJ*y#`8bpm3R@3E49dT^yfvX2F3|l6HIFEm0Q>Xc{0DZ+L24@xiE2sUq58p##jh7mSq>^`*w@hQ(K-&-O+7T9 z4;E{iPXad~>@KG}CB{GO1^Q_F%%2AFwyP{fE;F0pdL8R91EC{^W`EMyZ&!64JLv!# zifreV#6Ni$Ki8YCaN>96j7*J*Rix$ao^EIrec>fBrJ~)0C0Q;}MI|Lml4F(H4Q2q| z7Nf0J%m{vkYpnYL%y;{`Ab{4x82dg}=}Wu7J5eXk$W6t3QMr|saQ>UyiuU&APZ*3^ z45z0o!Pwwa)U~HQmyMl^&G;}j5uZ`tyVKiQPfG`n+f4&}4OXp`b5W}ZN)TCnk+^W` zcM2GffhyJXAYLhmN8f+ki-fIBDI6$7gH5b|h6S5IL+TkT z+sV!rVeomo0#;}8tl#kQ@ewlmpflwNHJY8G>O7p;M6HtcLxYQzBZ}p_6NC;PkXYz> zJexVMLPk)1NVHrLA|6Of^5!>OggoxUK*P!gM=AE-i|);yK;EQpH>G);(mEsDe&{2~ z!ureiTO@Efp{Mc;9=Lfsuc-xJiVKChj{|D~VrEvjG(TVO#V00aFlLG>00FNbNWPAa zXO?KfbAnzphwI1UCy|DV6agNeag%I7Td9XBvjgb|!!SfPq zp3a^)6=)qRYu#5=_68050X59(=))NI#nvljeD(&N*%nW{nJ*cFk%XgW`QE3DI;~Ef z$n6zGj(KVRk=j(%^w2O1!i5%=0SbO8_D9q4sI$bwzSVrp8{(;$4nE4;GyYGm0fwU8 zem6wVcXO&bA1srL{7oQ znX0OK993Giv4BaDQpU|uV58TptPWowd1PzCEO>P)2kBTmtWi$KWK-h5JcZ;FDD zB3Wz-T0Y)SJc>|jcc}V_g$)sDq1>Q|;Zv+2s$#_*yutx3=1m;3*EaaAH3iT=L$B;I zUi#&AcwPCJbDJY(Sr&TCn^YEUKYr%*_Gypic;68K+A-VtvN^_Ym=IXS&t3SwibST1 zHM2S6yFZkE6lxF4oYSdV31l3#@4)o*aU@)^@y>`*MGp#xUyC(W#a+@z&<=uBp~C{d zWVOB!i9+)*k5>XjA2KeRm;@%{<3G^grJH~^fR-u|65j@>2)FK4Z{qCcQHk{~ho zlW1g3ZFfpU?X`GC_f};v$KG5UA*^(<^jJ)1oyX!NF{aUK7L}jOKa4|_b3hzMPHnyD zt#zb28a}o2AYQ5dTnY`Z_=LN&GoiV%_f_Y6z+OgmyPG)L`jBYa`e>2dgaK{@P3t*L zgG0*#VU6b;Z;SGn9`y~FVX>Q2=TH%+H>;gS>-Oh`&|0TRGU2G6qOC0-6n zMd`Oj^u;=7p4der@BM8cfj9)%lAABBsk9U+L&;<)snU+-6mG6#Pxao!5Qv-bV(QgIa| z+^_4UU}MF4eLs4CyDX{EeMOOh$x^eQ&qp#@2xQo(58gOh?QGAwy1Od@+7(}aC~0q% z{O@v=`)hb!nhEdhn5j7sqQwy+J)Ia$IsB;~iA~$N>f8-3)(xj>adEgfd`l!Y!c!=@$x$ODd-n!vc1XXGsS#wPQqwOA~KCk zZ=;ZLr^>9Xc=2cpX4YfhczWhV9x>6&l{ZiX`u4Rh&UBN)xTpqO=j)L@w5#OTA=l#* z>xB+=H!xrU>^;%UWk(^?1-qI!SR8EI;U{SIlOXixA!-VP>#LSoV@$y$|Cv|-xv*21 zu=kGcVy#Pb(d5?=ggxf$@WKwKfU~CO#)KM)F1c=;bqS12DSUE)w-6LRm;7hSJ(nI_ zc90i%1TAof@5%e(m;mBLQ^%&qyB6WPSKkN^+7+h_wH|7Dx%{@j5@~f&Nig+&Ov;)1 zB=duPL7{Y}UC%O?sOSWA3Nq^kmOP#|v$n`FEw6NK zn>#gVR~y5cUEkT`x+G5`74l#(4gGQWff>d`Hmcn%wes2{rTJN(r%lqr!iq%5f-+?>F-wK->(wg5msKMw8tI;7-K&nP zC5*a*(X{%ZejOj-&EpdOR6cJA2MlURqUb99K*<+(>^|wVd2upAu-2&>iSh?@vXw{@ zZpR+`ts+nr=1DZeP3s<$RsmuKqgHnEjMN3`rN!Eq6%;L=p4vTB*?k;lCtNtMaYyl9 
z=*5eMTm+Z?2H|FKX*{~aPiR24p=ayWCr{k40#`lowWhkqT*9?dmXMgri=Rwh*34qPc!&MMCG+nD($ z{?_;_`PX!3>$sBW+)&EHJG7=&dqL$?gb9cOx5bi9;rH8`OMt5an0@i>a8-$P)L*}T+FwzuEizy( z0=eF)9!#M{)xFgdzAUx+@W-~DOtXc*3vY(2Vn}(04atD7gp|TID~12_u&4@@rYr;) zIA8*IKH`_~4GV%a_omE;+-(#m4Ij+noKG7lE-ZgHyzP&I`>nPmYT^sEw%*>QNRj)3 z!-HPmY_WN4m>4JRUEJ&Dq^2lSQ9CGBpU#|pM;c;ci=Wz5W#B=R8$TCNX_j*=V2|sT z6vLol;G~P5O5*TJ9PcxjWhkF3$8aZeClI;&^P4jzlKYG^i^cZ^FUo3V_6yMnEV2N% zaE$7fDv0w(4>f&B@86La63-(!777DD`@bOHch3kRqULesZ!RLHRkUu`Im0T2d~b)E zj>>EKnnjH~w%T->1JXeBM&XHxX|i8Y7G#w{ntjpkVHgK+4$azg`i>u-&lL|lu~U3aGHCV?zY8nEDfu5F$82#2GjXjTS7~MJG3jw&iGR-hNbMQG}4?ioeQd(+~GI ztOQ_FJV?4~vQSb|+AKDf`Dx%wssOLCz;eGPEL%cwB#Fm3=l13<%P{_ZvGxb3?ZZLB z(SEc5>-SuW&fwV>z8IJK%Tj}i*lj$xqqIc&yfj49#5P-~ppLLRYSq(?v<~};z+!_< znY;}Xr$UpbQ;%Qvi_aCq!_%z_*VXeLh@KPP1-8}aL0K!wqS1GN;~`-4@AZw)mi1s_ zU|FnCiXIO4@l1)Rm@EGacF2l9eCgTET|#Cvpq%DH;p8wIqE}r@Rfh?{$!O8YDI5~7@Wu3GMy9&AQ;V=q*FDZWzrKbyXiNmJAesN(UqHP0 z5*L>WpyF@lJG`&ZzGpN3o49$DduR;e*@A`k{eJ)0#>4%SWTXYRJsp_atr*yO!+T=D zo~!ur?7RNR?$jB86+r9uReDDc(v>YXGKoF;`|7nropJF{c>OqnHMz^zef5#Fdt{sd zZdS}!gSjZV+h1Yu_9Tx$t$Y0p?dJosS+V700p)7p3e(G-+m+qb zWl2_*%lG%(_%jN1QKgY(KD+6!3C}fuX+!@T_3*fw?w1T<=r&$dR8%5c1st+wZ*Q;3 z3VeM-gY{aM|D`scKo2UMVwAI}7AE>liNiDhlzp6`w8XEwtq+!id*o(w<=%nJ3@sJ6 zp>2CRTQb>?jj3ltNLrJ2`rUHEn)smhx}(gvZXVCVT8Mz8?yOy^x%dDB-TF39N5ESQ ziaDiWG+{Ff#ccYKFvh~6QU#+j0*`r{sq%c#*90Y`yI;!3!G)X1?FCex0yhryx@{7J zNX1h?ZU^EKXr8Y_-U%Up^}#x&*}?0ikJxoxN~q^Y+6i|uKJj|GcZPoBH6bQS>C%v5 zFZ(Y!KrjEi5ZXN5WsEb^vLs4qaKC~1@o^TMuihQVgi4NNX=zGKm|}mi>;Z+0@2{tD z7uYc{d4)#tB>f6r(Hs-U^?BF-}!;e01 zn=eRF9PrN>)t~L|O2NXy)-yYrV~U7$H=H^ayi>jf+RNeNfByU_`uX!Cjum7$l)0k) z?&#y|xgR-;%iyX#`$^#G}es*MTb+#chJO*zsj zy^;I0rw2m8PRI|vIr}+lJtW)s z1Uy7#<!ovMX|-b4A3A zsi@^|Pu@Av( zeo_H>-M?H|)NlV?tE|`0Wp5CfD0n$kF${uF#=%;SO zU(bIgDM6%_2Y>LgY1hpAbs~EOSm@Nte}N4Ps=WTp7pIoa@D-A5VRsNi%PNrM!ZE$0 zgCuR~;&Q`ldU?FQuLUUChkdF^nBWvkyTy#XY%YJl;We8t5oxtEoZM*hw_KLocDAx? 
zB4v*N77Cxw4+4Ga+Dg~f);3vDBN*?-7Nux!YgbnOB7#6jwi=M5nrlp+fS>y=D#x%J zTH$nbfCdXPUv#q;llXO;bQTd$Dd8GusJzjJ1WP6xDa-wjU64P371s?G7;r2Z8>F@CU4; ziVHmhgD$CVtbtKOk#XipCEP>B+>Mno>rAPq6A1W+Zo3;^9NwM*JJJ!wZ!BRWjIYJb zrJny~V^D=;S>rT$lLOpq5^Qu%wWNdj)e~^ge9nZ433_zK@wde6_#E4Fj}P>;|n%l%HR0Cn-$<6MVz6?Al?p%inr^2zkf7q;@;U zLB(lbon;zLrzsX&)3IJi8(x7qD_P4o_>gC-roovW35MU&NdTnZBQk2Y&s)c^ZK;)5e+NXOgZ${^0o_uJgfBp^lKh7IcYq zS%{O0MMG~XtW+Vpht^OCIM>>N9PD&?Oj`9r-{Wvuxlxw)k4tt!ElILC_CCwx<|g!-YWHew z?eBkLVPOehBlw?q54lQZ%!Zmm6D&n*k;T*c`)J-n=Qm6cu))2uf-dp(({)Koy}xbx zd@gOFLPSaC9Vn7ydxPAJwJDeKyHUY({ltzQ4<^;dL>_V>mCPc8RIcZKAF z+Omtu^4|6~Xr$Q>-oGyK8{tbs*6rCB77aePCK)B3N@dsw1>RlF|I4K07eZy2pB9Yn z7+*k9*x98mG><9YB9Ltjaa7xLEEGL`BEQbDoRCwj}~*mf812jsNYBscv&6JmW82-TJ-6HZOwID}Wb-N=R6!Bc^=N<=VJ+EP4iGMq0RAPMxRgLl~ zywuHi>aB;bQ%F%HpohHk(5Y(PHQ9TN;ru3jU0GfyOd-c#;?U80@HR-*M9M3rTN8gI zpswDS;olR2JCgCoac|L-iwVk(R+Y- zpFua>H{B%jRh7nrEHeT3T5V6oO8Yv5d9+Jz40;qA1bLO`-|y_m)KL!cQykJo*?#4V z|5@m_8`%whlH@a`&*hHhnR@C@8Ri2~7U*#0rI2St8R=#QDaVH9yUid3&eEUAh))) zyL6|s2KboV8)((83y<`tz!jX|p=veHxEUFLp^M-VcB7O0NQ~K7_tXBe1h5QMlLXho z3|9>HjJX{s6uw)lg*czxK3|UtBr(vjD;x$hpFd5UQ;`HLz_h-nXNuf?X^Kz8Q~f=; zQ1?lD0~^86n8?Bp^wf#FvvY8ut)-M8?PO(`;8XEm9s)iF8vNxS;UcI)%=PdLp0YCT zh(WG@(@rwYt~V(0XKHcQlp6Qjy=F+N^>#gSNO2f3g^7Wt9K8 zQMfYeKMmQEVNO=>L?4s@_+sO6<)7;rwg#`kBp!#uceu~jKVvqnNcO6F6It&M%a%Gi z?>T{@EIowu2h~{~o6FsWSul(^EL!rI2fpVy_A{Vx%kS*OgBG->5ga1(8c!`@1weZ) zTMI<3f?T)9MP9QC3Jb?uvA_C6Yyld;Mb2BlzEWZwf4G{Ez*4cZJJ$9W4ttCkr8x*D za-=8Hw8Uw9$*fBYc)Hgt8z(C7c(c*0GJ@##detn`WyCkKyhb#ol>7AF`!gxq(LxKU z)Ksk;(wFtvwjX9=vJ8R9rvw+%@N25fi`>P!l${YB{z52rXc#kh%=(oKx23O$=YfVL{wIlaezXQajwAO6*_IV*l;3R z1>)2y@b+q*WX3XNR&31o`k3B(iRg}JUHK&he&=tOuWj2we#OvezdAk#&^SLxO6@LA zgO`^5OqHMAcsP@F>izJ(?oHo^huT<)Tz4sFXMI;^_l--`RPxSsOVFtoTsAMzUjfrH zCw=$7pdTQQ=7ETtzq#mszFj)F9AQm;2-QdX2j>cm@C}oh4*m-y4RX-p1k{kPToBfU zgZX5XBuyGiO-~6})o*Pmb4CKLmOQ4q%ZtchzxWAwQq75B!){xHQ22E)7ZvfggnqIbI?df1@E2-5mZctk!Q=O=Zz%bay+Pky)jn{o@*gTn=s| z<8%3#6WPDAqTiHI6SbF%fvtq>x9i*mYm7G%St_pD!2~cNO)-N{l!>(a@!^-h zI3Auh)2(zrPG<`CCtDG?ZI?dc3w^@EbQEEYw#`T)8r(1=zFMvCup#-96kU-7{vpw( z;!z3-eBp}RRWQ^;kyN7Kwp~f9?tD~rxpPX|@g2m>&8t~^NXJYq*k^!Si`+AfY}X*q zboSPUsPzVO*e%!RtJLtG4xQ!2!|<(rSCg}u>|$tb;~RlqhE&a7_%$v8f_)3w6@sX= z{}aaXi!D@(Iopz;tq{MbT6qoU9-BcXw$EX7x>@}0iGJdJzZb}(LPC;~dyqH5Z`?4Q zoZN5+h&&v{0?2?$jWFQx!}Iz1U$@O31C@KDy=ot3ewQw9xz#K=Hs~M=v$L}^J|(4L z87(z+zUr*l4gwrjrI*npOl`ZomzP)goBZ-$_$m2h{e(kWga|u-p@?8CFhd+D=DJT7 z@jzNw^t_T?_D!|8GT}ApLbQ~j&071gAtxUt}MY+nMqT6ji3W-Vn|8`i?j<3+8N644_I24TDwdsT-h%|lHkx`RR^o3 z(h?BsaY!Vat*mZnE(qlMYt4<_y|pw{>ndyWbFa<88G~zach#2yq#c5VQWpKYo}JA{ zqz$sTn8IJo36}1YpS?v13qPf%oyfxEdK-t z2VW9Hoq&=~R#8#IJnnynYO>6E!zs?N;6NLmYgLlFna4|Vyg!D9Es-C_a-=X7SzN|& z?Ic#+pJKQU^HZTZppV1d-zrYl>$qMHu)H}GA;}q1YeH^xGR6Oqz?nL7YOD94B&o`lLH|0#2Pa0+H+K zWy$?9g(;R6{X)5NGZV8-w9N!@(yYW27>OO+tzzSGrmEY7xuAe=G9PBB+F zI{>;vR5nhaQk2DveoM7ua^%vrl8$f?^d&-&x;j7(rD$$mwG$r!8SUNs zp@t;yR=;iM#bMSW2obUa+fmJNLWa&v$hsbU{?7SEzZ(#=#CJqLfBqapWJKi`Lm`y$ zUOr6}D926y!ikZ*MAqV0FRQzSZ>1dM77yRSIy^_vTq+a<($jjb2drV?^;wBIS^E7v}Ca@h*1 zX*&Tj5Ng0Eq^JuK0bvq=B!h;B%Nuta5D)s;P5ML^sO7q@x&yzD6?ebaTrNQJ-lp7^ z<5B2P1^Z&3el5HxNLy;K4{XI3f|2DJ5oA66v_s2Dn{*bWkK6AYxudQloAH#Blfs~( zzXSU$^XbzU(9Z>nGDk<5ZPYx#}c3?N3aYoX@;@Gh$@$v?OXtr=jn#;v;_2A~6UtD264wVuVp6 zH(G9$%3?c$H*~rKI{hM*r)!NRw5wrYe-wzah4%FeQa$xgR?K9EZ1yxRbc*5?azMpwQyZ_2XnV>Ucw>+^F3hxl0*tLv*>e ziaRConqn&E0sZ7X6k&~B3%Ak!c}z8xhZ@Vz-@mvr&+2ExXh+Z)2tqlgj`WGhuT0PX z&dZe4JN77IC-a}vwIHnJ9n?X8H$^&tB{f@}K(wpXw{7~zokHGhw#q3z1G})c$4Uq; z3S4JZEf`BTg{)~YceGz;q1TW(kX3;=r2fmdZ}KhELOX4?Od_NGTDa=?MOcJdJc3@Wb)&+Dv{5;87--<_)PH`% 
z&k+(}@0-fviYE5~xRd+~9q!LcrekU>>dl4+xrjvPD=scmQw7`VL(HRNn zZ;bQRTcPm8@s%WSp|#^)by?o z$Qu<0^dR9*u zv}I@KIWrypc$eocsm1hT)mi~AQCV5!1B>}$NysGLP@{!x)IAt{!G=IxD!J@oT+i(* zKx5|}>YMFIl+yth7UUvz=5ND^B}O6(zZuwO47>V$5a9xhzI4x^H3-l1lQcVg>b(is zpWoOjFk?G{nd0BgFwUx3L6HmX8B_2DaneR-G~3I^*oT>Wuisrm&@XmW4R%;_Fp3FM zij0XEejZr;an|Xy^8e~SgBuoX9{K!uM-(1BsYLoL2!TWsGUL)IYxe_DtPBomdT!ZO zJ3ndLw9dKF(XtU7Zqv783~W01C^An^&EHs@h6x%o?I_LPzE}GT;4?a`D=~n9^*o0&>6H(mNDr#bFXRK1*qd-AE17e4sUI{9usyGtYSa4bWx|^M zpe`BQwi?NI?Em$yTJvT7Z_~E-c2)029P*9tgh%nzeO1hQrs$c4gE>?ahgJr21)n-3 z6I`s8JnFZ;T7OC+^s|Th-NJw@mi^(a`p|M!KkrpTa^dN)(eeq~aKkcPp5FL-9BD;~ zG%7(p?oC;a%MfBp37?A`dhfV{xkPz4fAj2#S;&$U)6<{t`MKO&Qezn!w&o!A$+S?@ z<{Q=#D7oKb|HM^#H(3$qN42Dr`tFlhFT9~doY3uCZrfRT2stkmuDvCZ1r3T zbJ&9j?ta>uRXWTtjC`V5A@9v^yh6%;yt6&I6oBO+ z!M2HrF@G2}XRbvOzU?(!7wtLpAMnoe^@U?%^<@Y@hOO8=cfX5_pR{4C8F0Ed8vftD zv9a7hm@zi?9bh$_GISJ@v_?TO!-6K$Zx{pz;4ijNdn0&_6g~C)!r*%T`xWAKjm_77 z@oqW%EcvU|!i}2p``*O;)V;Is?t!n0eA?*|1xR9Al#)gpJ->6Z^e*&F2g&7(wBd87 zz9fYvrN>B7otJF&gun(Q@k(TVrlKO5pR9v#-~Z?a&K{2J`Dw|E>}iOF>hM;KI)=o_ z;GU4N7}&ektEp#`m3KKfk}BHab!f!fIF+coqnY%mc%UJEw|(W`#dn~C+e&;5zJ~<7z4&Epa*gEFXsaEFE@?LDD8z-=AH2Kv6Z_8iuD zbTQu0%YWf=1)q$%*3zySICTBQh-BxMKw`-22axE*_;P`Nj}%B?6C?BK+sCpoyKz%MVhPx}g3a%rHEkVL0f+Cabnz;kc$ z@JT+ox5jFK4Eb2`6yHq-uQ;6t@V|UQOEGEneJh;F*tbfK1mZ5+962q>_)BCuWt&^HyNg< zSy5uRBVv$yt>EzI*`B`d;1{1LS`i*;e70XKnv(RtF_Gs?HosLZDToN3mzkCsiTnLA z*H!M}#Rpl-gU{U8ee(tD5xZ&8?4cC)LFOZ5REW60P&_afKBjG zIdY8xj8y)G6{nxNPUZVfv7juLAiSIj)wdruf1_^C)|9Y>uCeKQP^;TkJ6u_f8I9uO zB5G}#wBQ%cz61{9QiD0mzNsQ=Py_gk?{fye3La<=XWg&+K&!Kb@k!*@hWeYvl8yezktyaPyd_Vi9KqJblsEa98p|Q$ zwdnPFvDT_tNjvp4?s-{J6nIDcT6$!1@cx@Xqwb@3)aAktTOy%{{la=z6Z6H&|0%Ac zR2xcf)5%lZdNUBB@R3P*&xx1nGKUF3{J@lRyR>o&e%xi<30oMSD<$&By9n1^S7rjB zh@ijXX`MqI@wI302}0IX9Fuy;KW6+CXe=yRH*oP;QfVOAi97JcZ?VZf6NIAEWG9o& z?_NHEi-)YCg{hiQSc!^?)1&=qz+-Dy*HcTT=T7DaOWpdugmHLd8Y-fekf;+k7LM>F z%YDW$a?B?YkN5qA*8u?mCOc*UVu`!p!GH)F^pU-tExVu7e0JhQ)d{x#V?JSGmfbsddw6p+8Ypyp6hl(+E&vMMKBG0|rP#+}J5Z2jflJ_O@9Dhj~=Tq=%s5OEGoONWMzs|5J zYZ_1DQBbfD3IWc-m`a%+s`}bj8TJT4p0(h5fVx?L99c{Y*|@L#uYt@`?snuZS%#T+ z&gZiI1%w0hMwK^lpf%(218G!AX(^~}Cq?&X0fMehzxEZWV+VC0E50$hhNjc4fKd-j zonVj0#5p{mHCCO1#ElH{G4kgILJ9*bnCBz;3!n=6u@04PCA=U{&`>npIQpqK6lug0 zwB7Z5s|&3druer!UzGmwc&q{`hY7)iAk?M=V@L5U|83Qc=dQI1=GMbKsN07012(?3e42X||M0P3K8zsx(YBM8)H?9g<>1CdU z>20u;rH;nGxxo*jAHY9F5&~@^7k>*8#4ct17$2|=kB>rRLy?7*hN=c38kR|R`S;=| zRo96*pD0y{pq_92dNv;UGy*>QJ;T*k2n}5Ym7gd7MOhVqxY2;2wk(D?0a*spfA1^+ zB?Ac5|N8;><_GfsPQL$JVw%qj|N8;|T&EhX{9otm-*?F}i~s*0@c(@eY#{Y}0G>L0 zuuQE4b&N4eRhKWkDS)GUombmQqYNVtOJt=ack>KEWCo_(rc!L4plVI>Y^U(Oxe)VJEe{pZGL2Uuw&lcY;aOG(kuXbTGqzweHZ?S@B3 zlg9z5w_=vwN@;9oq2Ji}Nd9uAJhU?Fm6tb9$&>M33X-0#0EPZ#{VWdS?C!pE&o|_i zl%6i*?d^@woudz&uS2Vo+QJgXNlEDj7O=plz&B&vg*g7Srwv^i3g>)3Tc#Gdyu7^4 zc?lL0(idj3L_*#gGHf6>GdDlTIwITL+!wvoD-m-LzC-YaUbZ$F-&R>QSY&hT-2i{M z>|Zrnh|pVfe)q$VN`e<*^5=Q?P$2AE0a|<#i2|JS*y!k<*V5em3z}=l`nr)1P(hU$bou~I zbtQoF29}hT9(k$|+`iyV)6Dz%W3TDd%2@=Tq}B{p(T~EvHyruWXmcoMhPj+2u%R%W z)N^|8YA(KC`_y$*sq=*(YB|{k>6`;hHnNlq$tpS*P4)i zsKkpQozvYk%y#bt>$&`<=T0E_+ReUVq0xZd6*b|McL^xXOmpw zV}?_$eBn?Zmj(T|H+Sf~3)PCT+Xsi@1`cD}o7+@-o0FEK3nGMoK#SlU8f z7TWU>Pdjxya&}Z_Al=_*!JT{$4%8;#wlC`caGh{}ywaY!`zPI1ag8^6oyk43`*U6Z z=^epSXU_u>X!nENJXI7jM04v`j5VOqV3Mqw6jV@R*zIw>5qUR-#=6|oXRpTK$-=f3 zLW20l5#$|#PE2zVX7sX`8%X%G)z3)V;_lbYelL#D0ktqt82+VAwBd4Q~hI2$i~iZh@f8xQ@kQx+$DBT`pyZwkg-9 zA~m;P4&V*v{1ah;v;|OkpgRpi!5f;!i|RsaJ_eHI@+6YN<6hCAW?8nCU+=K++791h z+oBDP$H!x*Pl){v4JO)~fi$_~Db6}O6?jMts#%%l%-1g`)Xb_zsTPX0hp)4H9BW-L z1fRL?jTHmo7P6F)FT3tQ@+?apOQNUq*O~3X4Y~mbYf6BR$7wlM^z9ouG?tF0_H&gC 
zDkVtp;m%RIN|$Hd2fkl#3v5=7szwLDKju~_O)4c;&QvK{3a-x7K$w9JKJAb zN#tt5$a~J)hNT)p+{o7yAle3bm)2vjFT}38{g-1&>RK|<46WAP@j`?2p)O!)ROWfM zYNK8aL`;PlWOFQh!KEhqLO{tR^R(@VrEM!#U>``WtLL{R7~a6!D=I3+_~QAzxJF%i zyJXh55=rv02IWlxR!w|)T)k6O1JOit@{(BaC1V1qWBE#sU1`Fd=3M?R({C5K?mS&e z^IvSn2-4u~mRk~<999_D`uqBTlTZvHswM*-U$C{+n`3v@YyuPLTS);Ft|Y-Q0>me( z139W4J7b-@JcE3pyf|`plQ8g6x%G*L3jU6Go)pNJZL&v^G;P9up+5YBpK>;PpD>KDWQ zs|q(B>wE?N2oE{xt?eU_bdH7J&`Ci>SH-X+>(K*z;+y~BQ2_nDxp|MgtlT_dV<;Wj zTHz|U{o-f8JF84};Ne!Lr zwvvNl!WM*glC39Yc($8}VxF<}Tbt{{66MD98w)0QyQW6c^6MdB#L0+HIYk(a%D{q*IX7l}VPU^LDkA1+SoiZj zCw7Xt;KhoU!2&!JL-mlr$a#z(Xbso!I&?m`0wBjCIFEfVmP<`(7t#)W94{>?edNBQ%C!xR!{LArq}l2K@0VP-6Etqb5&z#+(X$99E5JxqD~G zWUPBlZ^4U9^c-aVs%F5}Rc+X9kg~kty|J+|Q)ep8FmecUgKs~1$+lRmBK)c1<}5{8 z`zTxcwdyf6su4Dbm*%X^~f%gAPr=P_8Fu$x0m z{C#`Gg^EtshUS${f3Cs-mHZK}>{CnZuuCbbSxvfwHmW7g=|OF4Olo4vA)GzouvS9D>pxKcf0lKx2Yy>MhunE(IF`=DuQ_9h8)$ZMe?P+d-raL zKhSP^v0R-geZA)!zGy2f6OH`+1^TinTG-a3U9c%^fQ7mR;QD_B zY>DaR;A>yVL-l2xrS(Rc&E#a6HiM4VJqMr2-`9VMhLX=wbs%D_bM}A~<-;T-BnH?y zr-Ht&0Gh*~gC+xtNGG-gVLml%PHl{{M?)S)f#a%8v7OzGi%Ty_?NrP7&VzteUS zy*HdPG`qw=YuLca$>6akHEfw5sf7QeiZ9=xR42NPM<^51eqB8Az6yX6DFlR0BztXB zo#=dsi#=KDU2`2;&A(>8#vuawItzpp&WcLkwpxmC@HKhYiUvgiE(0weKk*W;WpT+w zFvf`WS_T~?Jwx8T<%KX3xfmSQ0)0@kDomNFRdWPJ$qXGxA!f(bqm7=UG_p}K7WudV zu~D;p+jLC^x#xYBGnM0_>1Z{Ndzud?1HRBC@_)EWqrO;lXX|V-XiuqRD-AJCW3SPw zv6LUC)D`Zb-iguCoPqq3Wo5+69@;&7D6J(WnY@-K6Z!0|ts9=WJA+O3lZ}4R4-U;` zQ=VGzu-JR>@SSBUg-)k6hgX~(Er#rucsBsjAbGZGY1Ej!XnULMWi-F`SvR!9-E;6X zg5n7y1!uu8FzP2e?|hukor5Hh3G4(67`!RIzsJuI38Xz)_WrV^(pErzN2Q2^BSxlS z8}$u+SGYJfOT_D(cQms(=dq8L%jUCZ2G;?jjINXvp+eR@gZ*M-T#b-^Gq&b-a$H== zq(QywJK-U`z(B{tzZC>OtZ-B&q;od~ zy!nxscG#txS<%J6aqEF%_7CttzU~uUx;7k4%(^$)Z-_t$J-ygVqA>9~a;V`J+oxy61BH44t;SzN^2g%;fni9q3La;aN>D8y=kBVl01;Hk4)d?y<&nn1_&f zoJzpn6qxCH{lxVDsNus%zV*S^Xj)cQ7+1nOi-#^;lWgjY14zW*+?=f$3Y#APN`aO} z7Vi#fzqu*)zCtc*u;|G1YJE6_6&$o;S(~d~88toW1|j0>Qh%X8ytzF=k~CJ+>rmLS zbgE??=ge?=jsqVn<-za=2qtJ$&Wp8s41DqxGgKx#FF+Em_@^ff#XLGdqV%g(C3H}> z(3@))(qE{+_`7la)L=~1Dd5`bC-sm+F&EdyUCx9Wt!V-NF1ds?l(K3L3q=c$XwF2F zM7;lhSUStFsNT10(?fT6hYVdqmxy#J4AKH3Auu4_-5}j90*bV>bVy4}GjtB!_3q#Q zeZF!Wd|;US-q*g?TIbSU{6mO+<+3ATRiA5x&=aMTQf+;|!NtTAGNrAYXOFiIKSsQ0 z)hO&ux@lME1sOFbtmcaQxBE)_E7Pt|35}n8vcw`4j9+>d!NV)y)o~k~>(LmJ&OA11 zzUKcFBhI@Q;O-l+KhLV*pIze)^5gd@aqx4*BZYY7RJA)+l6L!fo}b8JUtcnd+0dWg ztoT-*N7-iPTv^udK&9A9YzE!0MUza?zT#P)_Ue?<*#`VJm)0~0PNIGoa03V3-5pd73v-%xN92~Scy8d$>r%iinVWpB9p~e)RjF*qX9d>Q3?T)$ zLHPv)GB<(lhyN_j#XUE|mSy>wx_|RO4hU6tJb4R*h4*Ng_cNS!qU{3%3Q4|Vh83w; zp%2hmvZW#9pD*F|!Ve&4prdH$6nqWBIs|$u^>?A9wnnv9DuuO}drDFao@+tf7pWjA zkJ$>tBvEbS96sTbR}1;*88j>RVPVaTysN6_SlqX{YcL{}~)9G#P#>@yh?Lz)^Jl<#$}mdz2U?52t{Z ztDpIt^(t!QLuY&G6o^7R@X{|j!90wmGfwnZqJ*VYA9NHc! z1qq8Nj;A8Sg$GYht7l1h$y}SbzAOM&5_CFMR%(9kBYQM-K4eqn^6{z5uj3|tCHr{6 z7q9FOOleR-Ji3LbfnaX7W$h2kKF1N$C2D$O<1sj7DxI<1-B~hz;y{buDp=S??WQlN zO;Pr?1o$IrW2MQEvGwx*dQZj!fkhR~9EP&e4{2LF#B}pOZFT!SsR&{= z|IXMoVmdV`MY{Tnz4S+LT)@6`u}z|C0JRT&{)d1Y6rMXcs*YSbTKB%&mWl8tLS^zp zn#OI4Zx3_%WoIF=GFDbXt{TM@LzR@ApOfd8`#kPjs#1n*>U8j9XFOZorx*H#p;sIH zg%^$!9%#+g=Fv}ME~!*^!^umfRyJ0XU2)tW#|HQ&YW$S$gKANf`fg4)`brr()zXz2 z{8I2~A=+>kFke{!0x@<0;O4a-@0bK5I0?#1Ra9(^J7mAOJIv#&|79q6ZrV&T{XB0? 
z5!DTv%XFi?r22%bnkQoKKCzSk+;g6;IO(p6a%~YWWIM?UJSoz1Lf(!GjZ3k(#;z~; z*yeBpgFkgsWxx;4mL^5L?>one@k`_XTxDLi2?y#WlKf+ub8LUBF~A+8>+5 z#V3aiaEr?nC*qLcA9J#ZL3&kB)7{&QnEf^s_SPWT632U+Q%d{(z7F6DeX*RLqt zLhq%Hq6a$2@=_|!$Kr1#y6Osok2T()(d9PfNjtT9>tLxLs@4SiZ-0AV?*43CNlH&$ z`=8zs@SJ;KKa;Qw+)ETM$PW6rqnLK*W6rwII<3%lU@5{ueE3TtK1BEpw!m44uwCm6 z60FXc71EA1Cj=Z0`4RNqWxbcrm?UtEbz(w7M$NAg1Pr_=Xb-sZ%^}V(S#~?h>62~T zR+BZVS;nKzSbM|CXq)WQ&dUb&4-g=hq|Yi_xK@h>Yj{|ea?US@e8>I*vN?dvbt;D! z$FPVu!=9S+WEfv&Ps3Hpww%%;igAQrGz?P=7Z^+bwjHr_s#&{X^tu5?84fIzcOUHV ztyyhq);@eCiZ@)urPefPzrza?b!0=av0eKqNpBK?j?7^rW+&!=5|?`+8un6zQ8I65 z+iJ6BEmOs?94gp72YaJP)5x0t#alQ1XHd~VUr!9L|Aj{U`GM~zJr&7XX0iLPp<$4n zt(>HjWxwBL=esr2P4*qS8!<(pDE7P3<9by;90`@Q^d)xWlZfbxLQ>hDh=?8z#mXIH z@R^iWnz7IlO zoGQ~~3wnvK+Yjrc9qH zmzXGDWUivBy7RLu);plXi-8ix9;; zg+BOBE3v#@XC^=vz6-|L)Qt88jZ;3yxcTQ$SAO$xpi&N9zOqk}0uLLXMuB>g4}|wW ztO?IOe&Jy~?U3g9Mw5%r)(f(kW(t$52b+g`=$`9HLw2^t5`~vu{sDr80+BzmWy)T? zdy4ZzEB!a{Nm7gCZS1{x#3gN;bYlDPT82@5*Ufmzq+BJTeaW3h_316zv}FoSzwtDx znVA{Y#_$j4aw+#EecHas{?stbbNTNG)PpuZGk79O&n`-zuHkd4N_kE%D>NOvNP_Skt;Qpqc4XU@5# zMr?NZfl0w^iQ{qK453IS3qRSEV7H3!%})6Hh>Dk*LfsmJ>;Bo#^2lg09GerLP$g*6 z)Jj4Lw28y4->0^%j~Sp%>u_2V&;|E>q!TZ29`J1gpT7AoSd9OFo~=sVf=Id2gc7al z&Zzi!UgL#Od1Ogv^IRF3^l$+c7vf|cb(B#o8Sr~^!8;(3H7sB=Mte(7O3JEA#c5fM z%PlUR`QN%LB$*AfR3({x8mM=>1yp(>u*167Kif6nS4sil`c;my!k%7-zhB-+vuaMw zGAp~Day+d#i!tuENbZJLAZJzJR@)bd>Ch9MC#y_53A5ScF_1(xx_K)mz%XPW=8*Jq2**lN& zrx8~*CtfZ77KRvL?^iMxf^L)onrM=kp5><;kUTx{3cin25sk1bzR%`8*s8H{aGQvv z)<%va(;8^b$UCF;f`@8fR>s;wC|R@jt@g`*xJ7aDPtz@IA&SP%5ztFrEI|A;5#qK!_F^7o@aiX!H$G}WUe7BXZBPuc|};Z@q2I=K21NB;;^o z$>}yy5Ut*J>dSo^nxVHf6}?<`r*1O6^gC7B!aqUG*a6SCHe`PHN3>B4X=&D0R(c^A zOuFQR3Ozk>o(f);JSof-SR{9oGX?}H<2Tam?D6f-{`3sblOvhj%>i*+PeTz3ijSN8 zJz=TiIpX$v|7u=}+XOp*@%V$fNwOfazTamp<+Sn+VYU<5H<83<#2P>?ikoOJ_uD(D z-et?(@XDI%8{6&%e6`Q`3DsT$s@n}K`}x$s%Id%5gxB%RX17u*+bCPvLz}vbXaBKF z?D%g|zD$yJ_~1Z%Qh+DR89qMycp!ge{ww;g9Q{+_hEu)iqnU2AAM38-9Y24b({wCe z0D;OL@ajkNXpvUba(mPyF$sWI$e}gKu^n!c-MjH$Bo8SGY8!w}C{Ya7+NIF- z8$%a^zwe`-P~wV1v!bk1tEWT4;z~=o$IaTl<`fqvz+|i6_}59z{uIK8HdLw>-CO zk*)`H$q76v!I#}itvA30!2O8|eM{-UyK5Q@)~?WSXNW4E@gY!Ka5kUUjcknI{E&u_ zcn#V}>XRsi5az`svn-3%OJ3Sf$dSIFw7kxL{dLd2acX+YXs_|D(cUVLk1fu1)PUaP z;CPP)DHl%TQE5hK3Nu`>B#=#0yUE;^yq#~+E%J&^tDcHLN_Ki6`R=T4qKbLNRqU8- zE6I9gU4o!;iXiTg$(G_-+*rW*$RkGVZ#|jO$J;!`l)O}Bw`S9qXm^xqJrGPO500Iw z7h@a;X9`U^T~E`khJNj0>9fiViS(6ci)}g zn-MoL3^zH;rok-Uqgr>5j2&?aRk25Z}Z)gKTwkZ?|#3+wx*ESG|g>|18(gpgISIBTM%4qh4A% zE8RvEJBW2y1p?@=C~J`iuAjmY4k`PfgB zFabY0Py(!PBuLMSufLf64i`R2Q{Fon)2QDZAcz6WCi+?xjvM!*4@XMiyQjDuQa+wN znAF?Mpp4o0%j_^z1h=6m?tM3CFN_qWBQ#N!pjIh+`P#My1iwTuZa(LlNPxODa)*3cI-cy|K58_*sOx zFNnaXiSH@?`#DYHG3#50gw6sH_`1^1UU0@~P6zRLemP*67 z8D+m+5zq7E=;P{RN@miVzaMT{OVQ`L&$t)`qb1^7vfO#{mG5VHw_UdwY_#2>qH(8f ziV^rO_?=0Hs>a4N zZE}?FgU`;EJ&u#y!+jFyHP&j*XdKN>VDF(9MKT*ti;grZ2B!07XXvwmALpQ&cJmG3 zF5~Si`^rMV_hN|X!r=vnEY^XBRk>FJm?^5&htSER9>lYE==O3L*tNSMFv%V_+j;RK zDX#Lb1%Z@{CF0AI9YVRA*qD8P0*tOFcFt!fqozOCd=NN`9{?jrkib5bh?z8UM_IWa zSG}Z$?r2Jw>(~3KAvo!<-=}lBIz#{7X=0RUMBEi#j5PU&4L8x>+Ky2Hchqv^mBwx! 
z)AA-&G7K~m(?U5jKGww-P=j=UG0C-l>GwxKz)r1@aAG2$sNdi%E>VA#$bYRAhz(xIR3 z__}%Wr1Y5*5`VpxOpAVtQ9nA&*O4=ju~@RwB}kx#1#fB=ywn6BJ6w`BV~`(_LLBma z1e#mu7Vy!o##Mjr*%#Ff0z4`wRI}12UZ{mA$)zVJ&NZ0b=5L6b=oeF-fUmL3Pus9i zX8|IUcHick;GSFlF1Git;3b zvN1$^5cs9&_Hk-)RlN{O|mrbAg{#Np(yam=m zkH>0P#E4eC<$(O2finRq4<;(8yXTAmdfNFO=@P4(>0y42KUKIdh+n^IMFr>+`{E$F z5y8gd`2^ffyS-6XPYxVAfFq9M6uc{nc41LX8e9Y#5BYnxk?56Iw~_pOw`h3_RM~ge)S^uoH zf)Z9M^8&PBw+{cPDo)Prml%?a)29R!s$K|V(JM|(EE};R;%PmgbifZq^Tw{d(a>Dm zH0KOA%O$VT5i(D@vd+&9=+D~vb}sVSB6mCwN~`GT&c+v}S&wkn&(7y|w?CN+^4|BL za{|EANRTO^#nP|0S3AV@S_f?J!q@*sMhK8oJT6NW-28fb{TE@ejG;uZFFhIRZ*jiu z=fbexK0yJALLuwuYQY>=Fglzn#S06)8T{QK_XOV}&x>OLioiv!J8y>qQ>?6Zw2kdB za@SZy4lZywz9ePx?40`8M$>aTnU@O|`RO%} z70;qlzRrQE&qH>}r$?_XTDinD(oR(Mnc>-P+7Pwf$y9Q6|}rQ7+^Sbf#yBGAhfIpBFi@WPQRTg7g6PcW!< z08Apx0w(KmIt9|GKIHxS(<1eTBN1$?o|RGVMhnJ9z|j`>$+=IgK^utz#n)l6%kgXV zpK%)LgUx|c`u+%oAF1oEq+W%E{T0+Pj16hliR}xvXbxLN&2 zA+{PMn_jrEC$DchEyTH=kq%5->ukw69cy3F1w}GrE93Zs!|DCvJy!62P3zm=`gD*E zab8-HLqWIDgH5VWX+5tb7iv7&8^ypz-1l{ae8W8>$zcE9ud2+9m1&`or}|eusQ}v0p^!jqaN_85-6E z-4VLDe1=GQ$T&D70biH6vAX%mh-S&zz6qi!+iC?Tp-4JfP?5QR9fHGl5=0icqV8Lh zQXty?aylugt_wxG#wcp^hw(F=3NYO%doFHQ1|CW@%?Ha4;EEmFmWo{9gTMsL=TyZ};{p2Zz64 zEf5`_U3fNxs#Cm;t#^}1j`5^dH%qtk5=#AnTRl*P+V^7ZNT5=_$vG0bbV)(|ham+k z&D4Smy#Zda8$>r*JwiY-f6fz7D0PY#!+5;rtaG&%#z4ve#jvpV0?DnwM=N5L1i6nw~DwyF8mc|in(AP)Shf~lh1fH((#qhLdiQB`?*Dn`2~q98#3WIR zi;n3D!(QPma(Lkb)npvw(B=bzaDV3GI96}QToIu?6a5K&U2d4O(uB4ABrnM>2&%@r z^#_w!?dUhRdwW2=Dy}ev&o$1*aQlHON7PKe_2$6=e7&VEY39FqB9I~=a?`)~{5~tD zp5>wc^owtD-3$ng8`FwS9FrHPj6mC^^H$LPs^xra;$p$R@vqPEvIc{%vljgHbl`eo zO3RE}CYZ5|NpkJazqCM`V0-h4kle=Tg9zsZ3EW`>VK?VGMu$Rp_3QPjf!Gk96B^pr z>9bYjeesKKylm{>QZv|%Z7_`9-E5zkko5lWg$V|VEj^OD_ZjvUS%lk41 zB@S!QHKu<{HmfGB#qIwkUaIDSE0*`c~HSrvw?GH4C9X;W;~}Xt+u6#_Ia;|}Ma%#(96fSXm-#ZsoNk!6CMZHozIsb^}qqy@iI{1&!g{O!`W z){W{P`Q&lSXh*qeb2AI?G|A1eGxoQ@sP)ridoEX%(P8ak8_^8dsCu-lF-Og{M-EPP-F6!1;X|GDyX_E)v!3tysX}snK zK^&1wZI8Iz8M^!AyjoF`?9nOf`Q|j!MmKlP=T{<|viL~Dwbi&^zuu%#gr=mXIdJ+6 z(GBk-TPJ-n$24Z+=7G}g-@ATnB`hopzQ>^taYe$Hn6 zsp%M=it}YFiYpx5=G0vBsyBFCgoHErD^EHRXJa+1L)V;>-cn)D=B~q6*?O4JzQ>f` z=PBimQ-Rz{KObyBqTm1okhhy~iq~G^O1abqqz4Z&G#2_|uTdY)C{8KQF3;^mWJZv-k^Zr?&yz*p)o5(0E~`^JJ6YXy=5@r zZnQxf2W*CZ<-99DEax(p3n@}gl~(HN3}}cyCz?jTxh=CCKi#nlves>ezSBMNfyogC zTN7`EqxREW>9MtO@nVhHlDizSRsK1ng;M0k+{H4E%UADUZ=$2|Lezh95*}7Ie;|~; zoSvDkFbuWg-WTZ(?Tiq6NE4JWWltWJ0`bsG+&HvJ1g}+*0FV~3{?7vz=bXBb`wWR!9`xh=X8L>nIuzl!aBOL!^}4e4i`x`Be}&L7iTKHyOh@=6-2|wsrLiKe!mf z!-o8xP*)YaJT3JU+>7!ferejSWsT{$?aiM)mK)T))(OnqfeVYXGu_6r#2mnRX2e1ed9Ex-07sXSrI8 zciIaX`p&NtlJy3MT@yP_-lE$}Ca>2b!3J>< z4*;taMahKTw}ZS^WL<}@Jn56&GsyiCK(467PTN|Gs;eNZktFS4LyQfYHp<& z0zrKI5zWVYzvjM80_ua6z4f8DHT}PJy4E*v4coV2g-6d#ruxIo-Dx3?_ov)^Gh6e+iN4%ysJ zkK5jH6HMAP;+FC`GAIRfTnf|8|1zndC_I6=dlZ`*EOYsRQa)^<-^SKZ$ebYM;cSI9 zQ+40st|yele6%bB-{42=mLP}yC<$>7wfG@H4Y!%7u^iZxtQ-g@6&Y=Ib2OE)Hdp=$ zgr!~F$kA0)cq*6~+mh^X{rPS)OF6Ue6*gJU7DopYbDF@>2RP2un%|^za&N}g75nk6 zs`sOCPskahfK9ZR5Bl%NYr(9JOC&l-B4>L5c0w$8=noJkeeveO0L|f-<5)Jlbzn(H z3wRGJr+zc9_lp0EVj>AKvF>33eizE(PfLYpd;URHRn-?)-X!4j7x+s!aC~K3`1I#5 z46jjF`qz)}*$-01RnPL@BnPb~j7)DTm{B>Su9N5{$fjzy=Q#btn4e$PtL!#%g*f!P zBqfzrrlMM7DF5&z4@lYZ28Ab@L0NNo27UlkIWvuq%8{s~?EmCJj~ENr-C;Dg6Ga+Q z0`6-vT`Ht()i(kJm@+{@E1`5{-v#l?Pyvm6X2`x1=o<^*_cK78mi76i?AIcLgPcWt?df*&*X65pj3hf zZwn(2uu)9RiXtG-Wwd*lqa2yan4MuSztIl)Mhfk~+VQ)+$4jBBPzVW0%`L>~WUGAZVCJ!EepeJ(8FlO}DHu)~sXyymMKQn_m>poZ2ECvX(7=qIjaBgk7RJ|Wr5lMO$ zQ>%-LcoQU%gP~7}#1^*mw6ut)`WT!n8W^Sx80z*6;16Dk7ew!>-8!I05J{Ju8``wy zOwi=|*8a}{<$GcOMH6%%P0x}98kn;z-+soZ zsgY$~Yv&UjdM+<7A2vkqE9<-_mv|sR)Z?^`KtuZ9$N^}{)!k|Y>ikXm9(m08)t|G3 
zZHY|+9xp#nX^6eOX}Vgl-vg>16Wf07T6J_&o>wT)7F2L6lUCDm3y8jJ;7@x?Z_L^i zmo;2z(3eU;TR!_~A}H*8u$a}n7paG{Q);TsVSHRIDiQq-Vn^%poyRx5b@kf^q}xy# z3^&4PvS}Jll!$eEaYTKnI-T=jg*z+0WAoqcWadfu|Z@0<-=#wdOw6ZMAy}&B8GyQ zL(Q^sYfh8szqXnNHbR>>4Fuz%F6f}4kBXl0r9bL8%^kW+dmh`c!EV@NhCyc%!1h%p z45`690rx~C&3zup7VGkoH2`rO^Wz?ZgC1|z)b!mJ;s77gw^5cM_6s zs-AUqCA&W@w>%k$^l#3`zlx=lXYjva@&k;UMb5pf^%7(*(Idh3#Y{UIMhXt&a4TGMUw7TMxy ztt%w#k%x3;G)sj0=GS58wdT8SEQwnCn28VKhL@5TjH@SiwLW9H;zA6kk`2 z>gGVG|DG;vW*i<$8m*$9_|Sh%A9oOq9{&@{yflW_V@+kCvWN6aBDU1h3kqXY-yB|> z)giX=9WcT7t^C?#!1htAn$iAmK@gGbSm|${>sjp5{r{#s5|IyIBT9x5%dOUgooJPB zf9L~lObRcOu&CTE9TOC4c2kXP(UOT`&`4kT177DVK%QH#)&1zDc*rp%p_+*OVM=a3SX(1pLS$2_LG0QwyvrQ|!( z_&pU!QWQnsWhV2aiD$g+T41Y(aoa6ee!;YHZZqCAFU0h6!dqEzx^)VN?jjlR5$wAj z`E9NvxYU$RBvYiC7?gJe=5(zVdC#N{ed}ztP;_oORdD(!qz*<-LhnNTo#yc8yLLMf9(ed>?onT6G^HSI0AvGTtw@!Ow9-m z4oSJekXj$Sbi4g<7_AC=VFMb-*$ zgWBSvq7rM+^Al}$dnN)>KuZxp@ZaiY#0NMCJ z?^?|XZgVy+x)5Ny=4H57{@HVYU8h=sX{o@{34z+8>-fQYLMEKRQhcWwB#AO>KOfYh z)szY-vDo=b0W^`J+M%}RY z8YY4p0}Q4?Ng)ZxqhHdw!|_$7uQ2O~C@M~%$YvgD#LDD-9~p65|cqvN%T`Xq~#U??$pFviTy@#j0&)H~BWf8=^1^?}AoROti@{35?oo&fOm0i59pg8ds_9HWIQo=$O^q zf8o?O_nYGS&g*9XCZV?JSt!k;urfEP#CEhB)mHo9)!E@Xf48`b!08}uJJAQDc;%+5 z^iX8B#L&TKMQ`V$jIq&jn<-4M_^8GoF;Gl0hWqcRm)Lxi0xPOsu#Rer9*oRIW`h44 z2snNI-tw8YHJ#vr+1&s^nsM3pe6;A0pMW#)yX#N!b@!RK27}r1>ZAgO-L-*1sG!N> z1eq6OyGW#*9nVj#qss~>en6)3{b=a8=WuvqObCb|?LP4eaE=Xry#O#WsaUk6@>|Go zW?v{d+#k2zGx-_-q)q`YS*>(q7pyocp;z(1EUXIz|E%ToRYBcD?K3fke+1CL%Dh!E z4E|xf0ed+6ZZjs+)`xi0ZcA~~pB_f19siVUZhC*a6{T%M7}aIC^JEWn6KWw`qOz{2 zvkUx!MFF$BOqP%o6B<29di?FUTt2H!^#TUy%A9IyVl{*JP!$(Mg6PPf#SN1G$w zep_1B6M6a>pz!Ku%Ug2s*9r{G>EZi2PS{ZH45WV^BymbI(EgZ;RopjGXQNfKFW1^@ zMRJ6tAiE-=uWS~4g;g>(lqz{6bS7k$X04v}S;XyVK|O^NicLnnuSH=p9lm%pDs1(d zqj`W3lL+JV*dNR)|C$H$h87v8$nQS>KWTqUXnnCzktF{3?SJl7AaTs7rDXsx?Spa- zRy+KNn7+2i$!sWfe}DVqD_&6rjhG|Y5_3|Uel7dMPHlkgbBndrac?*7>dID1b*yj3~b@e;!TH zF0S+|ht2uB2=C42OQ9;foN%cOVg|KGzmmD3U4EGOOf9x2Z8z`18Yb+>W$WSMy0fZS zE`OgVd^LDocG(#to-cmfw7Ssxfrzc^+CZFano}&c4#*58o`vd~1MQUS&XoDX*hBPb z7(gm7wN_a+uqCL2PSU;NMAx>yBfU%FV9^jV^TR;RJCP!zLyoYjvN=U0QsQiDF4f3(d;-m7Qz~eE z{Z8g+uO{YR`~I$QbJ#0_IYN+gdLX7ntWsIaWHKE=x1f<`Ghdt2m5B$6*JU?_oKOTH zN)Z5OZ~^A28IMnFno-4Jvhtg3S)xG5y4cj&G!5;{btTP@lr9}iF3L+gmK)L<$(AHL zK(w{CwXObWB>FXq#Yd-rVjNj@kAUb>P{vdorY>EaLCLN5p)G7+_yS!QTEA1Cjw_A0!2i(i??j+fM=%aR?SZQ&3y3q z3^oQ*2BqcXR3R$Cd#QH{dqLxh;kaB<;ee%l?x)$ff=w4OIc_ZB=1FU_5y(G6p|7o< z7+0a%d2g;NUy5LLXFQ)T9tc?W5_V?k0u51{k~t`rzXewGJBdM(s=!7t=2aXu+i7<} zcoSpnU^2&4ES1ov(BylpNuY~^5CUjdT*>?yiPwl_qtHoUI*t*<2uQ4yeSXmE7S;Gi z0fsWFk$^Ys@baQ=ECc&qlB=?Qof;TW&cjjzop-vz9+TMAu#ewHgW_K&$M+=#{N}`V zntj7=d(ud<(8MKza3>Nu5c!gTJl&=hfbFyrBs!kmg<|wRQEWk1rdj;6!zzBX!(wt< zNVD%*WH8N3odbExGNohqwBv5c^ii!*nTiK*zJwIl*814spQ4Uc|LChq)81|lwH(ne zS8MD2*9jb2&vO3cIwP}q^*&nxujcg&e2K0rrmZCEt$Akb=fivzLs;8o3Ja(0I$zt( zIK^W69YtsrRBkvGjZv7%cj?4I;;R+jDS$*PAiqEPB7X$^yh|tw{1YxXAzSMsc?1GF{RJ9N4eAgMz+!M z7yJg-A%O_!OPgKi^gp7+c$1L026Ob{aY;$Zt&>nusiT723uOEj{w(@nMi(nzlQI#E zKfxmcn)4;(f4&n*IrbcuwHbS-lAdYv+Ub0oXeqrusYp{}l#os1-8=Syx{vKBOCYt- zyURG?E-`{o%}^&p{73)}Q$y^&+!DlWyq+f?BH_X?nH7$Az&tC%^KLpFBA19hjOIXq z7lYAz{!gH1@-G@Wj`ou* zA`b&{(l=*Z_#(j)25V&2-VY^Zw^k5#U)MvS}a| zyXgC3?tW&IhMdW^XL1y;?wgLI{=B=*%e&-wdS(s-0=vKMI21^UF|8nx{e_r?1j_MG zaRg6`7Ml=YG6(fJtTIYk$*Yjgf5RE5A&`yO-OZ!>FM&v#2j66v^*m~(Yxedg0wZQ0 zFG)%``i76Im!cdNfByNsNHLyt!#JjWg(l|kiswFo)qsx=+kO`CF#QC@7pFbG(p~J@ zC_B$MC|fKMb6%Yb-cUd@P!w2fm^7=B)yypCe6HOQms<|MJV-S#<=SBCA7%)6nD{p1 zpu%|6wCY6MpPbYbHO{=Xzn;RG?ja0x;qe0H8=sDJ$s~l}UwjEXGF5dtq zll4=d5Mw99j?>qi$~B}vlw$sLU%jm|WxNKxx>pErp11a6qaCr=EtciIRUKjvp(X`C~M=(hu`Hepnk?h!B?tG>-=u| 
z7tuH>aKF{$8vQYY-aj;1C5K{d5rHGN9aq#;_6(C&*@?9fKh~!N0Ym~s+;yer)HQ*f zXgY2KGbTHcE^Fe%U4t`ns`~z;)RAoBOSpfY3txkp*{{akTFDXac#hKK{0Qz}Wi+g_ zl3ldpF!cZ#EKe$YdcALEUGZ`0A!7?)a)j zlk75>G;`~nerY6pMr+L(nD29+7k=4q(Xz+>ho|krg~KzqUtTOab)?!y$MCUdHGKdY ze~Cd-44{>UE@L`uwnNcNfDAbSfR6a)SWKE4Dni)SL$FTk#$~v%nK^$1-hvMlxoA#S zgG%%R+D|D*pZ672*Rh0#lpp4uC!q9@f#>x5ilDO%pwLKotK7);zUbDsAG2q$FG_)0 zgj6Z1HKFF7{DKt*D&Cu;I4;(34G_xtMGkUY^WOObhcvr*`ah5sJg1IdB_~_EaZY#dEcrYKVf1{^(F+PaJts|D`(INF8JE}`Ej+`DMvnA^k_$T8s3iNFV3>C<_s#3LjQ>By!)_VueY0S zmUi;HWQV)EyA7MqjnIbnkAXv%>I$V40ieE1!at&c!8giz)?D( z>}oL4k$tV|3fpR7np*%2&yttct@sl(0eq=8k`e_xL^GGyZCzP$$wdZZ38QuBh?K0x)u8UsBg|26XY>Qkn@Y$88^KLgfqh0>8e4PteEM&|Yt29y8^ z&&6;b#+3EZbKotC+dhYg0}6o~Mjm|A$le7~m`dyKh_{Ub+UUZRNYG4%o0w?Z;EE5a zwIO1=&b|9|w3^>xh08WhvBu9uN*kSAxPLF z3fiOT*Bgx;CDCRnoeM!S1GC}yHBsz^Z!tW4%1onko_6>&YCw&NGn^J2GJ(*FvL0-4 z3$Ln)4i>wCE(D2P`Q0~ciBdBLGvuYFr0|jpp6t|k9205&$OWedci<};!c} z_o$dr)tG@H246BWNBy=nZ;gzl7JtR83*QdykZ>gRfiUx&aC#|JGl7(g%j%4LMPh7; z2?v+N9VLM3d;!d(+&Jah-|IJ@`c)8yt?m@pg|KbVKPc>%7P1*&-{`@X2w!#a#6ja(=&@0iz_AZXEl@Pma zhE0!p7AOMXL?|s#rMaSuI1J4-bXBEL`UK)T8nXd)eo*~r(SyxxroDf4?`5n@O8y*w zT$I-t?KPBo9Ub>ABzx|U4M|F7c6x#_3GIwAIA{b?hG#gN9~GA0c?iX( zT{*1^0>j2%)LSOk&N|hzEX@@vTEeX9Z;#ofwr3Se=)U$*2{cjcm<#8G1c zF9tJ%SW9eNyc4233Vsg|f8`5vsqp(sw5mr-o7Sl`z?Papu|quqhX*6KS0klO$efP| zxwJR={x6`1C!#rTiGq&K&fdUoMmXk>8jvXi(O*YVnO$+vz$9k z8~MP8kiMH|{43TR%g`tFp|rL zGsFiAyS%H$AwYX0RTsGxNVfhjZ3@JC#8ywY>C~=7Gw{p$*Z*VhESutpx^|7bySqCi z_@KdElHeXJxVyV+kl+r%-Q8V-ySuxd&hxzI{D@O;6(6Qxribe3-m~}G_qr~uE&&q{ zQH^ZoO>H+U0U9MHVKPXx5i-kfICN%YbfO}LaG?^q6yy6C)J6x)l+3o_mi~>e^KanW zA+8Xs7V0Nhda%s1fmOC%(qt&j&87njhmBD;Me5hSwhn{tHsbz~tlx8X7W zSLk#h8x4hTu3t8>dc!BCLI2qwV5NMc)LBye>0`2)wt`>SEmHy7U;P?Q9E0 z5(W$kN^=r6&I4__IZ|^mA*zpV3kM>J4Vw};N2K*|^fK@G3 z<3eskr4!ux3m(Oo!yCv;;F-P7ipL~)Pw8np@{#P#Wz`N=$Ni|l9%XdlA1%U7!u;wF zO4S$a0e1Cabq2O~G-UyYSAZqJR6XzY7xy<)Puxs-Gre@L;J9 zsg3e%Yrx@tm*xqvxJDS@Q$H1!R>e^Iywx09`sAp*}wgu=YKF+V{MZ()&E2g33h4+3FO-3f}-w}SGyuMY*C|UXsqFS+i*h4+1SlA zo=^Q!x#x*`v4{e2VAMt-C&hq~JTt;W262?bs=uJ~GvJV%wIV6lOI887)>_k<6HW4n z!7<9ZU2z6?4J_C{-5@n=>WifM1OqX7dVUGlpewrC_~KJCIS^#c8ou5B^Vg)h2r|tS zHR?fhgH#ztqNE-x^PzUncH@^nW*H&A(EW(f*9cnuevhd#n4{jl9|>F*_-fh~f}|nk zAg;i^_{!@1Hgk@hh)y)?@kfex6TZu#nm+J@QTq&Ow;cpDnr8=f8$)>9GI8?QE*Y$( z-`FLHaI?i9#V;mrti>EI4w_r0w4l)s$*A3X0uQA264-(hZl0VcPPq6t06*N9)yGw4YE`_05GE zLC;^k23x!*>ik763r$Z*XpoA4kEx0ukjJ!V%oq)}_4{Fm;L})5B5_ZzKtoSm#|B47 z(or$Wl)_a!mLRRNj{z@iKOdc}?muCzY4Y`??JrR_TMd$5Tb98E>65lgd9UR$zsKCL zHh!;nM4s|@KKtihG4p6Z`uXbvyFZ^Ta6I9cM&swvDJ7)ZkjY#8$)=MAp+mksPpSxZ z)j)^yS_b=G_o!I#d==6{fUljHcT{LBubmx;^{G-A7(fLZA+dz>yH_r8wv864DUS)a zZ|JGs*ADbx=>e$;PX`dMOGFb%U&e$K#Hki3lvfEc{fl~B7uQGvnE?Rh+I0m?W+Hm6 zAFcdJYqPVn37B_vL$g8pe<3^Fr>Cduj7M>n9?1aMWf^qh2zU@`l8wO*OdaH-1QIGL zt_7`l37uKv;u*GVqf+Ij7NV6}vJHSq`%20=Z>lJt&66b%f<95_UG_lM8X@0NB}KKTP01tkm|2h4?PyToK67l~%!2h0u z|Gf_W_g?t#sPO;gy8krbT~F0sA^9dxyEm9o{9pZI%qvb6$Odk>~2bl45G)` zd9|K`ZX7Wu`Y=8-e8qG{)9hEG(Ec@PX%0DE>1k<_OH0ky%}nYWHWAms8pQg#+uPfs z(b3WINdSJDS4L(b$hSR!lDDM^+mm0MH}%MoL#T7}r5W1oVt2pWC&EW0AuCND0Bb^T z{2T94Ii;2y!ueOU-L!N=V4>k2Ia6OjnYOlAXT%wgX$YPN@vEnD$8P<^pEr{vchBTz zrh9MS`bz0I&+-0Nt0899fI;Ig<1T+Eu9u>xEpkkfF{swwl?Sz4HohGzdRUjwtxLzX&yjii4J?up_jm!j?-JtW14}9DhG_JL?2KHK#c^`n|bf&oONJ8XcEommRU| zTg3bX&k7oUznvetB*4C<^qci>DG}tf>&-Z|%TjxV&6cghQ_%|Db5GanuCs5K!4g4# zYrw!sap-}vey*h<@T@s%j;_O;=q-h{~vA)BA<-2(j|6e~`79o&Lq z)`IUd7Ys7S-H)cG7sB!$D20LxtCADvnH&lXxHtf63}72e>*`wXj(btPCz(bLq(r|! 
zLbOE?t*?_1;q=^);U@onvq#=JE>_Mf1{?xJdaQs|kQng&M4a9gB+@x)w|hQbQsBO8 z!H!fi(8)x46ldiZS4mwHZ7IOaS5*{dlr@4di8MkFFdz<;FEaul#pyn*bJP+F9%U|9+#ms_>E!lT6P_>~1w z={MIId#JVRhnlrfmTK+#mq=1zGUVNqk>-S!HHkYl2s3XZr}lbb(ei#qKX*2BVtieS zAXHEz9H!o>tI+=S8&?yxl;1Y zdfv}ca&o__6hbsU4Udlg<6+N)qEqgAXloP6X9?<*d9mD5Yr~}Urrz+d3^Fjt^1r)q z2qoWFYDx1amuKsYCgYGC!H0!@7fGMUV%GsCtDvu~HMCi-y`w;0uIBHGCP0l9O+Q_+u|IJn4pFlQ?xfqX*TMVpd)qTorPaaMc| zr2Mf&IvI4Vgh`=?jbFi(86Xj@nW#CB1bqoyj^yqEnm{t74+9^KZm2#zE|%Bjx1B>X z77Ye1ovxW}066!&@!?$IGS~8Y1t{ z*Gj(&{6$KNGHAa0*V`*3*B0;Xxl8c8wN-j_Kx>`9^Wv zo+`%>AALC0Vf^leSZVdvp@5mGus2I1Yg38}{%KF(iz?lpf|$SFvb+{}kes@UQXjca zC;rd~QHO^{jvHR2Q_PKRBHXK&d|vNQLO3#6HQ5@c;CcVg9P|&428GTrH+XDnwd_2=x-{<%_VPR} z=nYX!)SC7?DF+U^yWtBCIRrq;x7&bSf@L7}0ezO(Txc*yYgy_2tOP7C(2B3uQI;K< zsH;KQc8yT2Fnb4B?E@eg9ktN_mL%qsWe-^g7!aWrV^97p$i9|?t}?}T>!R=Zi59&O zx?#qMWd4XRS-qn^_U`OefSa+FEn^Y6JAJDA0nNFIVGQzJd%lSR3DnC+>JS_AisYuH zVm-#DI0!FEB9vFicU)1D?VYKN%8NQq|0#)B^TXOL(P}2<8QP+a$y_MOIGE$}vHp|p zjV!_M5lkt=$n!gnU|k& zv%Q{XomZL@S?i$_v}Eedx98ODWAGxqXaAX#1Pv}8rWaIt50qA^rqvr?w+#v{okts>H^GT{tSR})?Jbq|?kk?8|GBekQmuG8A`8(i~#-V+e} zN>zDVY;}EA1>Rke)S74tUMFHDy?bK04{xJ>5v}D`#;;#rQd7la(utxy;u=)=Xqy%5 zu}M01KxO$M*GASL5BUFD1yniNGDa}4o#@ANm3$c66(CXp*BmS zLrA!=Jz_jg$9N&;Pv2(QZWcID?Z~pCu37cc#J%9*;IaYtlguJd606>Xfr*TFBKD{^ zT9%!9G-J{Y`7Lsa4R(XZIb!~J5`5i@fAb~&o3lvgw_xv9I%p$5anJF~@LOJN;p?Cm ztdnZ3XR9#_r|4pUX-x60<&KZfiU;8izJxIgoW>WK|E_R-VYL^MQd!dzH2!|cDR)tB4#0sx9`Svgp%RlIr!RGme% zv{RYi5Dzy8esS2QY*BcJn(F_c#BT$qX#s!ry3|oj2*C8z`!U;@)iGwbOs+Md>2Hf+ zm`DFUB^Z6*yinry2pyjgktX*XA5Bb=wb?Rfu={Hl6KeoMgyl@5J=U2ilRh9{O`oow z)EOU|(~RL^fOK8)WYFj*vOe3fi7N%g z@S@Ymuc@1E&`_!gnE}NBj&JvHmtw)R5a24`{o^Y46D?pVxWEgd^MK&47s=u7+R+D> zLUzuT&{%JUXi?4!))?P+8d}tz!NMfbk@k+JS~GKW)i%P(=-qn>p3l1(=_76!ibXKb z)q<6U+v%)@?(LbE-~U;^SlnakO0MOe86iiAy}ri@CXMDOA4C)%{Bp@np`)btdv`O% zFfOD5>O#DkmqxMn+h$>BxMw(%*U06PfOf19P>lU(9%06MV9!2v zVJ+o+DgpNbJgeC~-i%qZahYh6u-uioi)&hRQM2+)#Yyv~vix*-s<}Kiy}#}FZ6)}H zy$|f3NV5eP_h+lyS9pX*mQjvv`8S(*`6!L(n75!F7LHWB@n(PD;z`|UIwxkpPbW<) zpjxUcc&M>%yQdg>jOwEd4*$AY;1(VM5AU37%8R$g`VOSNu8#IFwPQ}3B4Z1wNMN0r zQsoElEe}>ASiZpTb!-sxH|J2A*m|vyC#$p??CidT9kzdt85E0zop%cb@z;1T8s}3b z{_Z@0h>SwpeS2#!7LS1Yxe&b)79(l!%mD<7Pv#-(tk~E%;8o1De+S&v#n}XU6*iH&ucBz0mr9W9)1tdN+R?Gf#{o z8uaodtUu+BCvZNb_CnmRPTPN#U|(6ddDzYhtA=+2=G=lYL0ILdyLy0ZXH9ye z`J^}?Shsf7XSBurLoGdB{i^^PqHLFAwiRjZV*W(PCP+g;;WzL)daxOzgb@20yCxrrodut~VdueZ=|H|MHM#p;MLrnpEA$(^^* zfP*-2Crjc+(B~IlOmh+~d{op1Z_5*Jk=a!Y>FhGT&`eX2kbX{4yBKjNIWG{V`L{ie zxtGWcjS$F+)bwWU8T#9ibO<64Q_+$q)W4Z+|9xmQv580X?S;F}a-=9egc{z6julS# z%{pZWqL(N0v`Na9Lzi-bX#MK##TCq3J1`RPl2T798&9SRKiWIrsnEgIOBku<11n)!^0u5bP^wHrN z5KX!XZvzkP`!6i51yuOB5Cn0W_dguWlClbkl@|FZ0y!}>YCYk%=W8Mdk`O-MTVP-g z^Y+>z$y7S8)<8w$qWD|4Kn5)7bb@EPmO9MTi4IhWzJGVeIoqg;Op<>O1SY{hvDZ^a zG9gdYW))*}B{L-SDBlA45x)jK>4TDjj>XI)EKEEnw{nQnQ!h*j>GCCoo?)Cib`eSi z8*cf0Rfp1q-no^pH?SbeMD#YXu}1DTQGR}MWok6=$@INB`+i48Ngu>DmM(+X>60~s zxnqeyu2fM^=uWS5-M6BBKy-N75Ow>veMcM(-*GTuHEZGECDMXg@5#nl0;N3TopSu; z6#GkwzR4@TZF9A4NY(S&=eQoNI-{-8=+m+h(!aY@4_8W5R=;l;$3LE#-BfkcMIoHJ zm&ODBWx-pt?R5fD6?~jcxXv`6S;t?tIdL+b!`M`aic48C*B#figjCgQ=U*ce z{u73o?etLsu#e_l%rW-5pJS^>BU}5@POOm*mTXwpEozB{rkEh*v1Vq`aG!|SDJJBJA3{u-BA9TO!D1`LZ(0d(+SQt z67^zYk#!EBInKCYpC*8NQW}XQSqm75Jd)1Id8gRzh3LmZGnj_hV6?NNuhZ{_d44*0 zV!EXb2eUy8gb(Tl!l?x~%*PS-U<(9W8QWK*Rb+k~`!5#<|5_M#xn=#ZL}?RID?7bQ z=_0Cb=92X$@<&-OB#{XMM=rp^;nMY*Y%v91PV0gE3{@6qq72V`a?VrQFsdQGq0BdpZGk#pnVB(vGI@BSl%`#(UKLX&C+Y691E) zUdY1)#dBI4o3(e)$IPX{&wus#>hdqziS3Fz-QK!|2dU`sAu?~i7wV&OO zp$tVCi+}9RYm^FED;VU2`9KG=Luwu#< zvdxkA2m5=n?UfyA%Q0=~AWyGI?@~bK1N-Le>`r5)b&2}+@HZ{d)zv5L4d+?ZQX{%L 
zd$j3fcy_W;>LdIzK4#EC?@cK4!*sey`r_uoHbuLp2G-c`N#bkKI%t1g{%*zj;D9GR zlayoT!%hfDj4Eh8p0(B)zh}j77!NxHR{kr*vtY@P^iObsQg|-AE_O!}z#|~T(J>uc zT*EB9qTI46S@2dY112&nyR*T9(AH5N+RwoWyf;4W68kr%f$U^PjYX&uU^%}~4BVX5 z|CFrI734VL^0PI3^K5ghhuSEwWJmSLeEmhVR2!0NB5A6Y*{7JlSuBwlmp%@8Dm^kP z@%gm)6XM}c=94eQN`t8gRThV;elRNaDm_l1R@J7xfM}1D8I&eiU~4Fs&f!RMXCW74 zh=VupFcg*0MdGfJ2}2SBexutn>{QnZ%}jAvrJeR__-+w3-jFfj3++ap^-kqBhfCaW zWZ>}Cgxn{pzp)a?=Bo(_dh**~U@h6wVj^EJhVDC_n!hEc;^?c`Hbgrh_#d=f9G*%k zSbx8Zh1SNLLm%A;C(#c1k?UZhE#Bqykjoy^DgI{DoCWfFiyCbj<50D48mke@fg?)K zC^?Ib)7IlgiSIoNgBZ2@-H!klT2|r9BTw3|89t;jRE36{N__LO?F+^35OyRL7tN zOB=!0MH+EpWROl_;}B6&i6$eO){w)f_m(v4g9vc3BjO`UL+Z3@NjC8-bam%${-hnv zmrH@y~^AX#4>jLebEbPX776wF{cG^&}=m+qU~vUJ=;X?@w_Rsyif zpO(K>+l>QxQPx6uR&NR~U1G8Do!^6R85-}rtEk&`0;7b2-Qb4c2dBpqP9x4Ku3Bfu zNAh{39tkk(&)NB;F_Niyq=~nz}&G|(TtfoC*5 zh}?dlL123VxqEPej%zoZgz-!cn!zkhyx$1O4Lj2LCZ6>cJ87i`gSM%3wc2&Gwb zzj27X0)I;|{s?)e$+uG_c$nj#I|=816cp6xe~SRR2IL3VS5{ttwt;V=8GI;MK6FX< zl{)kQ)~;M*B0w~pDW*q1IU`QN!c?wsZ4`If57}b(_w1nNa-(%gO3sRVT0}Q2&Hh=4 z?g*wVw_7Pw`4n`eg?fM<%4vCa-hdGyVI=J8`qyi>m&c z1V&^+8m$%3vDfCuPRoGyQ{dJAM?BhcaTr|nqxs~YegIGTV1A`YI^6*6W7k0xTOd%O zyMEokTW9Vt?4vH7t!1!$JCWIgwp4h;1$@7T7w`ac?QG3U3)=30oGoTz(iS_ zCMYFwV2~)ATF853?@q8WBkORg0eN*oJq4+fcEJ_qQAH^Lr_N%jGcn42zcF29XmMAUCDYY7^-u1Th%NFs9*-B4p z?jg`{+AT_zD=%5HamY%s;<)`zo&5d%8nQ9S#eA<)&%v8@Ymiq-Sd20{h?tjeDWb=X zKO}V$yS)uUSlm|$*Qd&iH0@WC0{9)l$W3*k!<~(gJ)theo~}CVyq%W_>F66yl8Q#> zbnsuaB&&a57_9ZSg0^qsx)(b2iAis-^vqiw`n^I4N1I9YD$&5o>TFlgCW_s8Ruh+%F5T zB;-?F-mIBz#q?>^>mGfw&KYrdcuq{j z%72aUM|P=XxGrfBT-(AS(g3{lpXq9$S`FWeT+hG5W_U7dYWrfNsNUy8j$LcPA;X!n z^Ae85fv&5Wc+O_TJZ03&f?bqs*{?Q2{YsxcnNB}Gp}4mge%1NyNXmf!v8IZc=$G^6 z@D}WJVRNVIw!w+B{_YpsUX!_+3n_0FL|LXlLus{@93y%m{^XNqZ4%VZ{5@5{f_k5K zDp`v77}6a(2v)>xGs;F*x}&Lw5ubF$!TVF@GOfqacJ8O(z?&FnZ5+Egljq4gm%EzT zJA{RtOuMW)AAy($9HY}_7zmyudij5CZzuEGNvqkqq#)OH?+-6{`HYs>LU}DIpi?lj z!E#B_&Q7}KGyHebkGDp@`}3&1j#e}693Ed|C8sru#1FagT+Jd4&&ZN}$VEk}#vp@)1mk={y>W}Wc_P)t7BRm%vlP!lWz<3JaV^NOo#gfnsDd{HAP9{U{b(*4XUx&eHFZukC_hI+($mOGu@6OC;|2{{qW+OxrkOXwf zz6E)KZwFrExlcNPDx%|)eegPCE>^~O9tP_|53~qP6t=2uC?948XY@o%&It*Iywnrb zkM?`}{k{=I61S-fFluFBe!$~?y2Gi%l)`QC7tQH zSkGgU`-})fL>E%Ug71yHxna^)Pok@{;{2}hNUSwFxz z`kSJA#dN!E28}1ZMvT$=4=xLZ{5pINe}*qr=dgi%JtHkwqAJ{WVC{iUOII|YRaGI0 zxl1T!S3@M_GQ^J>Ze65Xy8~oi-B=oLterw;!0`ewuvLJJn6ZkQGTTu^baWxEo~QEs zjTYjxDRQ&}OR`!{ujo690qi`F086XL@c;?VSf`gh1fo|%pay^SNcH_2ctl_eGyjy6 z&Bs`Rh@yEBfU_+$pIq(6?DMbr1+UG5c;x;0y8A=9b!$g#Q&Oo`&@+to7b#8&Bu#Ov zSr)e~F_OhhdM^*Xe?vqOtW_pIpgQ_MdVmL!iwP6yC63Zp3|WIp&Rnnl{)3lV)%}$Ry}0J#+{~Pw#E8$7+~btU4^%X6%um$gov+&xB@;CL#;u1?i0rs z{kyA&dH^DKiS#w*=0u>h#n{(%GxKC4Q`BHw^SV|c%zozYsH!gAnEMB>j35rBUA%Kd z!}u9kM|4#pT8D>WgFSy00iiMeg738BQEiq~%hlS@sOLmXk=ag}UL=Txj{k_iDX)0p z59UL=mss7x|B;=+$k!Dtt6_e-d3ij>f!4^_*ARnQfX__g7Yl|cO z;j&^y`A=mdOB&sQhY;Dg{`||(a1*x*g>=B4PCMcviXdo~*hO2e*0eH**Wo*^21npY zEbgR$OU7dxI}t%vJ}@RL{NoF6DRmDx0n`|UJq)B0{iJS>hh(C&|Gj2umy^%}lExk2{O`v-0`y-Mt?FN~_?<2^>~ z$={URl4++0qdJ;6EfgDVf|k~FD4rFt$AjQtM;O$PK(|u`j#vF_;^h9HH-VRE=~1CF zb$=S`9BN(A>y%IYSP4=HVef43jmX23j(@Z+6YaM;L&?3^~mUyYU$I!A&%VP$8+`>Ps&GbgVsU zWet&!(7kV8&fWcc$z^1vvMZa2iPu`?pM$NF*F|xKUjIaH9m*ytKK&l74Jlx?wR?!) 
zTZud4yRH|T)pHB{{&7`K7ZX*rgIFE>w0?w#adHw zL$OUon=2gA%*i}wIIDp;BR(bMrkrn+*OUy=N;=oB1Srx2Qc&4g;KKSAj*`N64$`>Vk`gPum*}Z#qdS(T!|+}_e63me{ELRP*48sv$na$W2CkPd%`Y&)FWLfah$r8M7voABVa08?33UtaJI19BRQjrRe@XM#dj>ER z?}$XkhXT{vYVCQ23KH>3MjlIg-_0pp**Hz)Kze{|-Sztiz;Z+<`YVf%y-R<6vpWn@ zz6|@IoM5EImHa7S8`%eG{FUg?+o-{sO#MLw9 zU^z~&Y3l-Zfrd1uAVbzNQD_j>4Lk=*p~ckaGW|AgKAa?xc$qaRsM*6q+9zcag5$Zz z$fHrVBlQgID8y;mN3TO+5VZ^TH@avkxM!t}F#Zbol>UEQ$#Wzn<>ybTF}ECHC12OWKCp(^VsT%`t0@0pe3 z^K*ekX8Su)tIOcf1svk!V)G4J7H8QfeRBv4O3L$S$8E71rhZX$AX^igl~t5q&!d!7 z1qZxK8dIyNHapFxhV47jhNzMyAiIPGgm|hdY4v@%f9Bw_ySlHL)g{SU0Fr3dS;p@1p~ytwP_^1Sq85WLurA+@n8L;Rw6@n|g;Bn=Vk2Z5?WA%;#YzXEJ}?7+e0b%g+AzBIH2f z-Z;k1F*~#8&QqI2{o#h~!&?M|308_9Lmf$cxkrw|=nZpGEWorxk-DI;0+cjpKXc+! ziKNTnBK=|O9MnQV3dC!xRY?#-^P@oO+(JgCK?jpn(-shTN{Bn zvi~7Y&%|TURX?yk90t1w4PxI4gH^;}<1XIFDLUmaC=s@QxX*jGY8}2vk?T8OWoq3@ zr~hH35t6{9AtmUYNUH>0R0=l-n_^_MSmnoMy{ZlnEH4(T186hd3KX1w63}tGbb|ft z+A~X90I>Bx7C7bF%DSnTwuVHO8xBcJZ0MXn+?BEBs!|iZN^IYEjkEHB#d!kp6Iv!S zTWXRHZ&{C+7mD?ioZPm2L0@!%gW1Ei&#+&Rc1a%>0P+41ebQLvq@n9gA0t}k`%v`c z&yyo}93)7YD4W1! zfwG(2!)^rS6Yli7u?sfi9&}(D082nuiLLDVgeGgdHgVWG-MQ5UpNQ53^I_=+&{TH` z5OW5gzKgpCtfQUz2E;mdLEt6{7~tn}D*5V%K_q46xM%-{5fdAp`7}2@Bb+fZ&MVDW z=p}-0Ypo;4FPxdpyck}~jCQf8qh$sdQ9}<4V-sh@fvu}S2YFF${zS-&oseJl>6CHXcC*w7{92ro5TvV<7i zYWaaOpN%v)7GN=NFYZUw58DVm_yRVvI%}ZYJn1U;-zK)DqDKfif3#iJX{0BeuVDf?}n`_+2g7CM#F}Ah<5jNISUIn5~k5 zO`})rxwNt)zCb$$bS)Kxn&R4dH?M_lzpTC%>XfX9_5>Azo3vnHIjJk>2A+?;vVUOg zkg@Y;`}AC*TY21=LyaH>WcYF)kfs)qTj>}+06qz4v7WTr*1or7n}isFk}{n9`{DH^ z6^Gc=-`RhypsL-Sn6U8e!{&ks?1Z6)+k1Q6^*2q_-!9uPNkb7A|0C+gyy#$5xx|J6!f7hsE1H zWXS*UC2gqi+LT6Pj*cf7&!zRzXwHSR<#uDOQ^v8ZGPr)_`E)=MeN5joM>~m4#EpAxsG& zkjE*Az5a2zK4jpo+HYa#{>aFuC?=%!KP_+JOC1x$bFm25cY%O9QtdIxJ{LtRGD z*+|{p=k2y+C}iI!8>~FW8$ueuuLnQb+#OqKqU;8g>~0Z%OKViJfPP@^(;FF*;<&5n zs{s}fKMAwf*r0&QH7E-ZpP9W)ua}g-f2;QX=@07dxt#b>&zpI5ml`xme91DmB*#fg z`WEfMP@zXu+ERobJe-uWx1T4C7a#5s@6EYlfDyoT73NL~wKC>K*xPQwNV%T&ErBal ze;XrEvP%5tNd}n$#u&$LmC8#d#TDVl#=0r zK4TJsNT^%FSA>;;TWXg9I^hnqnbj3isB(P&BeLOD-Nxt(Anoc@O>f*~ zl?%OYIl&v882f5;a|;T+y`;+ur*|S+Y^Pl?J4WPDSl`ey7`?^qhGckrTs5vRN0_e1 zQs)qgr}O20@6SP0>jNUwpu5~b)Fc`z84+1kb?eEdl5+R@gk-yWnS*JVG8!!zOf#=1 zXVqJ#z|!&BPDPs;A(DZA5k`1Oj@kSa+rwL1y*~$~tCN^mRRbp!#vH0$G#SKv5(S0m zEl6L;a{!IkrTTEie926{@5njLD=;TLz*l7>nYNZ znzk^fDdGCk{Rz8!H;$JiBMvQs=mcGaKsWl5Oq7dG;y${Z#%7kaoPrqQ81f=Z6Q$HR zT=9e{G$brws~zmYjCcY2{gTtn2tZ-2oe;o%V*!@j)liU_gd?#^S#mv{Ud zfa?=$`i*VD9*Uxz{UzbkzItd;9Iy#y1a7WiJ1D z%V{PC0XAUytb?3`Xu;VoJ+zd%YqO5yYyj@{qVnSURudHZkW?samfZQA?u@`C-n*W}4vw1E=b�r9wL9>1 zicnU#^`%AUeztrai{#N;m-hk@p=mF6tUo8A)ArA2F{68|Ugh25tnSAGym$Mw`8Tob zCTV2Yr8JiTM{AW- z8;E{c8?5-l%m{~rjIA4c;RhKMUBiz<;veIV)2#)!lHFm}ONm}*^Yv?!5nR$-=f+lnyA(n>ZD+a{*)UtuWqCg zv@VPCNZzZaMiI+ukur9;!$s5&@?eraj~@YvSpj&GuC%tQC^rgVsKl$>Ykmy_c9#sH zldefwj%$$idnX&fRkL_hS^1_2^R@+{%AL;YsuX;L_U%m+j=Zl`z{7VX*<~9&p-lsZqG;Zpu8$Q=^$~AOsdRr_p$}hOt>1gOCu1Bp|H{hZw zV2UAGXxi({k;RhwKWjckpgh@e0!y4l1%6#j+a^-#iAe^}V0Hg@?|eTBt=_+7TMl4f z)@)HuWSlQesF0d#&%G*Ez%pC8%C9s~|{y?x*Gb`^^z8ZFxP9=*6s(k4j|YLk;<`Z;)r zYy#4I$xgB0-T$0uvS6RSIQ*BN<=B^{h0603ERxBdd9lp`C|1{l43|vMWf!FVMf(F6 zkHH6HW!p^K=N1=eoBEUnR{9ZcR-ebnN*+B|zY1b)mkE)!Y1pxooE3DmcY;c@qBnv30Zn%GIsMORv3wpQBeek~dh`$h; zKI+rMwzv8Am>z=q3I`CYeCzRTml1U`tSzc6za!Co}&#^NHwhL>Q z&4Pk_>G2~S5>yF7uhpcS=dDrTs?eL;Nwe7>(Xx4?yJ)*fkUTnT^of61f6)jzX$~I^ zyzf5Ma<60$3=Wo2QabyD!o^J;6n)SrlC`nlURZH$z@GuueWgP$iWjM zYQ=xkXFu#z-JZ@>{e*Qh`alo@O0(8S198rbfnjP74BWbWi`@G9^C)^TH>jc9NaV&Z zIYH3tc_@Y5=*L{KGO|LRWK5Ds08A>b_heC(74v2p_Wu4?{7$@Z%w-x)q1OI6^e--K ztptA}KxjLKZEbZExIF{D>B} zC)e_3x7~kNy)_tja+lghydd|PA|Y#W?!cx#1N&I-#6EF8I-YH9UOP`5Z)?c~Y_wWF 
zPb|z8iM!_l#+xMvY2J>-4~XYR6AR$MM04Rs_9+31x+^zjt{#N6JF+!= zfTt3LGnb~jkJS{Z`U)zI^%tcCA~4Iri(NOge{?)JGcRw0)ua^mQErx||Bk7GOdp(e zqVH^H`z_JWNyc2|depxt9qhxPn#<+=PD@)kOw6*8xAECE&`o(T{f;c&Z60a9(Vfoy zAb~k>PRVAIO*}SF)u;V)Y>Fv{&8xJnOf2BbiWe6Z#mMjbHG}+w=Ku>#m@m5D3p)ZS z`>9NeW5IX8NwhNylia~W$RVctH?B%O%s~?i(C`g}e5bb$8L{@UJRtiOqaIFBxB~n>fjBvm>*o*mjT?|1dE* zY4$gp?(k~U|7rj5CGrHON5q-yeD9F)7XXkhoPXq3k4|BQhY!^<{blj-4$2T+!;Hx; zgu084u*mC;U;TxHNt`0m zXM=D^x=?>C%8jjz9RSbBu4foj$wLAGG}Js$8XK zd6gU6C^XB|Tc+D072o>s%U5{eOS-;okXT*Z5-LAwSIyZ&-#-YS!f=YH4-~; zw!iM#>?UR_X7`QW%!28{maFa7L5W!E5^ln_zK(dVkw#W)t|#@DPEY|QB)XyH z8U>(I#%D07PU6yJGtw@nl)3ttW|k;rhbHK@Ua$(Q)etr$#0O%zm=5dn%H!(5d-$kG zb}#O^xikHjY(3+%#r*E!dgQShXB~H0CA>f)ZYzB*ne+9|V?H`&`UC+&wIkDuF6*p- z2h2KiD+xU>zv;jm=m7YI!XgRGd!X-an!Zk1&{dO)y8pQD2$?u35QM(zw$^g?Oxz=7 zX_-IFt#IQK#GI-Boz61gVDtVgH9pHmv2@Kdn<*@eox#1?0uq$P=b|uAE=xxWA#1Z1 zZoYHkZN2r1g{=Wx#}c)?$TIr zcXxLS?!n!ITX1)R2X}W3?oMz?aCd0jq4C__=bZiC|8U3s!eG#g^}t$>%vrN)R`!L$ zEi#qetJMV|L?AmPwAJ|_1(&6t9&IlFq-NLN2OeGsXmBeYb}2t=$Dd#pPsmPnrazHP zjvb^T+;0jY3c>H|yw~ewn2ah)a+=_~cv7Cv!rFZu(tR^Rc)OsXB3EzJb8qw+ft*=< zQ{{RH#j#dEGQ~S@KQG=?0;2)3b}1@|JSGxmBAP5;Eko;#O*mGMpquWTBZ5=N&17eX zf5m)C&L`d=mlS44;e12EnC2$|E7ZCc%bY29Rxl(h66}$*>H0*tK+jJbWt(mDpyAAP zSVelh}zdz=__zK-yYbGV@|XKyZyWsI(t0&vULTq-rr>1)se5+ zg(mROyjB2L!5AU&4c|p5%eX$&z>V0#Q4idK=HsluJIGTZJY`T}KFDeW`(23|N`eR< z+;&*#@7vui*hky$ya{a4?aTC3usoCBr)OO-*fGqcmrO%kE$(SYXchX(1p_yEU5&QH zY}h%z%WQ@kLm=3izpelQSqP7MN^x`Z<0xY(RyF2R?Q-8V>I{xmIe}1n_2hoB;qEi1 zvx&6>X%ywz@>QZ?7n|>Tp~G&hk)#!=;zhibq+n?o4Oslzo~)x0s5!NcV{fYo#LfGxdm6>#c-Aqcw+{>CT;?JV8Zsy6l0Bp~z`7WNI=jB4!tSq>p z@?u{tq)=azr$aKaUE1j#093mmB1t7uTzv*?^QH+K9aaDq?l-Fx6TEslvR?&HKCD({ zv<@e3F)JLM@QiV?v=o;-5NjQZAD~#F#mpWTffVe|&^B<@mQedv|b)B z7FYH>fidi3II>F2dM0i=qXFkN36BX)Ya~Dk|B3B)@QyJb(|R z-F0CHRqXu?mOHxH3k31Q7CKN^+>_QX-o@R;cmE`Pfu6b8kqhK_w@CDE5GTdd1+CYg z5-$+3_P2j#KR5=}e-yK(9drCZMC7kMKfmK$bHfz2Ve#k^?=W18d>G3C2*PVABAe47yGn<*C<@K%W{ z0UJ0dsH-Q9h<5Ep5K__5jBL1_&cKrpO{O^{tv02fSs!>$u?ruv2J)-TGFy#>h@F}xMpJ7B#{ny;zS`Q&9RXMC#TruvtkArK+A5({*!^(k`6@jsfI!>V{PMf@ur zwIANes7rA4x@8rm%r=00A7&OPc$Jb9S_a5zk_dI~r zR`_Z`*fjMA(Xb#Shkj`AuRoczca#Gh2ECzj$F3UjNP=(6T@&FfVG2G72jTvEuFu#_ z`gT>Ius3L=W3AP!aC{Q82An^$2#@AgU2$<=>F4!MQmS~}kcpVn?-68jn6>hF`J$2t z>D0r5*RJ6c|u|oPT|qrxNKY79m@n%Ar`-W}e&M56(#3 z`h{m%9$nv8e_(Z)2Il6Pk(fxoP{{~Ba-3B?=5kAu1_@l^>t;vVUS3|zy{{z4sbN*U zmOKzL1dpgPjm>ZXt9ADEo3FNvX^htmo{P{sUv9ml$ioC+VF`;t_VeYn0uK7U zW8Fk5E}L+X`1g=7HB!&!&I!!b&6f?KGII8=ozcWIAf2hLq`pAo7CU5jsYLVp(7I7Y zHh=mRgMIQj%VC!z*uL5mPH(=%LUr?nVXE?z2VuWkOOB6dL5s1H@gt*lnNTV_U;Z@| z5G@M+v50Mv(RzCy>&s=|(obmKWsf#<<;0RY;RmlNU@HCjgJ+(Vg6o-Wf#%i)wJ3hdE#*Nr|Byx*BLJYP$QOQ56e^8pot<|0AOvoZ5x7``kXzD8g5z?^B z*{t!bwJG+=U#@8*O+AF0)JGa{a`!***vwI#Z1?pMD6mPG@_}2Wj2B7Ad^=V&gc68{ zr1W2JRmA5g$zJ6hJkX#AcTY*FGh9~)5D|K3a?0M>Av50?t?Iu6^{J?%7pyjEPpq}9 zYnC*K=K)+2o1VILk2c-18yA0y!N)O2k)4`n9(Fsqdp8{n6Bi&tdB_uR|6ZftCYccP&UTLi1>ZW9Sdt&0SMVT#RfIZ{K>Bqe zzR)6-#>viDFt8>PFPz*y?J1~Gb$71I6F!bGY}aB8ceHmWWdC5pZPbRCS`sdsYcLdQUxaq3#PL;r_p&xQc&o##3{TXWh(Z@cv`)vJv8KE_yVslr!_VK z=B>ZOcn|Gc{I0~6m6alN>q#BkyAoaUr1qNJ%ZxayrF`V}zA)<3(^7u@)M2Z?+qCF91vu>)TNy1n95Nc9>*Bp=oaq{+zp@)d5R;< zgkHb`bzfKT$z6%chL;?dJThc}cdh0vM8!Pt=A7kwZ8+!qR!JA%$~_VG3dqan{^exX ze*OvZdn~&<$CW?LcKZU?Jd3$I=>m>Jxc{K}-u>$0AED6^L!au{_erIrJNB2MFH4U~ zBpcg)_c9eML!$~h-%^(FFQQf?R(MCcNeAG9moYEj!}(vUADFdwNMIY#2jks)bPMVo zRvV>Xr4`<&LNT8-U+=d!k-g((eZxN=d~LKyq+rwvydDJkLEBXlX|V< zVFipF8x%GD#wH$Hk{F^B1qP>=jN&ypb*zFa1ubSq>ox6+-+VZZ(QzlnmqjZ+le7-S zhy{4m9h~#9rUegRZr1gTZyS@&e3VtT)$7ocZn$S!IINo(>7a41{fP$!Y_Tv^VTk-As6gev2z?G&Jfus<$(0!*i%ngTAXWTuDeF&TcCTvnD1iUR=a0m6qm`Vo*d}x4U0>er6{T}VZK$UHB 
z$N}>B+EbNqZ*~+;*3jh3C(#oTOxEb7cbI3OHR*3Y^>*X1&m;0uemN&jM~6$_=DqX5 zW^+APrvxBeb3j&lJ63(Kc6`sZtH0@H_t)NTxA|~}*ad8ARoW>b*PFloRC0#C$L!XD z_ho+X3G3)K%STYadXP)`c!}U&@`1v_qtXT27+a9e9X{|r8Qv+iBfq!MOj*e}&o|Rg zx#tGMEg^2n7xFQw%InhD0TQ*l z6=%C<^4?z?Z?t}&kR1MWZ|=OttdjZq$3hrgKb<+L&8br?!7d-(w@a=~lsQycbTVaC zfJlnvzzH_De!=AKDV5L!cftHo#_1%MMd#}D7s*9N;vdZ!?`un&yp<$uRkhe_qbV;mv?b4t8Uw^CvRW|M{_Ni{A3R1J|AHL*zG%I@^arOx$6JzW1UYAA9z{Ih!MQNxZasUuOhZ!qSpeNNgtpf|2;}(md83 z>Z`PoqQ7VMlLWqTfRB`FpHiWx9YH;-DEcwNvwXi>SA4#oF-JWv`z*v&unjQYS1UhC zn;fXAq$o2+RK@o*Nyzo2js42wLn|{-+5Yj@u(kLFe^^%#w5v_c!Y!S@?e#ew8|Un9dg+iM z$tnK^iKXdWdU|2_Q}l**^d7s*_3Tg6?4ZI^2bf?2(MP4Xb$^cIW;bGmT3^}TN zX50XaVK*AKdy~}(MPu*R2xQCm?n;u)?d!Lbz_||BHHU<3pkbo>)u4^XslHeG-{-}^ zanD&%RiHnzKRFA_%>APNG`ICTTi7e^%K-1TA0@c^S~E*^^(>ZztcF>SOEX*hD(mYr z^7HXlwZ&bM^pWXq#@@0}tZiARV^Z&t11A~ZZy_eY4k|kHGWu~uNTZ{!fXY_iKBg|W zT^4&{t#Pwq+G4k2g}c8_1KN(0k(8?OiZz0BtrG*9-;{Z$_z$zhlXMs=D~%&3AV$Qu)KWZ_eiu!M3l$m0#yy= zJ8ksOrz0*sZ!aVBw-214i9A%o1wOZB>x&D!kn{;KOgZ`LnH3;vUZtz-+V|LffKO>` zyoX|1mLq2cXUV{IcQ^944E`bmofo?ET9ShcVh6x3;6X?sWz z3qy`P7n!^BLl$@O%29~Ih*%t>55x?J80*6FUEMX@9%r5Po7RlR05pX?hg_xxivbbg7fwKq3n!A#CuevmubBx&>u=QjoYqu z0$g*dGedhEN@QLjZHg{zA`e86Ko`Cz)%|HwTWpjU+WK*r9&Yv5lbh7bw!JIg2Wjn} zCJuJ5V3dmtS&A`|*Pg1EbJq zrh$c**Erx6b<1$w)^}a%bBLQyEr=HK#4+(bL9W-<*Vm}|^ z%a_?$Xy8|7mUGIh{HGt(v4pz7_1K!(D|a_w#v4P;tA}%<4?T_NS&>A7qupg>*HRt# z>`dQe)k$4ANf7B8;3vu8+^ul{*E9cR_|dyygPY7i)S=<*CA?dz=Bai2o6JIL3Q4gFSE@%BsgcO2#oGwS+(1q}$Fz}BTyjL`$ zmD&kzuM2RetxvCiC(;wPX?)xI*Z=gY5N8QN`a^2mh(7Ivx`Lg9Gku3+7#?@taI&mf zE}5Sc5y+?Is;|!*2I|rAWMV7cpWGMPMkJAW&OjQ(OBxo#J`7V17QKVBT!wmxRP1s2E}9emyFjooT6 z-w$mEmais9!a1oYm}Cm9n)$ow*?iF={H_O1@SjrA`y#@nf8mEVx1o6sx4$wd3Jx+t z8VNr;M#w=SNC%G`PvG&X`^08dNV>C?pgT#9kq${}%|3~1&AzBXb1u{0XNW|mZ^TP} z&cq%1It{;k_K35@HarsY;pF=^u-P^Ly8&~PMcOz366%TK2w@L%XIV_~{nYje?~)lD z$k~}FB<819I>4e+0MCrr*D3y@g?`S${kkwdYM`N&i}u=6fOhW76mhVt_vq%5u-SR; zx!UF~S$mP-Mr`r|dC8a?88>fikC8oEM;hR;)#p}iug9klDYe_Hu5R!(w(JxHgTR!<< zL0LL(jx~2NZjr{{muK_qZfHGfTS5|jT;rXI`(Ke~vo4F3Q#{I?j@=8+@d?^4ZVV8;#a$*+G2G;$O^s<+wp!$v>9xfmABiICh=AQD@Np$S&PU zoezJ=>bb)M?3=v*5TSm#7;iA816$ka_4rayP*{Ln)^3;do$&X4Yfoez`gnDWXg=nu zwX;T62As*jeL~xLrROsQ0g_h)xgO0XKxGye0HB2JYHRf4K8d1+v}9m z#Yuxea9ymZKnP-Rd?Hn!%bDZ!0C5G!du@-XyNqmM#k;5%P!3k)#zpJZ^|xLJ4Dv_P zF8nUsEY&Ed!Wnl)`$@vt?y%LZ@QKBs+$EDO7TrN3dj;BLm=fX&+ycvHM!S*ViS4Oe zCEsyIoOo6Yx^6xU&2ajzIU$YO?)CF(w?$oLI=36%C29LCTtvi4;v(u1aWzB(rBcsD zA-I(B_@~?XY5+Z{uHPRsui5L9*)h{)oEM4CnLL;$631|@bK~{6@czcLleb4oL}4gH z#M(2O&w^-^`)kzTHG>H1DhavoS#=x!T4cq281cyK`hK8m1f`7}AL2BQW7m7r<@#%F zpuH`;+cs*$QZ~;}qUuH)h5{O=w`JCT;rpq(6X)o_E8Arp*yg>xQaBR z_4Ba65befsKddEU{~G<9?AeFVl-i zOT3UUzcWnbpSitvct2~K+96EgE#Y*GI9WYk_nso7h3~%UVBSjMimk6b40lKxeLd?O zMpX%tHEZE!K|RCcK^2BO{q7d_W@~Zcl1GR8l6v8-*<+s;6lqSz>N)s>kqjI`s{|(X zy&bZ?_^&nF-2RqCx5*SfQD`sM1u1CYZq!e;fD6d!K2FXC8c!~No=Ls~G4oV*68-H= zHrt41o!pE)F8_Mqqirz?LwjqH%~pYA4o9{xZ;Wgv zO1!mOO1Y(oL7p3K*OE||nW(>fMwerhL{X!K)OoS_tQ3=j4~EL0Dp1keLc+2yRS# z=x2FX&__{U6)ylmIAincxF_iuD@pB*sSJWkXa{=#JWOyryFEr99S^ow-1+KH`OO1F z^5GGzhDa{v>r4KN=J^9~)pmJTzwhIv+6QDp;7IVim2l{-#SjXII+r1QvEG&r_71mI zea2zb`du9fn4?fVw_%xz!6kty-+l~>?S1kan1V%x&8u10vQNQf$tsxNTH-kf2F!H^ zVZh-&P8vUd5kR}7%wVo6G`au%#@d?R7iq}n*I#pEXEY$5p{FGrjLBw27I)zgDJP#=BlG znLEvxc!2AIWJ9@S&>!@A5~}X(fqCd5pQpT|%^jfxQcn{7NP?B6$gY!HOgTc^F_X+~mbgkr$0k zn7=HCHjIdbtWjN&I;@!nZ#Dg&I}RfUd#b3a2GJl1hY=ak-Gc`s#Pu4@NHxMH~xRF-jh~6|Gx_nZYby;|L?*u zFO2{13|E4quB)rY{9{N3%<#~p2uq~I|M%%NOT2}KP_yxUNe&cCG^gh`czSt~RSgQ% zx!75l!#2?t3a_2ZV6gnUz7VU({DzGZ_CeT8lTdt<^y77f;mN7{|QfZR8o#4i~Fl+qD!<4;?;2U1Or 
zQt8+?cMpI3vqI$Wj0tUT&qs^jiZU_VMj&o4g_=5S;Jq57W-Pr$`HA{^eg%Pd(J8AxTtfnY`T)kd)nn0uH!Mntmcx2i{yt0>BTzZbe z?j=;CQdP^+;c{{#WPQ&I=I)!uBPiH`qN1DSl|eh9=BuN{GyB3R*#Vfi7!1q#T!Z)d z?5@+_ho~5sbizrz+#lo}7(H`D^e#orV{_QsBYi%&EVn62sQ;b9vUWpwSt7luA*#Y3 zG^Xw2e@y&};(u8ls(;b5Qm+i~jBQpM>efk5&vdBS)UUK}*s8 zB3oGOlnR!HY@Q(ShZO#IT}sF@4lXaNAh|Xkysxt+FV~3}8@--A-(K8-l zOO?nX%>P}AHcC%VvfjGi?e&CYmS&!Hmol2&E;yP(hUCGYd%^v?D)oqlf%LqUQ$Hr$ zwa8#BKikI2s7U`B@XL)1AxBg7w)vid{V5~Gsk8`+o{|1>0#6(dPRk4(gV6gQCuIMS&8Jjs+CL!ySc7^ywjoiG@m7UxZ4c+#g3nW z=gKqL>Zr^Vp`hqNoI8)6$E(hfQ|0Q?yqUS+fnja;5$jsfRA?inYx6!XKE8MUKIebu zcnR$F%|rTih7KZHLQ28UD9NNilf-DYhmlbk2}*M+Dh|Cn40Av2l+;t~%izM@Im;&( zFj$9N3e3x6cQmNO3sO4S>0+z3#gVV@s5cj2G4C|@TP_REXs*>JTy7NS#Z}~L;$#n{@8u^%=ZQX z#&tZ6t=+O3K5HA79}EzOS&c=xr&Xe2f}bP9zX}#5;9#d`eKc+5fXr}G1*k{9>hI(@ zZe=rbF5CY(bbnx$(Bl*VYml*}5&!e%Y{++9uyRvqaD^X8zuRZm7zxjq5ImyxsiY@d z_8Q2?#0Wj!R@T%c1{DDKL514X{{8GP?`MGVZ!!@aXBr&^(@ehn%PDp?ZIz>pFFl$S1%EDD9T^Q8yjwb4mR zVC*t|)AIQmZClUZV(he(CEN3%ahU2+Ldw7Ag~FwVU&3Je^(VNYDY>{*JS{r$Ihj4a-bI zTgN5y6pH(7;09z+in44L4C07wI*B@@AQ1qpN#4z}m^PXPoaFJCThmv2I;1-NwD#D#gzS6R{hAFF2q-1o;v z<;}jdlOS8pva^zPlhG* z9D8j_o6wubwM}cGp%a`LCM@N@cBq#={xj2580F=Z>*w9gU=)ty=&~faB#zR59 zV{;rzvcf1@ifH^PExYMJ$Z9bfcACymt?JigU{EYo$IXE`6_3yt>9i~pj0h-Yf;ma$ z%WY}ctet*{TI6BOgL3b*w8}&nr(CSpZ)xx|BFZnN)Kvg2EaN8LVYVn%|7jlWcv2=) z@n8n*)U@>KuPGR$)PaJg8A#?AMMI?p?NX5@riMTJ`V>i)G}O)6OFs&q%~zj9h~1IC zd9jK61@AzKGcTv$47EF>g+jksShR6bPKT&+?1^3)_x#SspRvV92|qWrSI+UFm`lzSWgB_-AS(!r$0-wXBuUsSGW zttDOF936NunG%3ClM)uowc6&8{`T=8!vUJ-wFDP*C#r0=$MclVsLALQ*M?~)z|l3ikft_Ra9H#!=a;F(7eH~&!X0THJh2;l$-&k^p|vH6g2z5 zA{)!w_zbH$Iz9sLN^3>4!BO4z5txcykVhU2$*?vo6K0EF#IhFUF)*1%v)|Za zQ6AQQxQl&Ut?S%OqP^>=4+z;&XimUvKsj-FA=yj(Yw>mG<8P{ql^;1*J>-peQmYe3J zcN7QEJBMCym?27o5B-s?F5bc1BFe_;po5-M-^v9K_XdhDHs(K}bfQ*h0znx-YNivI zKll2S`~^Wr)U&MH^EJP;x#IVosR0fSCV+?nSL7z%0)>d-u=ILoJoWdOtgdL_e@e}k zvk{uMzbRCR;LX&%zU83s@6R^+FH-38g0vq5{?UJ|J9a6}&CQi`kFYhp>|9;3BIB~8 z0P}p6Ckzm@*_O-)C8_#T;Cl&lpTiltgTzEd4I|gg@vxRUFykPE49S>;{AO*m+!Pm_msGPdm720Yiw{0^#~9j(Nk;jCnJhWv>g@=D3@PL8m)P&vDWunyS`l9qtyHk9(Yk!&>Q!@wSUV!=Qs=> zjTdG@GHJ(L!;tLRp@&f+J_j8wgx_vWQEMKCEi1VymIwuGdAX4=ITe-n=8XtuK zXSfQxd?2pnjrlGe4atS&HF45Dki4e>~5TGdyR+p|y&+tD+t9o%ipA)0xuh)Yg^8Mqqs+ruXHkO=czO z5`VNes5MIrRQ>q`iwC`3(x0eGKOssszvZvY!<<;p*CGVPPTb|Z#NTsfHxZ6tEXdzsh+zvJ+wGLQRx4r?*{7tRlwjCJ8Q%dr+TQ6x$&N(yv>qK)oWE%uX?{AdEhfA&&=`O2n|J)OAkH zsGE9fi#~8X~MbZh(QU9cP!&WSeD`SXh{jSYa?^9jr_f z^d8Vei}C5m#8X3ZoOu|6V z9KqXB)f4Q3$WU6-PFP=fc=+u>vdV<}vLP*)*=`;>-?D7P2=>x$^f6g^=H&*Ukr9;q zlCjyqjeoj zsHw|Jo7JG}@Z10o8C}21FBi1*UZOe>YDn|>L%zt8bq2va8{Mq>k0f-zA48VEln?8S>PaB{n`e%x1vNxc`vQ5! zSDWVd+eK%vad70s#l?#?1G^g0D^_;&1b)K6d!1L9>^oAz zUw`N}xF;aV{*DJHA{JlEv4!JNMFz61q|(3O(2#MMGPt9tRZkDWiEXds8(UcJp-m5t zHpn>;>e-PWkR7{RrNHFQy)B8xCcVxO(1P;)*@XROTwHICKYaX};kymU2HPjMZC593 zr^DZ`i3c%Z=zXb$d}~&O4>c$469v1IcjPQQIM4jQ$$vxHogZw6P(Y5JuuxI%&4Uig z5dQ#_*VKm*J^Y)}AJ9SYxNLnWRxT8B)x{mJ1;I;elnkK@NxxBznLbI1@u`04>JKwB zCCuHNY?vahkPMLfKBdd<9R6m641WhG!Zq7XBZ`l~^2pJBNuw zOCTW$s+Q9@)F7N0#EdT=inINd<+vIBte(DMjvANM7zYn! 
zU2i2`WX;XmuNsv%zHIi2{^5!pF$$A*^}XXn4zXfOJb9!EiN( z;1*~d2T2MC{~+A@PCYFHh_5PnLaVnUKe6M6pc?BYwieo~#I_^{zJUNyM8;lrVYn*sx@ zdng1ZfU*6>C=NhkFcivN&j~>^4?^u=87F(e4uM46UjvhxoY&65+{4$|zjIfU-F|)dDtkoa9mm61*h{*aXg_NVL zO*Vb{N$aFD*iUTKzmB#brheedF$l#T=M;$-6<#rIHYz})@d8t}a8hyRco+=S%_uaZB$$d+ zj#!p;>)71D0--XqwzBv(tchGad`Cp1>6K+?1C(_ERf`wc-VAf|iI*U~8Q2Klfouj5 z3Ppz9Rtt}*3C5q@q0St7QH_lp!o8$s@c0wlBA>{aq5ls`!cWCHRM+}){6w<{BsIYw zWQLGJnksS!7bPb0|Grsw+|MZx=CJfWnR!31S}!7q7<}fpK=%4>KRK~D3xMO;E7tUL z!6Cy}V4JGeSl8Ly(jZGbz{UpWwOR#M=g2>3sd?02%G>762h1%`tCR?8+WA#T@6o+R z3uYiVO-=ROJ%?6*!vX36o(K<(4HhXw$S>lB^xGbz&k|jwnRhsj6A5lWBztIa1WxB& z&X4s*R&CKX?CYcdD9-MtIB(bN>c_EEgvkA0A4_*6^-%s@9L*FaZ)vI6lOKlb_r8|l zvf%PH*w&CEU*9T2KAis&ES<4DTdHjY(@WrJ&-*;DjAZH};lJVlbhYf)i$_F-^aWbu z)CNWu*+>_d$vF&sN}IXYm+Sp8>a8zd90W84V&BmmiVx3Cd#G3_zDzdIbIMGbqN3Xw zsXFPu1&x`*ig}=x@n0nZn4vk=G+QvFNWCPvBmF{LSvEWM6XIqsiUvy)_(=ucJCRk} z#EFwilq$G|(D$|`E_|yHHK!4Ue0&~Gss9mr-Cu6%CxP2U?(_%$!}E}PeY~K5O8gZ3 zFDPG&YTkUQ*Jdyw&!~A0hH9w%RfH5KCgn?N+GT%{38to}3U~-~n|zJ#>kBN_a6_>k z1eC&NdrahQ2;mqGhCo}+C0=5FjYlU(AV9!`f|g|B1Bc`fm<`tRx7bA1lf6W4HN<=S zXO!SP*VXSTzLN;NL2HH=9DO2p?gk@iNil_L5x)x{@aNd2BZXGoKQr*gsTyd}{4phW z9bXa0#3IpaL7Oi$7jSEy--R>74lx`eM25oZ&F629jCLl#8pnqE6*LxlMZErblK#C* z>!QoYy3Br~$@o`W8bpX`zKB1SkdT4A{1mtC8ukzl`3nlMz?VL(xs89%n3B|8;1z53 zruO%gq_!Kt&k@7x$rkMF#t7}1Dg30AFyiX%b&osS) zN5|L=)elc^$~O0kz6pN|KFHv4eQp{>0f#Y;B<+YVUcJcf_(0Lk~zicQ{A7>%=I;iEEjtRX&MQsA2t_MRQ<-`RZ)pz zZY1+lw?F21I1Iuo_U1UA{XQ6otCrOBgvoe$wf`;= z_@>F>^zQL+EPkeVzAT;Bc`fCAoU?y8Fg}_+i**3T+NN83IozhL##veThjr?aQU2() zS|-@9G#Q6e6Tt^++iz3qgAOaM=P<=~p1*A&t>$KzYox$GzfSxrezAKe6$0`psTi<{ zUkcjg+$-3X?=AEN=2Q>Os0NfA9P>=m6g)jYk9jIAO{7>7e)$^GKF`aoM#2yB^1koJ zntCRnO~?-n*>F4hDwmM3D=*5d4v48*@`k5y>zAlb*7SWb~0va{4Yo zXz6l|R>faf1B_291ofQ=mC;q)6~tQLZxr^Fj7nDm!Y)W&&9eNuymIa5@Vit$iJoMq z(U}ftnb;{9VB}6CkD^EfzybHoj|p&x$<93*JDY~i}o(KDw9zO!kI zWHlKeDXyEYc{MA54!rQ!=%odyhAYZ3Cvr4&sOnV=$*yJ%ioD5Nu<^1oQd6nXMc;mI zfaCDny%^ThDYpvF2=_l=Zx)z;kl}Oh|JDGT)h84bN|gKr$MM)CijwBIRGTh-?VkZz z1(%sp!LA8a+tsG611~I;@##@dQsyYHW4*P67VmF1@u=}=JG+=WK-wXJmUQ#3Qh9J- zjCc=9ZETd+ZS&1n)a?})mi#%*ZsqT#T7V?hv}CW@FoyL9eqDuN!|=oB&E#%c#shdT zUVHVI%-8i+`1X9@glL0Yu3TYsSL=dQw|;!(kVWX>ZTpSK$V&` zQ&Ttvj5v$S6iN7to#1iX>FBScR?h^~c(tR9Y2+`;5NH0i! 
zVCZ4gd}wQJZJZv|0>(f_edj5|Kb(DVx-oKr}!V!3L?C&_bvUVsh4)eg;TgFJl$Li}@XA zCKX7=k)>d#q)nW+=zpV2x8~P52GEV(oMEzm`RZaH^JSUo3&2%15T?qbPV^f-ljkFX zgpo%tz#_-ZL&Q-Dd=sulko%{vK}bTHuS>*3pJ7>(cc%9ZtR5CPUfXyI;Z4T0z?3$N zjdqtZVt>nr)5XSbC+Z{Te@szt+Ob7@96OZyAUk!xnJn{O9unst32KHn1((!0Bbp09_{A6UakC9((~ z1^!orzX2c!3zS-N!^lB{t0nD3JJF zi=QmjW@F>x+STOoyB&c=Zeh(Zv8(?f3V`oO0-=)UBFuR(_`~0QOv}r;U^p!NSe0@( zEwk6BN$BJ7TU?)H1Ms+@x%o>3Iw?zn@MFfv;cBZxcK73&-R-=JsHQh1#vJC8^ykl8 zz<-FQ|9N-As8QbYtjuc4r^}i}lA+WK?5V=t{zr~0EPvlj>kd9)yR6Kp!3-0jEP~x9 zlb-K@lTEWCjD27$i;qTo$3mKrwa4r;(fp2k6rVppUXi=$I!0e^3C zxy+|!45mykQoM=qUA?Gv=)wfZOgX!te}kbgFN8AoQ2+~}Ur0lJlh2>z{R&wBAYgto z&9SlU|Nc^v5Sxh$n}Hu>wW4{P(%nCQcw(7jMY;eGN<|?ss_oxQX6NloyjF zDketF&)?3#CB6;Kb_DG5Ztj=k+&j_St5dH*{=66*ypk_@<`yuNQTxZxEa@7A71B+0y~4(F?E z&!#fo*epq@sHn{9BS{z7FJylJZm*O`mR70F%}vYbCS^Q`V+yl24Vl-Of41X&7ME2@ zW!)XTl9(Uu)b2Q#t7P&nA+|an(wJqt(+GHKLH@S`<1qN7s)pY?y5OvqC#jInhEN_G zTZJaH=g@NBU$qm(N19T=(h;@c<%Gc*0F6Nb4KoQExm-Q`zisB<^E@!u>=8tRvZS`U zZ=_{q6YoyD^e4v&%rbNJc+qi;q586aO&6(WT@I4@>U7>;jMCtI?7Z0}TC6b+y@q!4 zygFJ;B-LYv`|ohpBABzQKsm>RoO2?t5gBij9IJeOf(Z~a>6T8Urg)_~GHUSN(G9r| z7Hz9WSuB|Jw;bBMf0|#e!C%>)B75PZU$bSYQN+ATljs37-2GtdY*29RtW%WCoMFlr za%y%kQ*tZ7!2z(w4^GFJ?Y>+>|NCu87&NPixTJt3K#7Y@lcLc`UtZnEKVw;M;leo` zxQKD6WMraWM6=Fu{CM4+mcwBZq2s=j{7n29RaMvh+Us7O`I;C>5aJ&5dIQZHx~?_% zr&uV4>M7~KiG|PK`hPCY0oo*lr$U&0%r6>l~9@L52`7c-zhP<><&d z8qv7u=g*%sB&1Hwen?VPiG{E*Ya!@FABPjD37?Qfz9{f)1ieGYq(Tq;?^n$n&HjdQ zWTc?Iyq=*)K7=;bR@=)lKN&T_`KV-B1hU$`RfFfo^8k7S1Qr0te7aSxWaM88r^ z^AdpHVdgwmH##`{S@<``rD0^>8wS7q&3c6)tygICR^~xu79PLWkz;UI*o_i+UYXtB zmJhdNHin{I_dX6G2EYX&y_eM^V11}rjzkLrCMnbK@URxy53j!kg@tq8!T$_dL@|gueIBu1%+GzcFQXY)2;@mj zO)jAfkxV;g3shG$Z$4lT=G9b+%cv+@ayg<|k(5u`FL>>!BZOna(NvynBIHXCASbv& za3~HY#Ton$Ffekw*s6Ztd|EtRWzyN>5?ilR^nfAT{+N@Bj!6`_z)XN`yVl{Lsn%vk z^ysMd69Rq>#fx~XA2fs+C3KDkYoGqWgwtz_zYhSl(cyeT0RQmhYEcgK1u7bH;`Pei z=UYPt`%%Zh9|sSpQ}(w?nbD9=qBsT` zPsvFTV-p%lS~Ipn4jy?(`?5Q<=Y|V|>!?QSt!n6t8f{RaVZDCue|H!paV`D>-xu9> z(khF~q@j39KmVbtx|%J^d6QKR@&Da>s4k&)-G2P2(#NXLc17P`MwJ6j4!7 zh@^slcD%UR%b9PhiVqR{Pxr-z%cfu-{&i7gCKy-Tmn6ooeGhYQ^6>N5#bZ{gl*NZ! zNqf)rERJ6jW+vK((v|U`5DahCxX3O>Tu zQ=*`Ioe9J<6v#T?3H2IC@FuohaUI{p!x4#f-4@q=E}YB_sj2E3I=QmisW_x5>)tJ8 zUe|m{NWeUp?VF6ov*U#v8S#2K{b!~;dJJk&{*OGRXF}V1_Z|C1Hs99aE|blL*IyT4 zi*>1leBdZF4D#ROvRA<@|`QehH z?9%0V#BJpBJ@D10P72+N0Gg$uZ3ta_ey5E>8~MEtosX;WCnAEXS;Y0Y{0&m|w% z8yQ;#hkz(+5SF%Gg-jV6ONOO3zjpS6!eC*%pt$ea;hU=K=)RCHz2OH#B`rl7_{Cr) zEuGvvaZQV_;xvuP%C;WjZ!ri<0>WjZ;zKxkBK&}(V^5!XP~X06m{Vuuy*J?wJ++oOUy>4;PNNvF;qD|`w&Cukq`e6mOtA=DhZ@G8 z(qH<6Z?LqjXxR7WI)jo6j|Vph#Z!&8z+l>!0YutET+q)V zO(1@8A)BokX7s)y(0Qf+D*XOzr!50GwC3PXmsf4a7 z>B|&3up*}5bUi9WTkIK&Y$L_Sqyk_TnASsGDOxP&M$((bS?loV`*@vyxH(_L96r^5 zki53mVEzzZ@`HwDB1w~~#7e>O=mCIW?#1{`)`ye6`vy4?yG8j-jxNzYFV>96M2wwV z;SUtP_t|X*lq-tw$4iHKVAFPZ#A5BhEJPjtFloS&qg=vsDV@5OZIl zaw9awTI)6{;`%()SOnhGT5PnJ*4uBKn&vFl+VsOHzv+`NU0vlt3{+^7dY9=Lj`CKc z!@C~~Fd#+?ZS>;-D1d&{=SHcKS)Xo^&s8eTqHbykNt4?(q|tTU(3oxCaKBf>_I*H{ zVg^OD^Yi+F1#}Pew}DBaC#GlinfGC&KmiJZz>iAZHm9K&W}ogvC0>b@2LQglbUSU_le#fkJO)u{5q~rR$8~N~75>mr1W(L!grF2yLhT4SprU zBvlgf;c>nBcghDqQ;8bQO4~;H4gDM0&1-S+??xz@ib|P+V5M*e;D?t}*w)jJG`nB! 
zB3yAR`TnoFCS?FWij9!VuKEU5?C{9*i5U<%G!aQFa`=>2+a4yb2nU=8bSHFms68_(u2p_hTVYaji(X%TMO7mg#B^%51JQq&U-_Z&Wxq#ytN_r zS8=lX_ewBuXna zHay0BXqeOA*4IW*kWr9RyY4R?r4RXpM=uYFCP$whPgzp!oya*TXyUSR&<~Dm8@u+& zPBjO)7j1PL7cw&a4@TNNDH z0yeZqvB%Y?1=5MXy$lFqo*wT`M`c+%RGh8-Y&Ix<1OI>;`j*F0&7exRy{bAfUT@vA zrt%Q8z=xk_in&$W<6)$EuZ-8aT<-j^cd%w9|DbyP6RbiqoYr@U2}*%^bGte z|MNC~Hf?>w%E=Wotp$E`;^Du-K{VpKV!}N6VJ{(bd2WntoAIHndfZbnQf*=e7 zde>3p#OLN=^hwH{0FE|3ZU{eI_V6Rx$w5t^WdOwo33vF5(Zi>>p8|X;P@?Zik$90* z+Re5s(;vRe(vZ2GhV>OX#mS4kYa^qa3lbg@C19HwlZU2J4JR+m?ks$qILyLc>eqZ- z?Qi~)kc8F-D?!{CI$dsVES-|Kt?8E!l0mdfzy7xWmnp>SlY|_7Pnndb`>(Qw^b8C# z9m<4>>%RilJK3hPB+`1N$FFOa8|hRC)l zOLUf2hF`#ikz1w!yb#u{h2!=2e3Jid2w|)Phvenp#nS_R!q$51V$$ee-s-=yMH^mf z_|!6C9;`Zmn|Ovl_C?u~VBM+CahzkZ)w=U9Ji^-)(!NRjQmfs2@KzhsIW(5d(0JYn zIL2rgPrh7Un;F%t_t#L=wqdhIweh$3rR#6Ve65%nLUYt*)Q@sCV?NTUj2D{U1%m`L zn#!6QO~hTgzJV4K>|xZg63fPzCU8GWvVeJLHj!Dw#*vK8f19P=RH_|5Il-`@0Lr7dv!|o+=4g-?6)BXaes&E{=-#;OqSc3_Q-| zZm;DA(-MqSj@{0NpU6l0X>Ix(+*Qgoh!Ikvp_OmB`1n7@h?8fEyks02=Rja;*AQD%WI*$i&}<2P*ut97{h3$`!2G?Yvl9bjz!fSG&k~05qLkizj26nAn7Y!hybf3mf6Sp0k`KJ6f0)SW4LT_mHQ-mCgl(-a)o|q-AnL z2Dcw<F`m=Wf|9hEuuyu_E#nr=`PYc3F^kZfH~p7NOf{Sl(WuSosjoX68wynsToljc~JwC`d; z$kxVEg>3}w4{sEE{NjfSIjbA2h@V|5Ysyyh#NazS!yjx@Arm&^R$tuzlckU>AeMu( zy`+DxP~*PDv0eTe;<|AiM&Y1 zH%I6ChdRLOGuGGqBxq~=@N+@@Gj1j;+72Z;Z12wcxH0UD==0dmnE(B5;!JaE^K`bB zh<}k^zt|-Ae%wdDSW+z-F2Y=60w4(4#wr~K#M;_|ymlqYZxmh%Ztu(Aw_ghqcntKb z@srV#(-?RhBa(S37Gc?%gY3n-oz}z3Y2p6Yg&7z~=Hu+xsoX}WCV={KOJHL$s7X2g zzvcj`4w3z6;P5vd;>9Hv{rcCa^ytlpTXY}Vunpz3x~}hfbjcOnoKj|s99N@1+tqfx zA5-p6)A_cO%j6SNS+eMI1ikXhbA*;>rU_3%Mq<;vTKei0sv!mmDqr zkWZLd{WO&;c9G#Jj(UaHU9_jjg6xz6OeA7ML?+d8uIT;Bm5jw8+xsdsBJ8mlFALbP-~Z-ZDxy4U!#-XhUMrs2AClllKf*B?H7@UdfX3Wb7+&!1hdPEKt7 zt+vs>U>~YzfoAr;jx=XZI`nn!-@8WJq)nJHr)}$yzfzjgHCvnkJfSk=o43tFFO?fZ zCHiGD{!*n5mZO*5j0r7F-XkNu#g{FRW<&IQf%=`6oJTz3^73Pk`l+L zQj=i9LQO_eDhXjI;^>cC^kks2SkbsD$I79a$x4g(B)>pC|*1%-0jKt|N zvf!E&fR>_@u$0yIu4mpzKrHZX@`-DX`G$s2D^vc?$hT&72+S>-^m8Otfh8@f*BnZ9 z49qO?W6XytRt^Z8&}i$0Qq1rNP~R=L2BL2X^jIOPkXWSN88G!J{hETsbmRc;UwHw! 
zGZGVjI#@$^(aAUpvEO`4j^FbnY903`#FlWKmw<>nhqHWrx~Biu<*3-5Qm6ok{l~CH zIX&RwKRiM%-u)t_5oCIc?5l&UNqx@`X4`Td&f-v z+u5r@h4Ai}n3UqlI<<-WuTOWH`G(6)D87KvqX+PNDwePLaKRz6KG%YS*|=#}rFX^Q z2{R2vK5K0z`AiSL9C6;hKWQC9$foTveDUL{z>$iIQmJXN*`N{hilg|Zh|xMo1RLz< z9P3G!eUn1nH8YY+g{}!{*CbS~lkzz3Fj4HEe;E6{Z{LLmwv^^>*r9p-?WU(rYFXd= zF^x2-PF|s@1U+@xfW#)^d$UC0joh= zWfv=r1a$r~Be@KP)=P_Dd>ilU9)!UK4LHr@W%9dEac|-i;#O5-I@T=--rOjiU!7xj z3C9Dd_2^Noo&eS4E{qYBcxe19NwsjWI(lCx%briMAeIp$aIG;;CRTWRby!3R4;Qh$ zyDkIj++i_wwP$hbDpKb z%W`D4y)xI+)mswMw?km845Y_Q9iN*W!fC{vZ>_!$hx)Za6XC@OCgxLDrpG$(VlW*2HMAItK8*FUst27>8pi54@ z>4TI!!;_mnt^8(%1M2vZ#})Goi92OC_qEufFK8FVmTEf5^=F4LeJn@i6$o6Q|WT|x+n7mlE!Q)d?=SH9q1RJ-*`g(s=@)DJ1Q1|xKvr}^PeMh8c z*xgf-WsC+%pQ{xa1F7O9oYbLJW-IC%W$vq0`Z30Y2fjx9SKk46O?g%f@msWS+|U7m zGGJd-r3An(wsz9c*-qN~#?_sA>w%r2XbL@xnB!OOch{r-9c@%>uXT(P62|SlP5OTm z0QpeTgUq#7&E))#{tzwZqwZzN_*Ae44n84G;;H9{)>zV4*F$xm`~BG(JXhYiA0Ke4 z*j+kv$bUyTL%N;jJe9R;#)*V`nbEp;viyn~-W>~{n85~XNm?B3k9eL8K^;OdGc?1n4M6X83*T9@x`wnISgm3^l=nDRe@EOd; z*Z&O7DoIA@SXcUM~2bPJDK+ z&XfJfg;sFy{(duXYN~*i?5jWM^1dw`d-zR4&sY%`7iJ)Ruq1)S=c;61ZZ4V z^OadAn1;yTzDEMcl)l$_@G=KE>kQ_y;XdIA)EQoH%eoCV2?=wX>#5rJeQ;y^*y`y; zUUp+}Gsr1XE*c5M6r__)VJac~5&U3P*m|(0v;H}N4Zeyo37~^gahG6nq%TD#|HU6@ zv7dP$M@J+-v;SLlBZBI~LnOW)vK@o@0^`U28Qb4&=Z)Q|5P1`(6sxFQGo@Sk;$MqK^vNhMQOgHD8zj&`-{^C9-&T>bsabbgZ z-`(YU>tsECME|xfjZ31+M zh$VRbo`0xywFc+&-Y+u+;rAqHgAo>~4B=Ux#q*F6>%g~oJ3B)m?$ZVJ1%Dy02zUD4 z>q+b0fuuxk=~M3aM#>yc3j}b~uir}>b{@U932Uv&xhq2vXJ-E0e0g*OEIMWFBA-Rs zgzvgx;h{hCIUf{|at`TFRvAD1{xw|&f7s6vMjnWWskVWzre3b`0SB*$29nH{ge;0K zys|ryrEO!5Z+CqJD9NUkZ6|M;C;P&&P5Q!KyiLUIj-&6}PY1KkfmD&j153;qPI`Lj zOVLFN2fzD!uW?jL%n%ksbNmqf#dDW+Irxp?>Fak7Q!GesQP+Jpj;nY%>J?gFNnbl7 zGhDkvHJVL!{8=q?e;sTF?ArM6z=*qg4lnzC)2c^VBkYLW|20ze4iI(3Jj`N2o0cn* zJSn+qfgp6hT&ad|bT+6_5|Lp4oMFt>HxtcRo`UY{mYTm;1}*6<(MXs<&W{X^S2bqL zVaW$n#ZO3L6Yi=h>ih4tctRqicfOkPF|s-9Iy;6&2=16HHF9;Nd3kwb3~bm#FCo_i zoy|v(d>kpwSX$1wG&zWUQ*8TyiC5W|4MrW8afUi_s%l6S5N(}sObC`Gi=_5ajSbX@ zM#j2d`*E%jlpZhF=!wixO;Iu8#GQe_64H!hQa5`w>(Z32-nHvJeC3E1-U8lL@YGI3 z>(b}|3oH+@5#|!C4H`6>JD8FbUHo*MoGp`+o>pu-lKe-;val!L2fe#hfZKYe4zO1p zf+_gx4M&@9x=T!y%B9Z2DP^K+%?H>(6>@((d1|#sF4-@`%FJAou zF1oqNf}8(q(_x~ZQD4o;tLm|!_wllhS2ZZ*XR7W6Cr`S|*_Y#>ze7u;El&gq+#BP7c1>?OZ*fS?v6kfj!3rgETn+7DX>#DBD zQayo}9$FZ{M)psYIuqWOCIBY43xwC=T3iqh2pisXR2z(_Fo={v!21*xDrp6?8C;eo zu(-I$5}M%7XATP&Iq>rAuMRfSo3Q0+$almW!(u`#K{1tpIH%W*j#mE63jciVB}5A$ zf=D_?v`C3twh3H!Ne!RtKasC@C&Ko)>QawwZ_A0sGMjeS_2E)LE)kls-o_wUi(u|8 zsgiXcnW0~uh^!+0jVd{0pT-wd-nY>FH=ls!pVtlHRwU5HT_NDvhIBLZRWVQ9^GwQB`t56^fb7_>bps=zD;Ir6mJ1*^h(s zt4ll~U*88b>?z+p=n*B3I5YjR;28Hw7^FdQM=*AT_P0~_HS&j|(B zliX5Y*=OopPvidBc4=T7Ex+pY@rkMU3A-i8!=v$BfBjl%YkD@3Dk}(3qvA3to)>j4 z1Wc?wUUL)hIuz{YWMwfw7mn5|t*R2_KkTMDA)Dk=uO+lM@5KT4Kg)HM9!$PGDzn`- zku^I!oRc;FJsBxh#mn#qR5d?TZb~ct>EyTwD5K`5S;aTsiD{@nMdIK2N{yAVBof1W z$`Gz_%7n8MeF>p(Hi&xqGR0<*2r*X56z}T~k7X5ynskyuLIjhnR4^8m_VQ}~cM0@6 zofw&@xTt6V%{K5;Y9phdlwt))^;Sz{Q;3BOkv*Ov3P?uVC~r5}t_h|KcsBM849M(F zWbJ@MyyD{bzh?x*How0=8<>ozAeBKKqoDydBS0fM*Iw6uU1^zd8x>crxR zjnplQ+?OwY!2g<<$L@M^g&JcLD?&{Epy%PcW}RttcG*U6=qM2Ghy;>i z|258Ny<%p$PD)LdxyyzvSnE>;U9yUdY}EHCsP+oBOay*6qotzJb!j{)ah<57^jf8T zixpvEp7TCzYA{;vGTxv2@l}>czzcB`8>?L`?(;X;9rJ-tZCf#X@*N)J6Xcch%Fw6T zKwtx4ZnAoXZd9uu1=3i!xP@!~`W5>!%(;4-N1}Ai7Ig;pt@`biDNcm9ExzWDg<+qq zgKQ`jXlP-6znWO#k?8Y-o=ym@p6`Qm3-Gn>gb;m-dv>_E94+*?38420lAYD_>iAdM ze=1bSsoX=p`2!qftN-x1f*@4|s$qbng+YEz?&kosh$}v$tM0NQdN_hGY~ZI$XmQ-L z)M|ID0VZ5-4|5GisN5~PF!~CT%s(JoYS^~xp>PkGR1=%#ecaU4RDP|%KW84rs%%}P zK-l*{sKE*3(M)<)=H!cngXM*iN<(F~r8u$>9Zmm4OhS94TOBneD(c`HL}g&c#LI>Y 
z-HkM!<@b@$EHSQtkNMz8#K&b8XOYxUIuY}Rj+v#rly+(Qjd=~TX>;v-GA@q0j@tGI zJvAB=d3WuS?s^;hz{4o06*YI%(jE2r8yCb8Fy5s%9*8QzG5|>qRqUQ@1w|AyQQrf- zD9FjVs1)|@Gefa9_}b%upQj8fh>GbC9G?Q!UFH_&MRgJJplcso z&)IQ;^4@bBThjZAmWqy5)r*3km3|9>Z~AN2SEj$p;*J0J__PO`W_gf=@ENU&_t6lF zjdL83M>r93yc^JS$Y&Ro0A~T?ljki4Z(PG5{uCEK+ZckTHn$YmFTMQUU>{x5EVd*b zi5X>UjE1{?lnR`pICKE3C7t?WP!@!PpI>55aTq+@-b{bI8>Gz|{r~52X@dd^P;M zDLnpWyd<-RJ(*A(L}jye)ioK{(DgvML){M|nHitM91dY>8*LMS6hPx{?kM!9#u}oJ zya&e_=S0da*Xd1@?aKQOiQfOgnCB?>#Jz5Lg-X29%b3wT?p9qD6tc+h&MHc zY-fL;QoBxK5V$Tfb8H;&y?+qDKK;p3rR)3|WLh|B>xCgcv-*BbZS+9rZUGX0szf9% zmwBr*gp~BeWq_G!bbMSDx>s3LRP=Rj7P;W|-rnJ3+KLs379%u662-b$1p%OAY3yQ+ zwriE6KO30Ei-0F|U;oH;y&HO*sCSW#h@g-(6iE$Oz4TZB&viO|nB0q8U!9z6hUluP zYhG2o?^}-`lqkxG|AVz+XMKIWr5+buZ@O7FJxE}kJAEFz&Fgu}#VXfzF1P?;@Cu_o z`T@XUoerGGMuNT9e4o7CZ+237(fR&4<@&75)oKt|sft!^F$p++iPAIL-%_vO>!@bG z7&r?Q*+mJ1f&}>7{dTq*s3hzzqZX{F)A_sYOU2J3B6D%M9c?v{1pF2w>(ftkuUK2( z0X%*52eUUW3!!%6^z=-L$==vgZs%Y;S@m8Thp z2ZW~dO*@|wy#CyKZ{k7rH9Xw>x&s%-TB9@Rf)TGrlTt|&dlDb%=sP@eExs2hX=$)@ zbzYS--QRliZk_BEY^`q;l3qVj65%n&J>&AhSGK7n?EeS5FoO)_e-#WChhq_w(sY_~ zE>FB8wetvQzFHovN_YI7;vurQna*g}DN31dG7~&MpD5#Z`# z*7tpAkTpbDHt3Qhbr*jUMG{FTZ`W}g=X#roez!P}cM-gQ3B(zz``m}SKW$@p-w5~% zrhMoa8{2>&X9Av=P-|OMZh(W$l6Q?gh1fi6B+QsgBe*(>6QA|CG;AZe7v^U>pbjYv za?#M!Mac&5fJM2Ivwj0VE%y+t#;Da)PJ zeSQ`Hr8@Ail$PuqwYyw0CR!IN_Osu*0Wum2+p?sT+xq}w$JJzqgYj4y9m`tw-SuE4 z3JG@;B_(r{moE=kLAn6=j1Q)Z?+}T#)T|HDI{B)E(#9S|{2Njl=+6hs6?+7+uwKv) zK7lNuwfxkVcuBIF_Ftt7b?+ZO{$2Eq=RENd3C_N#m9g?h7UzkNdW8LWRuEY+X8DpZ z`-Ak#?Z{1RCk3p6fGmty@B0u+oPl@qE=Zh%|6esx%Lej+PO2 z!u!K4-yhrhPSX!lbT`A8_|altNVpG16BSY7vwC=IvH0$aT?KZ1f8M zq@0|h-5ccKV#^(}noEj=pAt+V^Ps@54I?Hxdx&tZACi4#hKz#Be#4yVp3_y7O?xTm zD+^K~7@j>)ZaRF%9}$KEs@AwohF#F0IT5Vv!d~v-WYn_!*s|5ERjhTfPHmovu{0O0 z*5Eou1nGikcqY0mM7}xHp4?Rxvu$6ci_4Sr2D50+3w4x!G5QQlWncA_aqR3!0^ja< z!q1Mb68$g0rXZ~&^$$wTo6+#&))<9mN)gt>xw86nRhBIL0CsXp-p3W6R z8*nt_2D z|7y0gq|5D}q5AA!UkoocDC@P}Qw=Y*p2zXZg+C50r(0$J_SVAC`83 zW3HX$JSp^btxsAtn>|O+DSGVbLie1@Chn)B-~G;FEHxE_5=yP(_A${!@^Kli)gm;E zR&4v(DaZM#t7cx5#Vz%kFN?l_bH=vWDd%!Jd?CsA>k{vfk;Oj`+Mwh<=yccgl=Ime zooE;>y?`|zOwtS7j#cO?g2DbBa9ox$9O|melAbYFtP#vlvGmVGqtfikOD#$Lv*VWX z9vcfypepB2}#VyJlN9Cj<*sQZwBMar&A$pW zwt<2(a|&5JY2L!X3AeUK8vP#9$c^6l%8&;vp`rLEeIO-XOa20DI5bfuxSj^-ems48 zROs-C8%)*rDOo#j>D+wrZ+BJy=D?Po3oDGvpTcl9!S~CKcP_pdBM@{$QD%Q)3};O_ zA`R>Y%}Z)n7cga9G$1Ikyp1!yK$_hW3F-lv8H(tN_gkZ}Z~FoR2N>ZZJ`euEM6oZ2 z7o>?@7i+9hc0>}GWqZSLG?QhOQW6F%i*DRDx{qzMq^2ZG1|Wd}3-Xfb(e%N8Oc$i4 zG*yZ%YxLAH2a-@KX*gvfI1W&{tsSixEggK7ETEgTl%07&Dh6HHnUOk?tiS4}H8NUm zsQbd+{)hK(>FRARAv2E{0SVDpuj+NEgZrN?QNz&E(~%BVcwUj-E$6{0DM!FS2tiV(^JSq5k_LWAg+)i#NJlB_e^zx zsPhPOVrS30inCVmt71dggWt{=+#^@xoPgpo+0ns;*$$Q!dHT zRVCIO_e(bkifhP-|CY#ak?iZ5e;c=Lslr8I@_{?|z4)^?9QEgk*@T+>|UT6A`B177oceKMlo8NhTNl&kJ!nZ4j3zk%AsK7vtj7pAreD#|R z66aE`$pF%WN?C59l@Z-AD*n!bF#&~+Gqf=YB^1odeDpYS(*@-iZd|^}!l6+F_a4t3 zuvDdLSX3AWZEp%%^|zk5bKpEJs^l=>J3_=HP$!i;!`EtY_=-@~DB$!w9ultzOTuo?ppWHIjNA8xkN2Kji~C-WORC}_*yyhOYxplbSY z9ijN&Li`AX$NVJ|Z1#HB6%LlP11$qj1;Wkclz0? 
znYA1D_db8I{f6CteFS|LR8lrc$=Y|&!=%R*sttYDjPDWBZuco6Az`K$F{gvTjh|u{ zOIRpW9TSfIe|I_m=Y1KVo~~ApploZ08Xn21EcLis8+3mQ;gVC-(qY;$v77F3C5z5c zj{3E`@>)T~C?Y~wBer`8#GkxbTX7$G*K#iNN@S5)FE0%rjd%@dD$0;*bbK z6t^(@N)W2Z2wHhqPglMaH?rba?Xi{IUYTeGN{*3!xP*_Omk*%U^X|Ux8u8v&HjVrw-25hI)W~fIm<8S8?bgI`gOr z4e{BcW41?PAGnshC&nL6KFHZ3g;RBLnJIBcVH4B9G5vapX3oOC{I@yHXociVNrZ`z z2#(nFOm|XvnIb#73HHNi%wdoxT*#Ej(Z)l-Kd*li-K}Fmj4)I{84n2sJ@HF5($$1L zBEKkGR)-B|FJlp$9&ZdhTcKQ}6QfZ>7pNp%=8D}py4Q;bup!xrEM1C{_#ijmC` z=R#bR_;bFqmW=`ouNYeI8z~4sWVEo-+tNz5ZAF`rj>zb(Ap%QVW?K589a{dYe;pqw zUz{TZBy;3eE$N3=&CrZybzb`@0O}z|hXZ1i3EW4uNm65OvHjukpYBs(Z+snHY*P~` zk%S?h;*^`^7C&_TEB%^K8A$P$-@SU9k%X0$tUQ%24rIM>7XFECL2`6 z-GzmP()GzO61a_CUi^B_h0e-cy@jR*Po-S1#)ZJ^o^q20_@RHL^0-&wYLwSJXiUa@ z-y(Yl*Scm2CMvtnULn1Z6xAdOV)t^b)TP+@TIjR;)Ex%_I^$+zC&YjKCUNF#tWW4l zW@^JAmM8HhGz|OR@njM#Qs(i#_i@`*qTd&T)Emk)Cp!(Ia0S2^c4aDu4MRUW>QUK& z(jx}^205Q>@~8#(B^W6y7_nN#)37k42OaDrl58OoK9`wO^CqykHJmLFiog|WXF88_ z71We;cxfVMoSm6@%#q%^! zq6|T`7N81)cXp4hr=m-xQC55EhPvYMC*O!JHm`L_Cthfi4JIf!$yZ1qM{9Gi_W`UL zYKr`_`okZ~naW{?Aa0l??{!s*HUx}i^V=cw?%FwxV~|EM!lNbL!& zDF<&7XpDF(xO=Ak5jde}Ti^D^#>Ps&%%3G$&SN}0D*7Y1=T*epdM|ww%s!5WhejZk zge2=At42fbh!Vte`_VN6Fb~Z!*}sF29|`sC3&v<;uy-!9(uTP{m}tmJkJytRFmrxv zQf4g<^QsC&tf}ua5-NxEMY6)HLswSs8w+ zq#b}5Tb2hJ`MG93h^zHEDyO9g9O1*D74>Trd+(?XJsp-ZQ?H^rH`Q>&UDiS+eclIc zdS-fc=ps3 zXYFXvj6VZqy79y84aYhszYkkGYEI1>aB4M9 zK1P*i*Ek9uh?3EYqXoHuX%|i}c=EvC%7pq4$~d*X1dt>kkAncQvdw;AhjH%ejbJzM zB=h|YvQ_&eH8~w4to0kDVlk^qf_}67a6BpjtZGJZO`4oz=138kE^_Py_|1-%mo2%@ zy<7y*-V?>})lZ;i`(ZpI{Wc2K84TC^@CP0fQj_U{7-;Ga*8L7BXdfR0e+ha7M2JF* zP?nkAQ%$cu(4uHmpxd3)y4oaepFF%@`~m-2!)T8b$PCQ9ro1D#ke8Ihh3E>MkA}Z^ z+?v@R$72e;1Um(vQ?GF!Phkb{uFtAJ3-kCHVt#-TMGFcCRE09(>6fI$5CpJasT#LG zT@>q4-(6P(e02=+!oR!U7Nn)&ui!2(9D{@G4jrFOxXaAMWEgCw@4t~^JFxp{KltnH zRk=P;h{iH`>y{deSwpNvoWAL6qlvVDPW~e(aTvsNkIvd}E_0rkXtUk&Qs7~bF77~g z=$vz~H!9iT{nq(_3W(elrB*-)f2}Zsmxl${0QNI=-o+4?e2Zr;G|*T`qE{}RUBjVc zogkV>Cwbipiu|QQ$ie#8AP(|eLG!v>1l4!C`T6-fU`qI;Cp!OESA?ih zT!RL!tVuF;V=&S;j(yeJi~0?<>RWVPHIJxt^B>`#J)-$OaBx?oR>{fASI|{gn2Qv zQl~87`WA!K;^K_+FrZ+1D*ne^v-FRv1!Ru1B-HJ+%SVReR;R=f2ibkl{C9ZW{AGR% zS8lqzmHDbmAY)E?FL#Sc;R}ks^dV>W%of@&5PiJPRj4m9;0x z)SOh;OmAFcA0*EIhH-JS+R+vT>`j-MJob6J;S|Q8FG~g@vMPGka))P=q!DIJjc7e(VjnQPNWRIPx*(P`x8lOHC!O@!K2ZIp}-XhlWlOnwwk1VyKT> z#!*Y%5Pe-d$jfIxrIpRsn;>t@M!dev&%q(p*ULq=BdH3jG(Z(XsvY#%s9{f7t;(Tt zJ*Dx_6`9(vCz%`1cty728GN)@zf>Q%xBzcHm{o>1>U-Lz0c8*iE~(Xz&e--;n$3J$$otmmxwBFg4}c2V zkOeC@voLX(o4=n&afAg6SrCNDoSo6sQ6k&+Tz5mE`V!_b;lH3#%FXt^=x>@n5t({bz!x7%jN$+MikQ;qK|D5RWUAC`MFI;{Z_pJb_%X05hXDO^q- z=q3vBAZc4#MJa+kftejqx#U!*?LCMzRht{j&J(-*i7YVjh1g~j>+2-4S(=wZ!cY(u zGxw6~;AiUE^|Gh5d!M<-Dfhqj4Ez+f#E8?a^QLEFUOkXfaFB_HJ{UBA2g|L6dNdH$ z&nRdDqCO9|SJR7CP7JyhIy$h4rDwSWrBz=uP z0k{64-k>MIaygF}DKqMJll2;q9Mu2E;^q&#%6*pLuNbf;Rvdu;8;Ip)lISV(2)DRc z{Ko4ILERI@+$oDYPhhV35sVWP0NibpaKz6|=l0 z`dgm0!3IcZvYVpUURPM%4fxd>A<&~GJE*GZcR`(7(Y(Tf&SfO&Dcj)M;&(#XVPtJE zRk}DoS5_ds6zK|Fx~@PDkbl+Hy$+eKgjOBkTY@=Hg7hM~HxudYM93sfZWBSUN;hBB zssyfW2Y342zecLRsXVrRbB0-O)H(x?cKhtmt!&;zSs{h#0S>b&BY)aE&j@BkVR8E% z>Hrh2*wEv6D7eVhc4tMrZOk}*Isb3T3zyH3$LH1X9kE9#x=>xczBB*0%@PBcDVGJr zRdyFWI=qW-u1V-7Jucmett~L_c~L`TB{bLP0`SM78Ui4wzT_m7M*xVCwuyh>|2vT^ zwD?UGgb%{Q2`7vvkK>8dn6Oc+ zoA|4zN^_*#TU}wZ_Srz7F_c2@gI+181BD2bXypVj%SeY~caF)cvY zBq{yqxh@>!?-$VZH>RE6MwaMqVZwcp67K-+K$}~gr>Ue&CC4E@>TLJsQ2+ng|5eWC z1stTDj|!|_vG6IZ|8vc2_2)Mq(-LBqf0b0qwf?&1(~?XM0e7xV-%kS1!dvkoc}?7L zsYhHDB?;G7g3iF%>nLzT;+}0pT>Qetrp%6qr>66_6*|gT&T9P5x2MV=TH@SKV5zR{ zZ})Ra+~j*#*rADx1#}x&Pax;}_RIOTPHyov`=(5_m}bsku{$Svr(@iX7cQE*dM3wS zJ^UQ^i7ny6;muEhv3Ygn9>Y5?TE*ioNG{&v$+GaqW2u<(j^E51`8O|B+$WGG7rnpk 
zZ=ik?v`fB6xXbn|N1JA%*T;9t>K;3l!@QjjeAw{U+UzvHzMT$7iqH9$U5y9jWn^@K zhpGLNZQ^hHbYb)H-y4tG`8f8U2aZ%|UVp2Y4sEkP$^|+(OIYksfYb2apMf)}@?Yh2EZ$rEXLvr>33?0Gk=dE0jI;L6y~Sj8 zU0rsaw!uoJXh;qu=1H#fU8nPiq1t_|b)Y;CxSLAs@7C0EQxU?(j41_+uZ!_g!jEh^!|A5O})!xvX [!WARNING] Work in Progress -> This benchmarking suite is a current work in progress and is prone to large changes. +```{important} +This benchmarking suite is a work in progress. +Expect breaking API changes. +``` -TensorRT-LLM provides a packaged benchmarking utility that is accessible via the `trtllm-bench` CLI tool. +TensorRT-LLM provides the `trtllm-bench` CLI, a packaged benchmarking utility. #### Supported Networks for Benchmarking -- [`tiiuae/falcon-180B`](https://huggingface.co/tiiuae/falcon-180B) -- [`meta-llama/Llama-2-7b-hf`](https://huggingface.co/meta-llama/Llama-2-7b-hf) -- [`meta-llama/Llama-2-70b-hf`](https://huggingface.co/meta-llama/Llama-2-70b-hf) -- [`meta-llama/Meta-Llama-3-8B`](https://huggingface.co/meta-llama/Meta-Llama-3-8B) -- [`meta-llama/Meta-Llama-3-70B`](https://huggingface.co/meta-llama/Meta-Llama-3-70B) -- [`EleutherAI/gpt-j-6b`](https://huggingface.co/EleutherAI/gpt-j-6b) -- [`mistralai/Mistral-7B-v0.1`](https://huggingface.co/mistralai/Mistral-7B-v0.1) -- [`mistralai/Mixtral-8x7B-v0.1`](https://huggingface.co/mistralai/Mixtral-8x7B-v0.1) +- [meta-llama/Llama-2-7b-hf](https://huggingface.co/meta-llama/Llama-2-7b-hf) +- [meta-llama/Llama-2-70b-hf](https://huggingface.co/meta-llama/Llama-2-70b-hf) +- [tiiuae/falcon-180B](https://huggingface.co/tiiuae/falcon-180B) +- [EleutherAI/gpt-j-6b](https://huggingface.co/EleutherAI/gpt-j-6b) +- [meta-llama/Meta-Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B) +- [meta-llama/Llama-3.1-8B](https://huggingface.co/meta-llama/Llama-3.1-8B) +- [meta-llama/Meta-Llama-3-70B](https://huggingface.co/meta-llama/Meta-Llama-3-70B) +- [meta-llama/Llama-3.1-70B](https://huggingface.co/meta-llama/Llama-3.1-70B) +- [meta-llama/Llama-3.1-405B](https://huggingface.co/meta-llama/Llama-3.1-405B) +- [mistralai/Mixtral-8x7B-v0.1](https://huggingface.co/mistralai/Mixtral-8x7B-v0.1) +- [mistralai/Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1) +- [meta-llama/Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct) +- [meta-llama/Llama-3.1-70B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct) +- [meta-llama/Llama-3.1-405B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct) +- [mistralai/Mixtral-8x7B-v0.1-Instruct](https://huggingface.co/mistralai/Mixtral-8x7B-v0.1-Instruct) #### Support Quantization Modes -TensorRT-LLM supports a number of quanization modes. For more information about quantization, see the -[documentation](https://nvidia.github.io/TensorRT-LLM/precision.html). +TensorRT-LLM supports a number of quantization modes: - None (no quantization applied) - W8A16 @@ -31,7 +41,8 @@ TensorRT-LLM supports a number of quanization modes. For more information about - FP8 - INT8 -> [!NOTE] Please see the supported quantization methods for each network [here](https://nvidia.github.io/TensorRT-LLM/precision.html#support-matrix) +For more information about quantization, refer to [](../reference/precision.md) and +the [support matrix](../reference/precision.md#support-matrix) of the supported quantization methods for each network. ## Inflight Benchmarking with a Dataset @@ -41,9 +52,10 @@ This section covers how to benchmark TensorRT-LLM using inflight batching. 
### Quickstart

-For this quick start guide, we will focus on running a short max throughput benchmark on
-`meta-llama/Llama-2-7b-hf` on a syntehtic dataset with a uniform distribution of prompts with ISL:OSL
-of 128:128. In order to run the benchmark from start to finish simply run the following commands:
+This quick start focuses on running a short max throughput benchmark on
+`meta-llama/Llama-2-7b-hf` on a synthetic dataset with a uniform distribution of prompts with ISL:OSL
+of 128:128.
+To run the benchmark from start to finish, run the following commands:

```shell
python benchmarks/cpp/prepare_dataset.py --stdout --tokenizer meta-llama/Llama-2-7b-hf token-norm-dist --input-mean 128 --output-mean 128 --input-stdev 0 --output-stdev 0 --num-requests 3000 > /tmp/synthetic_128_128.txt
trtllm-bench --model meta-llama/Llama-2-7b-hf build --dataset /tmp/synthetic_128_128.txt
trtllm-bench --model meta-llama/Llama-2-7b-hf throughput --dataset /tmp/synthetic_128_128.txt --engine_dir /tmp/meta-llama/Llama-2-7b-hf/tp_1_pp_1
```

-And that's it! Once the benchmark completes, a summary will be printed with summary metrics.
+And that's it!
+After the benchmark completes, `trtllm-bench` prints a summary of the benchmark metrics.

```shell
===========================================================
@@ -101,35 +114,38 @@ The workflow for `trtllm-bench` is composed of the following steps:

The inflight benchmark utilizes a fixed JSON schema so that it is simple and straightforward to specify
requests. The schema is defined as follows:

-| Key | Required | Type | Description |
-| :- | :-: | :-: | :- |
-| `task_id`| Y | String | Unique identifier for the request. |
-| `prompt` | N* | String | Input text for a generation request. |
-| `logits` | N* | List[Integer] | List of logits that make up the request prompt. |
-| `output_tokens` | Y | Integer | Number of generated tokens for this request. |
+| Key             | Required | Type          | Description                                      |
+| :-------------- | :------: | :-----------: | :----------------------------------------------- |
+| `task_id`       | Y        | String        | Unique identifier for the request.               |
+| `prompt`        | N*       | String        | Input text for a generation request.             |
+| `logits`        | N*       | List[Integer] | List of logits that make up the request prompt.  |
+| `output_tokens` | Y        | Integer       | Number of generated tokens for this request.     |

-> [!NOTE] Prompt and logits are mutually exclusive*
-> While having both `prompt` and `logits` is not required, at least one is required.
-> If `logits` are specified, the `prompt` entry is ignored for request generation.
+Prompt and logits are mutually exclusive, but one of `prompt` or `logits` is required.
+If you specify `logits`, the `prompt` entry is ignored for request generation.

-Examples of valid entries for the inflight benchmark are:
+Refer to the following examples of valid entries for the inflight benchmark:

- Entries with a human-readable prompt and no logits.
-```json
-{"task_id": 1, "prompt": "Generate an infinite response to the following: This is the song that never ends, it goes on and on my friend.", "output_tokens": 1000}
-{"task_id": 2, "prompt": "Generate an infinite response to the following: Na, na, na, na", "output_tokens": 1000}
-```
+
+  ```json
+  {"task_id": 1, "prompt": "Generate an infinite response to the following: This is the song that never ends, it goes on and on my friend.", "output_tokens": 1000}
+  {"task_id": 2, "prompt": "Generate an infinite response to the following: Na, na, na, na", "output_tokens": 1000}
+  ```

- Entries which contain logits.

-```json
-{"task_id":0,"logits":[863,22056,25603,11943,8932,13195,3132,25032,21747,22213],"output_tokens":128}
-{"task_id":1,"logits":[14480,13598,15585,6591,1252,8259,30990,26778,7063,30065,21764,11023,1418],"output_tokens":128}
-```
-> [!INFO] A whole entry is on a line!
-> To make the passing of data simpler, a complete JSON entry is on each line so that the benchmarker
-> can simply read a line and assume a complete entry. When creating a dataset, be sure that a complete
-> JSON entry is on every line.
+
+  ```json
+  {"task_id":0,"logits":[863,22056,25603,11943,8932,13195,3132,25032,21747,22213],"output_tokens":128}
+  {"task_id":1,"logits":[14480,13598,15585,6591,1252,8259,30990,26778,7063,30065,21764,11023,1418],"output_tokens":128}
+  ```
+
+```{tip}
+Specify each entry on one line.
+To simplify passing the data, a complete JSON entry is on each line so that the benchmarker
+can simply read a line and assume a complete entry. When creating a dataset, be sure that a complete
+JSON entry is on every line.
+```

#### Using `prepare_dataset` to Create Synthetic Datasets

@@ -162,12 +178,12 @@ trtllm-bench --model meta-llama/Llama-2-7b-hf build --max_seq_len 256 --quantiza

> [!NOTE] `trtllm-bench build` reproduces benchmark engines for performance study. These engine configurations are not guaranteed to be optimal for all cases and should be viewed as reproducers
-for the benchmark data we provide on our [Performance Overview](../docs/source/performance/perf-overview.md).
+for the benchmark data we provide on our [Performance Overview](./perf-overview.md).

Looking a little closer, the `build` sub-command will perform a lookup and build an engine using those
reference settings. The lookup table directly corresponds to the performance table found in our
-[Performance Overview](../docs/source/performance/perf-overview.md#throughput-measurements). The
+[Performance Overview](./perf-overview.md#throughput-measurements). The
output of the `build` sub-command looks similar to the snippet below (for `meta-llama/Llama-2-7b-hf`):

```shell
@@ -236,16 +252,17 @@ upper bound throughput number.

#### How the Benchmarker Works

-The benchmarker will read in a data file or standard input (stdin) as a stream where a single line contains
-a complete JSON request entry. The process that the benchmarker is as follows:
+The benchmarker reads a data file where a single line contains
+a complete JSON request entry as specified in [](#preparing-a-dataset).
+The benchmarker follows this process:

1. Iterate over all input requests. If `logits` is specified, construct the request using the specified
   list of logits. Otherwise, tokenize the `prompt` as specified by `--model $HF_MODEL_NAME`.
-3. Submit the dataset to the TensorRT-LLM `Executor` API at as fast of a rate as possible (offline mode).
-4. Wait for all requests to return, compute statistics, then report out results.
+1. Submit the dataset to the TensorRT-LLM `Executor` API as fast as possible (offline mode).
+1. Wait for all requests to return, compute statistics, and then report results.

-To run the benchmarker, run the following with the [engine](#building-a-benchmark-engine) and
-[dataset](#preparing-a-dataset) generated above:
+To run the benchmarker, run the following commands with the [engine](#building-a-benchmark-engine) and
+[dataset](#preparing-a-dataset) generated from previous steps:

```shell
trtllm-bench --model meta-llama/Llama-2-7b-hf throughput --dataset /tmp/synthetic_128_128.txt --engine_dir /tmp/meta-llama/Llama-2-7b-hf/tp_1_pp_1
```

@@ -316,16 +333,160 @@ Total Latency (seconds): 20.331645167
[TensorRT-LLM][INFO] Refreshed the MPI local session
```

+## Low Latency Benchmark
+
+The low latency benchmark follows a similar workflow to the [throughput benchmark](#running-a-max-throughput-benchmark)
+but requires building the engine separately from `trtllm-bench`. The low latency benchmark has the following modes:
+
+- A single-request low-latency engine
+- A Medusa-enabled speculative-decoding engine
+
+### Low Latency TensorRT-LLM Engine for Llama-3 70B
+
+To build a low-latency engine for the latency benchmark, run the following quantize and build commands.
+The `$checkpoint_dir` is the path to the [meta-llama/Meta-Llama-3-70B](https://huggingface.co/meta-llama/Meta-Llama-3-70B) Hugging Face checkpoint in your cache or downloaded to a specific location with the [huggingface-cli](https://huggingface.co/docs/huggingface_hub/en/guides/cli).
+To prepare a dataset, follow the same process as specified in [](#preparing-a-dataset).
+
+#### Benchmarking a non-Medusa Low Latency Engine
+
+To quantize the checkpoint:
+
+```shell
+cd tensorrt_llm/examples/llama
+python ../quantization/quantize.py \
+    --model_dir $checkpoint_dir \
+    --dtype bfloat16 \
+    --qformat fp8 \
+    --kv_cache_dtype fp8 \
+    --output_dir /tmp/meta-llama/Meta-Llama-3-70B/checkpoint \
+    --calib_size 512 \
+    --tp_size $tp_size
+```
+
+Then build the engine:
+
+```shell
+trtllm-build \
+    --checkpoint_dir /tmp/meta-llama/Meta-Llama-3-70B/checkpoint \
+    --use_fused_mlp enable \
+    --gpt_attention_plugin bfloat16 \
+    --output_dir /tmp/meta-llama/Meta-Llama-3-70B/engine \
+    --max_batch_size 1 \
+    --max_seq_len $(($isl+$osl)) \
+    --reduce_fusion enable \
+    --gemm_plugin fp8 \
+    --workers $tp_size \
+    --use_fp8_context_fmha enable \
+    --max_num_tokens $isl \
+    --use_paged_context_fmha disable \
+    --multiple_profiles enable
+```
+
+After the engine is built, run the low-latency benchmark:
+
+```shell
+env TRTLLM_ENABLE_MMHA_MULTI_BLOCK_DEBUG=1 \
+  TRTLLM_MMHA_KERNEL_BLOCK_SIZE=256 \
+  TRTLLM_MMHA_BLOCKS_PER_SEQUENCE=32 \
+  FORCE_MULTI_BLOCK_MODE=ON \
+  TRTLLM_ENABLE_FDL=1 \
+  trtllm-bench --model meta-llama/Meta-Llama-3-70B \
+  latency \
+  --dataset $DATASET_PATH \
+  --engine_dir /tmp/meta-llama/Meta-Llama-3-70B/engine
+```
+
+#### Building a Medusa Low-Latency Engine
+
+Building a Medusa-enabled engine requires checkpoints that contain Medusa heads.
+NVIDIA provides TensorRT-LLM checkpoints on the [NVIDIA](https://huggingface.co/nvidia) page on Hugging Face.
+The checkpoints are pre-quantized and can be directly built after downloading them with the
+[huggingface-cli](https://huggingface.co/docs/huggingface_hub/en/guides/cli).
+After you download the checkpoints, run the build command that follows the download sketch below. Make sure to
+specify the `$tp_size` supported by your Medusa checkpoint and the path to its stored location `$checkpoint_dir`.
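+
+As a rough illustration of the download step, the following Python sketch uses the `huggingface_hub` package
+(an assumption; the `huggingface-cli` command mentioned above works just as well). The repository id is a
+placeholder; substitute the actual NVIDIA-provided Medusa checkpoint that matches your target model.
+
+```python
+# Sketch only: fetch a pre-quantized Medusa checkpoint before running trtllm-build.
+# "nvidia/example-medusa-checkpoint" is a placeholder repository id, not a real repo.
+from huggingface_hub import snapshot_download
+
+checkpoint_dir = snapshot_download(
+    repo_id="nvidia/example-medusa-checkpoint",  # placeholder; pick a real Medusa checkpoint repo
+    local_dir="/tmp/medusa_checkpoint",
+)
+print(f"Medusa checkpoint downloaded to {checkpoint_dir}")
+```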
+
+Using Llama-3.1 70B as an example, with tensor parallelism 8 and a bfloat16 dtype:
+
+```shell
+tp_size=8
+trtllm-build --checkpoint_dir $checkpoint_dir \
+    --speculative_decoding_mode medusa \
+    --max_batch_size 1 \
+    --gpt_attention_plugin bfloat16 \
+    --output_dir /tmp/meta-llama/Meta-Llama-3.1-70B/medusa/engine \
+    --use_fused_mlp enable \
+    --paged_kv_cache enable \
+    --use_paged_context_fmha disable \
+    --multiple_profiles enable \
+    --reduce_fusion enable \
+    --use_fp8_context_fmha enable \
+    --workers $tp_size \
+    --low_latency_gemm_plugin fp8
+```
+
+After the engine is built, you need to define the Medusa choices.
+The choices are specified with a YAML file like the following example (`medusa.yaml`):
+
+```yaml
+- [0]
+- [0, 0]
+- [1]
+- [0, 1]
+- [2]
+- [0, 0, 0]
+- [1, 0]
+- [0, 2]
+- [3]
+- [0, 3]
+- [4]
+- [0, 4]
+- [2, 0]
+- [0, 5]
+- [0, 0, 1]
+```
+
+To run the Medusa-enabled engine, run the following command:
+
+```shell
+env TRTLLM_ENABLE_PDL=1 \
+  UB_ONESHOT=1 \
+  UB_TP_SIZE=$tp_size \
+  TRTLLM_PDL_OVERLAP_RATIO=0.15 \
+  TRTLLM_PREFETCH_RATIO=-1 \
+  trtllm-bench --model meta-llama/Meta-Llama-3.1-70B \
+  latency \
+  --dataset $DATASET_PATH \
+  --engine_dir /tmp/meta-llama/Meta-Llama-3.1-70B/medusa/engine \
+  --medusa_choices medusa.yaml
+```
+
## Summary

-In summary, the general process for reproducing a benchmark point is as follows:
+The following table summarizes the commands needed for running benchmarks:

-- Prepare a dataset: `python benchmarks/cpp/prepare_dataset.py --stdout --tokenizer $HF_MODEL token-norm-dist --input-mean $ISL --output-mean $OSL --input-stdev 0 --output-stdev 0 --num-requests $NUM_REQUESTS > $DATASET_PATH`
-- Build engine: `trtllm-bench --model $HF_MODEL build --dataset $DATASET_PATH`
-- Benchmark engine: trtllm-bench --model $HF_MODEL throughput --dataset $DATASET_PATH --engine_dir $ENGINE_DIR`
+| Scenario | Phase | Command |
+| - | - | - |
+| Dataset | Preparation | `python benchmarks/cpp/prepare_dataset.py --stdout --tokenizer $HF_MODEL token-norm-dist --input-mean $ISL --output-mean $OSL --input-stdev 0 --output-stdev 0 --num-requests $NUM_REQUESTS > $DATASET_PATH` |
+| Throughput | Build | `trtllm-bench --model $HF_MODEL build --dataset $DATASET_PATH` |
+| Throughput | Benchmark | `trtllm-bench --model $HF_MODEL throughput --dataset $DATASET_PATH --engine_dir $ENGINE_DIR` |
+| Latency | Build | See [section about building low latency engines](#low-latency-tensorrt-llm-engine-for-llama-3-70b) |
+| Non-Medusa Latency | Benchmark | `trtllm-bench --model $HF_MODEL latency --dataset $DATASET_PATH --engine_dir $ENGINE_DIR` |
+| Medusa Latency | Benchmark | `trtllm-bench --model $HF_MODEL latency --dataset $DATASET_PATH --engine_dir $ENGINE_DIR --medusa_choices $MEDUSA_CHOICES` |

where,

-- `$HF_MODEL` is the Huggingface name of a model.
-- `$NUM_REQUESTS` is the number of requests to generate.
-- `$DATASET_PATH` is the path where the dataset was written when preparing the dataset.
-- `$ENGINE_DIR` the engine directory as printed by `trtllm-bench build`.
+
+`$HF_MODEL`
+: The Hugging Face name of a model.
+
+`$NUM_REQUESTS`
+: The number of requests to generate.
+
+`$DATASET_PATH`
+: The path where the dataset was written when preparing the dataset.
+
+`$ENGINE_DIR`
+: The engine directory as printed by `trtllm-bench build`.
+
+`$MEDUSA_CHOICES`
+: A YAML config representing the Medusa tree for the benchmark.
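+
+For readers who prefer to drive these steps from a script, the following Python sketch chains the
+dataset-preparation, build, and throughput commands from the table above with `subprocess`. It is an
+illustration only: the model name, request counts, and engine path are example values taken from the
+quickstart earlier in this document, and it assumes `trtllm-bench` and `benchmarks/cpp/prepare_dataset.py`
+are available in the current environment.
+
+```python
+# Sketch: run the throughput-scenario commands from the summary table end to end.
+import subprocess
+
+hf_model = "meta-llama/Llama-2-7b-hf"          # $HF_MODEL
+isl, osl, num_requests = 128, 128, 3000        # $ISL, $OSL, $NUM_REQUESTS
+dataset_path = "/tmp/synthetic_128_128.txt"    # $DATASET_PATH
+engine_dir = "/tmp/meta-llama/Llama-2-7b-hf/tp_1_pp_1"  # $ENGINE_DIR, printed by `trtllm-bench build`
+
+# 1. Prepare a synthetic dataset (stdout is redirected into the dataset file).
+with open(dataset_path, "w") as dataset_file:
+    subprocess.run(
+        ["python", "benchmarks/cpp/prepare_dataset.py", "--stdout",
+         "--tokenizer", hf_model, "token-norm-dist",
+         "--input-mean", str(isl), "--output-mean", str(osl),
+         "--input-stdev", "0", "--output-stdev", "0",
+         "--num-requests", str(num_requests)],
+        stdout=dataset_file, check=True)
+
+# 2. Build the benchmark engine.
+subprocess.run(["trtllm-bench", "--model", hf_model, "build", "--dataset", dataset_path], check=True)
+
+# 3. Run the max throughput benchmark against the built engine.
+subprocess.run(["trtllm-bench", "--model", hf_model, "throughput",
+                "--dataset", dataset_path, "--engine_dir", engine_dir], check=True)
+```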
diff --git a/docs/source/performance/perf-overview.md b/docs/source/performance/perf-overview.md index b4c0646a8..1dcf759fc 100644 --- a/docs/source/performance/perf-overview.md +++ b/docs/source/performance/perf-overview.md @@ -34,191 +34,145 @@ and shows the throughput client-server scenario under maximum load. The performance numbers below were collected using the steps described in this document. -**All data in the table below was generated using version 0.13.0 and presents token throughput in tokens/second.** - -| | | | | | | | | | -| --------------- | ------------------------ | ------------- | ------------------- | --------------- | ------------------ | ------------------ | ------------------ | -------- | -| | | **GPU** | **H200 141GB HBM3** | **GH200 120GB** | **H100 80GB HBM3** | **H100 80GB HBM3** | **A100-SXM4-80GB** | **L40S** | -| | | **Precision** | **FP8** | **FP8** | **FP8** | **FP16** | **FP16** | **FP8** | -| **Model** | **Input/Output Lengths** | **TP** | | | | | | | -| GPTJ 6B | 128/128 | 1 | 24,533.54 | 22,368.50 | 24,318.61 | 12,936.63 | 5,964.19 | 7,688.44 | -| | 128/2048 | 1 | 8,375.67 | 6,588.73 | 7,829.91 | 3,931.61 | 2,215.88 | 1,842.82 | -| | 128/4096 | 1 | 5,048.59 | 3,662.81 | 3,955.28 | 2,041.06 | 1,118.12 | 980.23 | -| | 2048/128 | 1 | 2,770.27 | 2,520.37 | 2,698.08 | 1,479.48 | 650.09 | 746.54 | -| | 5000/500 | 1 | 1,791.39 | 1,449.23 | 1,623.17 | 818.80 | 436.85 | 413.33 | -| | 500/2000 | 1 | 6,770.60 | 5,565.62 | 6,149.65 | 3,030.03 | 1,673.05 | 1,538.45 | -| | 1000/1000 | 1 | 6,465.73 | 5,580.37 | 6,078.80 | 2,797.48 | 1,673.45 | 1,531.57 | -| | 2048/2048 | 1 | 3,637.42 | 2,998.01 | 3,060.80 | 1,285.08 | 845.83 | 753.55 | -| LLaMA v3.1 8B | 128/128 | 1 | 28,125.59 | 26,045.60 | 27,147.22 | 15,647.83 | 6,687.04 | 8,548.90 | -| | 128/2048 | 1 | 22,989.20 | 16,497.79 | 19,221.02 | 8,882.95 | 4,918.53 | 4,988.61 | -| | 128/4096 | 1 | 16,077.62 | 9,637.91 | 11,856.11 | 5,462.96 | 3,054.46 | 2,768.91 | -| | 2048/128 | 1 | 3,625.83 | 3,357.60 | 3,497.30 | 1,859.37 | 796.17 | 1,000.90 | -| | 5000/500 | 1 | 3,823.76 | 3,217.40 | 3,276.69 | 1,687.74 | 788.66 | 872.14 | -| | 500/2000 | 1 | 19,382.37 | 15,128.77 | 13,996.05 | 6,834.76 | 3,929.83 | 3,911.14 | -| | 1000/1000 | 1 | 16,435.21 | 12,355.41 | 13,411.43 | 7,160.92 | 3,592.16 | 3,648.21 | -| | 2048/2048 | 1 | 11,072.97 | 7,850.75 | 8,851.23 | 4,152.21 | 2,269.78 | 2,055.78 | -| | 20000/2000 | 1 | 1,634.98 | 1,200.89 | 1,278.04 | 595.89 | 316.43 | 263.75 | -| LLaMA v3 8B | 128/128 | 1 | 27,940.47 | 26,117.13 | 27,156.81 | 15,489.11 | 6,656.98 | 8,734.57 | -| | 128/2048 | 1 | 23,228.98 | 16,417.04 | 19,209.17 | 8,901.43 | 4,967.37 | 5,004.93 | -| | 128/4096 | 1 | 15,980.94 | 9,351.95 | 11,889.67 | 5,455.91 | 3,053.27 | 2,768.15 | -| | 2048/128 | 1 | 3,631.45 | 3,339.90 | 3,476.37 | 1,918.56 | 796.28 | 1,050.68 | -| | 5000/500 | 1 | 3,836.98 | 3,186.22 | 3,279.24 | 1,668.42 | 792.95 | 860.31 | -| | 500/2000 | 1 | 19,725.45 | 15,241.74 | 14,218.30 | 6,816.62 | 3,899.64 | 3,990.73 | -| | 1000/1000 | 1 | 16,201.60 | 12,049.81 | 13,371.60 | 7,041.47 | 3,617.10 | 3,679.10 | -| | 2048/2048 | 1 | 11,097.69 | 7,255.55 | 8,852.87 | 4,251.45 | 2,269.68 | 2,048.94 | -| LLaMA v2 7B | 128/128 | 1 | 19,549.13 | 17,823.45 | 19,298.99 | 11,436.31 | 5,238.68 | 6,396.62 | -| | 128/2048 | 1 | 7,675.14 | 5,438.53 | 6,607.33 | 2,985.61 | 1,807.39 | 1,566.03 | -| | 128/4096 | 1 | 4,397.83 | 3,310.09 | 3,628.46 | 1,575.35 | 957.24 | 821.83 | -| | 2048/128 | 1 | 2,392.31 | 2,064.18 | 2,304.02 | 1,157.55 | 560.35 | 619.83 | -| | 
5000/500 | 1 | 1,570.37 | 1,250.11 | 1,419.09 | 624.75 | 366.39 | 347.03 | -| | 500/2000 | 1 | 6,044.15 | 4,717.51 | 5,188.69 | 2,382.75 | 1,408.58 | 1,231.78 | -| | 1000/1000 | 1 | 5,896.10 | 4,825.24 | 5,208.97 | 2,462.65 | 1,431.92 | 1,277.79 | -| | 2048/2048 | 1 | 3,193.42 | 2,693.21 | 2,792.53 | 1,263.11 | 734.38 | 641.47 | -| Mistral 7B | 128/128 | 1 | 30,152.19 | 27,738.08 | 29,672.75 | 16,711.12 | 6,863.59 | 9,676.88 | -| | 128/2048 | 1 | 24,742.09 | 17,528.14 | 20,318.60 | 9,774.11 | 5,321.44 | 5,437.25 | -| | 128/4096 | 1 | 16,905.49 | 10,671.38 | 12,715.46 | 5,740.41 | 3,257.23 | 2,941.08 | -| | 2048/128 | 1 | 3,676.37 | 3,369.77 | 3,502.83 | 1,893.42 | 796.00 | 996.65 | -| | 5000/500 | 1 | 3,890.07 | 3,401.45 | 3,358.65 | 1,740.69 | 807.07 | 904.45 | -| | 500/2000 | 1 | 20,788.70 | 15,035.59 | 15,962.94 | 7,494.80 | 4,168.89 | 4,088.52 | -| | 1000/1000 | 1 | 17,620.46 | 13,362.84 | 14,213.48 | 7,281.07 | 3,794.31 | 3,972.63 | -| | 2048/2048 | 1 | 11,747.88 | 8,599.03 | 9,200.19 | 4,349.39 | 2,320.50 | 2,170.16 | -| | 20000/2000 | 1 | 1,693.41 | 1,271.85 | 1,299.05 | 609.91 | 324.52 | 276.19 | -| LLaMA v3.1 405B | 128/128 | 8 | 3,734.50 | | | | | | -| | 128/2048 | 8 | 3,039.70 | | | | | | -| | 128/4096 | 8 | 3,144.97 | | | | | | -| | 2048/128 | 8 | 454.17 | | | | | | -| | 5000/500 | 8 | 459.91 | | | | | | -| | 500/2000 | 8 | 2,967.98 | | | | | | -| | 1000/1000 | 8 | 2,259.32 | | | | | | -| | 2048/2048 | 8 | 2,067.15 | | | | | | -| | 20000/2000 | 8 | 447.67 | | | | | | -| LLaMA v3.1 70B | 128/128 | 1 | 3,923.61 | 2,998.99 | 2,168.72 | | | | -| | | 2 | 5,358.16 | 1,839.02 | 5,215.12 | 3,156.10 | 1,340.20 | | -| | | 4 | 8,969.59 | 8,655.98 | 8,677.59 | 5,845.53 | 2,426.46 | 1,434.63 | -| | | 8 | 16,449.68 | | 15,711.60 | 10,643.75 | 4,491.42 | 1,365.36 | -| | 128/2048 | 1 | 3,503.59 | 1,343.53 | 344.22 | | | | -| | | 2 | 7,068.42 | 1,146.08 | 5,654.43 | 801.82 | 498.44 | | -| | | 4 | 12,890.95 | 10,358.10 | 9,377.87 | 4,791.11 | 2,460.91 | 1,748.87 | -| | | 8 | 19,947.02 | | 15,168.97 | 6,892.18 | 4,148.33 | 1,890.62 | -| | 128/4096 | 1 | 2,314.83 | | | | | | -| | | 2 | 6,227.19 | 896.56 | 3,302.41 | 413.22 | 268.86 | | -| | | 4 | 10,059.64 | 6,628.22 | 6,501.69 | 3,056.98 | 1,660.93 | 1,180.87 | -| | | 8 | 14,393.28 | | 9,699.99 | 4,238.15 | 2,705.77 | 1,417.60 | -| | 2048/128 | 1 | 459.73 | 372.44 | 211.51 | | | | -| | | 2 | 689.30 | 280.61 | 690.05 | 323.66 | 143.39 | | -| | | 4 | 1,047.96 | 1,015.14 | 1,016.24 | 672.37 | 278.87 | 167.87 | -| | | 8 | 2,061.19 | | 1,964.49 | 1,273.97 | 539.57 | 163.91 | -| | 5000/500 | 1 | 534.79 | 283.19 | 112.21 | | | | -| | | 2 | 943.78 | 337.04 | 897.36 | 224.31 | 115.63 | | -| | | 4 | 1,437.45 | 1,383.61 | 1,329.82 | 851.12 | 361.39 | 235.90 | -| | | 8 | 2,795.95 | | 2,472.69 | 1,438.10 | 679.27 | 224.33 | -| | 500/2000 | 1 | 2,758.24 | 1,083.48 | | | | | -| | | 2 | 6,063.53 | 851.46 | 4,347.69 | 652.34 | 423.06 | | -| | | 4 | 10,061.89 | 9,090.78 | 8,378.16 | 3,441.34 | 2,072.88 | 1,436.41 | -| | | 8 | 16,139.49 | | 10,790.85 | 5,792.17 | 3,115.20 | 1,512.78 | -| | 1000/1000 | 1 | 2,539.65 | 728.79 | | | | | -| | | 2 | 4,572.03 | 1,223.92 | 3,880.41 | 737.40 | 451.82 | | -| | | 4 | 7,612.56 | 6,705.02 | 6,553.00 | 3,655.64 | 1,731.86 | 1,113.18 | -| | | 8 | 12,660.86 | | 11,121.10 | 5,599.45 | 3,013.95 | 1,120.73 | -| | 2048/2048 | 1 | 1,753.58 | 611.08 | 161.60 | | | | -| | | 2 | 3,407.26 | 626.26 | 2,432.55 | | 108.91 | | -| | | 4 | 6,565.77 | 4,864.55 | 4,948.83 | 2,396.06 | 1,220.93 | 855.44 | -| | | 8 | 9,948.56 | | 8,527.52 | 3,819.60 
| 2,103.68 | 924.89 | -| | 20000/2000 | 1 | 262.82 | 88.89 | | | | | -| | | 2 | 598.19 | 177.04 | 414.17 | | | | -| | | 4 | 1,047.27 | 958.88 | 856.31 | 375.85 | 187.42 | 140.73 | -| | | 8 | 1,793.52 | | 1,359.27 | 650.78 | 344.41 | 122.04 | -| LLaMA v3 70B | 128/128 | 1 | 3,924.02 | 3,161.73 | 2,177.84 | | | | -| | | 2 | 5,388.22 | 1,551.84 | 5,205.80 | 3,186.61 | 1,321.55 | | -| | | 4 | 8,958.95 | 8,618.55 | 8,678.68 | 5,857.16 | 2,424.68 | 1,432.46 | -| | | 8 | 16,375.41 | | 15,703.26 | 10,627.36 | 4,490.19 | 1,333.09 | -| | 128/2048 | 1 | 3,519.24 | 1,346.37 | 353.68 | | | | -| | | 2 | 7,071.54 | 862.54 | 5,878.06 | 802.98 | 512.11 | | -| | | 4 | 12,876.38 | 10,015.23 | 8,929.23 | 4,768.27 | 2,458.73 | 1,737.31 | -| | | 8 | 20,013.92 | | 15,171.91 | 6,875.97 | 3,906.35 | 1,892.41 | -| | 128/4096 | 1 | 2,310.85 | | | | | | -| | | 2 | 6,199.95 | 602.98 | 3,311.05 | 413.29 | 269.02 | | -| | | 4 | 9,633.49 | 7,370.19 | 6,489.95 | 3,053.89 | 1,677.51 | 1,199.71 | -| | | 8 | 14,552.09 | | 9,632.02 | 4,259.39 | 2,697.61 | 1,358.34 | -| | 2048/128 | 1 | 458.75 | 371.70 | 210.27 | | | | -| | | 2 | 694.00 | 277.85 | 692.74 | 321.71 | 144.61 | | -| | | 4 | 1,048.84 | 1,016.03 | 1,022.77 | 690.10 | 279.06 | 168.52 | -| | | 8 | 2,072.33 | | 1,976.76 | 1,273.41 | 542.93 | 158.63 | -| | 5000/500 | 1 | 533.37 | 303.33 | 112.68 | | | | -| | | 2 | 936.82 | 379.62 | 899.29 | 224.65 | 115.00 | | -| | | 4 | 1,442.76 | 1,384.62 | 1,326.95 | 853.73 | 361.06 | 235.19 | -| | | 8 | 2,797.36 | | 2,483.56 | 1,437.15 | 678.70 | 225.15 | -| | 500/2000 | 1 | 2,763.89 | 1,074.62 | 293.47 | | | | -| | | 2 | 6,054.46 | 1,109.13 | 4,356.55 | 683.11 | 423.82 | | -| | | 4 | 10,103.08 | 7,325.93 | 8,370.32 | 3,436.29 | 2,064.47 | 1,412.78 | -| | | 8 | 16,857.45 | | 10,760.65 | 5,665.02 | 3,159.89 | 1,517.76 | -| | 1000/1000 | 1 | 2,540.45 | 1,164.45 | | | | | -| | | 2 | 4,590.38 | 1,040.64 | 3,879.25 | 768.53 | 453.73 | | -| | | 4 | 7,606.92 | 6,655.61 | 6,547.23 | 3,655.19 | 1,732.86 | 1,117.53 | -| | | 8 | 12,660.32 | | 11,155.47 | 5,617.24 | 2,894.58 | 1,126.50 | -| | 2048/2048 | 1 | 1,746.77 | 610.87 | 162.10 | | | | -| | | 2 | 3,405.72 | 738.51 | 2,548.70 | | 108.66 | | -| | | 4 | 6,571.34 | 4,880.28 | 5,060.39 | 2,391.55 | 1,222.11 | 854.65 | -| | | 8 | 9,923.96 | | 8,480.48 | 3,826.38 | 2,181.07 | 927.54 | -| LLaMA v2 70B | 128/128 | 1 | 3,969.25 | 3,502.35 | 3,413.82 | | | | -| | | 2 | 6,394.64 | 3,252.69 | 6,432.82 | 3,170.28 | 1,336.48 | | -| | | 4 | 11,031.42 | 11,126.95 | 10,865.42 | 6,420.88 | 2,766.00 | 1,487.71 | -| | | 8 | 17,060.04 | | 16,384.83 | 11,146.15 | 4,742.74 | 1,404.99 | -| | 128/2048 | 1 | 3,742.99 | 1,660.81 | | | | | -| | | 2 | 6,453.25 | 1,335.80 | 5,775.34 | 757.21 | 476.46 | | -| | | 4 | 13,869.67 | 11,098.69 | 9,536.82 | 5,274.27 | 2,686.16 | 1,880.22 | -| | | 8 | 19,220.48 | | 17,715.01 | 8,904.94 | 5,520.41 | 2,186.68 | -| | 128/4096 | 1 | 2,459.63 | | 446.60 | | | | -| | | 2 | 4,831.03 | 684.68 | 3,354.60 | 385.98 | 235.22 | | -| | | 4 | 8,988.84 | 8,397.13 | 7,619.62 | 3,228.36 | 1,941.07 | 1,318.51 | -| | | 8 | 15,115.41 | | 12,506.95 | 5,996.81 | 3,539.36 | 1,782.93 | -| | 2048/128 | 1 | 458.88 | 400.31 | 328.90 | | | | -| | | 2 | 745.71 | 457.57 | 742.17 | 308.02 | 138.81 | | -| | | 4 | 1,297.10 | 1,330.90 | 1,270.78 | 755.30 | 321.72 | 171.67 | -| | | 8 | 2,060.53 | | 2,009.57 | 1,348.71 | 561.71 | 160.37 | -| | 5000/500 | 1 | 548.46 | 364.00 | 224.17 | | | | -| | | 2 | 1,020.86 | 335.07 | 885.67 | 212.20 | 112.43 | | -| | | 4 | 1,759.69 | 1,683.26 | 1,590.94 | 837.57 | 386.78 | 
231.54 | -| | | 8 | 2,839.69 | | 2,546.12 | 1,570.91 | 709.66 | 238.59 | -| | 500/2000 | 1 | 3,019.28 | 1,364.66 | 716.54 | | | | -| | | 2 | 6,402.94 | 1,292.24 | 4,462.98 | 629.21 | 387.61 | | -| | | 4 | 12,429.18 | 8,951.07 | 8,753.09 | 4,012.41 | 2,158.17 | 1,517.53 | -| | | 8 | 16,789.12 | | 15,260.29 | 7,384.79 | 4,104.80 | 1,739.28 | -| | 1000/1000 | 1 | 2,706.04 | 1,449.83 | | | | | -| | | 2 | 4,693.24 | 960.39 | 3,958.45 | 736.68 | 425.70 | | -| | | 4 | 8,557.11 | 7,278.64 | 6,817.41 | 3,866.05 | 1,876.40 | 1,188.91 | -| | | 8 | 13,483.04 | | 11,511.74 | 6,543.96 | 3,285.82 | 1,241.42 | -| | 2048/2048 | 1 | 1,911.20 | 798.50 | 412.37 | | | | -| | | 2 | 3,408.82 | 767.24 | 2,551.21 | 388.82 | 226.60 | | -| | | 4 | 6,702.46 | 5,354.80 | 5,212.02 | 2,512.22 | 1,316.92 | 891.95 | -| | | 8 | 10,348.65 | | 8,016.14 | 4,414.75 | 2,492.09 | 1,083.26 | -| Mixtral 8x7B | 128/128 | 2 | 25,135.25 | 8,512.51 | 24,572.90 | 15,395.59 | 5,927.88 | | -| | | 4 | 42,394.61 | 40,148.01 | 40,309.25 | 27,747.43 | 11,205.51 | 6,784.44 | -| | | 8 | 54,648.80 | | 51,683.16 | 40,116.51 | 18,496.66 | 6,437.72 | -| | 128/2048 | 2 | 29,412.17 | 3,271.02 | 20,938.80 | 7,391.51 | 4,278.79 | | -| | | 4 | 52,603.13 | 43,071.34 | 40,580.94 | 21,332.15 | 10,946.58 | 7,475.05 | -| | | 8 | 70,427.00 | | 64,161.64 | 41,101.18 | 21,235.99 | 9,955.21 | -| | 128/4096 | 2 | 21,312.11 | 2,254.56 | | 3,896.02 | 2,388.14 | | -| | | 4 | 39,353.01 | 30,065.77 | | | 7,108.03 | 5,232.44 | -| | | 8 | 32,992.62 | | 47,860.65 | 27,261.67 | 15,943.70 | 8,081.21 | -| | 2048/128 | 2 | 2,946.01 | 921.87 | 2,894.09 | 1,790.49 | 684.71 | | -| | | 4 | 5,237.58 | 5,056.60 | 4,988.14 | 3,354.89 | 1,338.54 | 803.50 | -| | | 8 | 7,053.32 | | 6,559.63 | 5,072.46 | 2,244.39 | 753.39 | -| | 5000/500 | 2 | 3,848.10 | 997.06 | 3,630.24 | 1,656.04 | 739.84 | | -| | | 4 | 6,877.65 | 6,466.39 | 6,237.22 | 3,607.46 | 1,619.49 | 1,048.60 | -| | | 8 | 9,531.26 | | 8,709.34 | 6,237.96 | 2,927.13 | 1,109.25 | -| | 500/2000 | 2 | 23,539.24 | 2,773.86 | 16,886.30 | 5,773.33 | 3,325.73 | | -| | | 4 | 40,035.05 | 33,478.35 | 32,047.73 | 16,897.03 | 8,908.09 | 6,153.32 | -| | | 8 | 60,572.77 | | 41,597.80 | 31,392.32 | 16,954.54 | 7,980.34 | -| | 1000/1000 | 2 | 18,644.51 | 4,540.15 | 14,154.95 | 5,826.43 | 3,289.27 | | -| | | 4 | 32,709.62 | 29,046.16 | 25,291.30 | 14,307.91 | 7,461.63 | 4,697.19 | -| | | 8 | 44,072.88 | | 40,628.46 | 27,633.48 | 13,741.62 | 5,706.17 | -| | 2048/2048 | 2 | 14,017.70 | 2,870.77 | 10,448.79 | 3,535.21 | 1,954.32 | | -| | | 4 | 25,550.44 | 21,488.32 | 19,977.11 | 9,620.99 | 5,191.30 | 3,593.18 | -| | | 8 | 24,999.94 | | 31,678.85 | 19,372.52 | 10,572.07 | 4,860.61 | -| | 20000/2000 | 2 | 2,195.84 | 367.81 | 1,583.86 | 626.60 | 320.41 | | -| | | 4 | 4,086.41 | 3,301.28 | 2,982.42 | 1,586.09 | 807.67 | 579.49 | -| | | 8 | 5,797.57 | | 5,163.91 | 3,106.98 | 1,653.55 | 821.64 | +**All data in the table below was generated using version 0.14.0 and presents token throughput in tokens/second.** + +| | | | | | | | | | +| --------------- | ------------------------ | ------------- | ------------------- | ------------------ | ------------------ | ------------------ | ------------------ | --------- | +| | | **GPU** | **H200 141GB HBM3** | **H100 80GB HBM3** | **H100 80GB HBM3** | **A100-SXM4-80GB** | **A100-PCIE-80GB** | **L40S** | +| | | **Precision** | **FP8** | **FP8** | **FP16** | **FP16** | **FP16** | **FP8** | +| **Model** | **Input/Output Lengths** | **TP Size** | | | | | | | +| LLaMA v3 70B | 1000/1000 | 1 | 2594.2199 | 464.5243 | | 
| | | +| | | 2 | 4574.1197 | 4092.3267 | 776.9965 | 468.5805 | 259.1155 | | +| | | 4 | 7612.2487 | 6925.0844 | 3730.2064 | 1765.9123 | 987.1971 | 1159.357 | +| | | 8 | 13075.5194 | 10733.0804 | 5963.0914 | 3054.8915 | 960.3737 | 1173.3517 | +| | 128/128 | 1 | 3904.1639 | 2551.6384 | | | | | +| | | 2 | 5343.8677 | 5191.7428 | 3183.9714 | 1334.903 | 806.1477 | | +| | | 4 | 8829.1049 | 8540.5362 | 5837.9598 | 2421.4383 | 1275.5474 | 1427.9115 | +| | | 8 | 16359.1322 | 15498.2004 | 10597.6556 | 4474.1621 | 1223.1747 | 1377.473 | +| | 128/2048 | 1 | 3613.7474 | 418.3639 | | | | | +| | | 2 | 7112.2959 | 5852.0185 | 817.52 | 511.6257 | | | +| | | 4 | 12772.8148 | 8998.3742 | 5072.0345 | 2484.2018 | 1471.9105 | 1771.4437 | +| | | 8 | 19722.5974 | 15099.0633 | 7554.2141 | 4463.6602 | 1589.1759 | 1953.7918 | +| | 128/4096 | 1 | 2409.6881 | | | | | | +| | | 2 | 5687.3482 | 3513.0941 | 413.3767 | 273.5871 | | | +| | | 4 | 8937.3115 | 6718.5895 | 3093.7358 | 1688.0132 | 1231.8104 | 1279.2496 | +| | | 8 | 13976.1386 | 9279.1013 | 5001.2743 | 2948.5374 | 1350.794 | 1494.0776 | +| | 2048/128 | 1 | 457.5772 | 241.7561 | | | | | +| | | 2 | 699.5582 | 690.9961 | 328.0399 | 145.088 | 91.1746 | | +| | | 4 | 1035.6523 | 1008.8318 | 670.6725 | 278.5717 | 150.2619 | 168.7886 | +| | | 8 | 2055.7245 | 1996.2653 | 1288.7599 | 546.9599 | 140.0144 | 160.2741 | +| | 2048/2048 | 1 | 1802.1116 | 204.0931 | | | | | +| | | 2 | 3487.2497 | 2444.6903 | 165.6522 | 126.1101 | | | +| | | 4 | 6126.7196 | 4850.8285 | 2386.6556 | 1230.1833 | 822.2269 | 876.6085 | +| | | 8 | 9784.0193 | 7432.6659 | 3991.2123 | 2144.3042 | 883.4809 | 994.94 | +| | 500/2000 | 1 | 2822.7846 | 389.8823 | | | | | +| | | 2 | 6175.7623 | 4601.857 | 687.5386 | 430.6093 | | | +| | | 4 | 10783.8925 | 9018.9053 | 3698.3674 | 2113.3936 | 1248.8319 | 1468.7827 | +| | | 8 | 17631.9756 | 11375.9582 | 6321.3679 | 3673.5693 | 1321.8541 | 1636.4588 | +| | 5000/500 | 1 | 532.2603 | 123.8543 | | | | | +| | | 2 | 931.8255 | 897.4263 | 227.9005 | 117.5698 | 75.35 | | +| | | 4 | 1399.7865 | 1316.2865 | 831.2804 | 362.3465 | 209.8052 | 234.7343 | +| | | 8 | 2725.1283 | 2469.5585 | 1446.3508 | 662.5725 | 202.0719 | 231.9027 | +| LLaMA v3.1 405B | 1000/1000 | 8 | 3391.0372 | | | | | | +| | 128/128 | 8 | 3766.2785 | | | | | | +| | 128/2048 | 8 | 5952.1416 | | | | | | +| | 128/4096 | 8 | 3944.117 | | | | | | +| | 20000/2000 | 8 | 481.5732 | | | | | | +| | 2048/128 | 8 | 444.5735 | | | | | | +| | 2048/2048 | 8 | 2604.8557 | | | | | | +| | 500/2000 | 8 | 4805.86 | | | | | | +| | 5000/500 | 8 | 655.9754 | | | | | | +| LLaMA v3.1 70B | 1000/1000 | 1 | 2585.0953 | 410.286 | | | | | +| | | 2 | 4600.9616 | 4116.4444 | 785.4931 | 468.6383 | 257.972 | | +| | | 4 | 7607.5304 | 6932.8808 | 3774.676 | 1762.6831 | 989.4082 | 1161.4814 | +| | | 8 | 13081.434 | 10730.156 | 5978.4573 | 3190.0211 | 959.8463 | 1188.1193 | +| | 128/128 | 1 | 3897.2623 | 2459.6003 | | | | | +| | | 2 | 5357.0227 | 5194.8171 | 3207.2866 | 1346.9692 | 806.7215 | | +| | | 4 | 8826.9618 | 8542.3012 | 5846.8413 | 2420.8665 | 1272.6755 | 1438.0446 | +| | | 8 | 16382.9807 | 15533.1169 | 10649.4968 | 4572.3445 | 1212.0566 | 1381.7051 | +| | 128/2048 | 1 | 3612.2603 | 445.7773 | | | | | +| | | 2 | 7054.7235 | 5869.3998 | 822.1912 | 483.1299 | | | +| | | 4 | 12763.4114 | 9017.4377 | 4982.6225 | 2492.4036 | 1435.236 | 1763.522 | +| | | 8 | 19266.0398 | 15190.1652 | 7605.5295 | 4254.2871 | 1609.2473 | 1944.1251 | +| | 128/4096 | 1 | 2415.1981 | | | | | | +| | | 2 | 5671.9561 | 3518.782 | 419.0178 | 272.9137 | | | +| | | 4 | 
8939.8227 | 6431.2702 | 3083.8794 | 1685.9677 | 1212.5416 | 1280.3778 | +| | | 8 | 13974.2854 | 9168.709 | 4981.9765 | 3067.5452 | 1310.091 | 1499.2441 | +| | 20000/2000 | 1 | 240.7202 | | | | | | +| | | 2 | 614.318 | 397.6801 | | | | | +| | | 4 | 1030.9528 | 851.8542 | 369.4269 | 179.5181 | 126.7676 | 140.5565 | +| | | 8 | 1898.9762 | 1354.5333 | | 362.9368 | 156.5767 | 141.1584 | +| | 2048/128 | 1 | 458.1948 | 244.1842 | | | | | +| | | 2 | 692.3911 | 697.3907 | 322.7016 | 144.7921 | 95.0306 | | +| | | 4 | 1034.5773 | 1001.0771 | 688.0344 | 278.4018 | 150.6795 | 169.0386 | +| | | 8 | 2070.8157 | 1966.6072 | 1316.3086 | 550.4751 | 142.6166 | 163.6749 | +| | 2048/2048 | 1 | 1797.6743 | 209.1707 | | | | | +| | | 2 | 3518.0774 | 2445.0093 | 166.792 | 126.1127 | | | +| | | 4 | 6112.9026 | 4838.5272 | 2393.1359 | 1231.0359 | 823.4777 | 876.2254 | +| | | 8 | 9716.1934 | 7434.8117 | 4023.6978 | 2171.5323 | 858.6602 | 1001.3649 | +| | 500/2000 | 1 | 2826.6665 | | | | | | +| | | 2 | 6106.5855 | 4605.9226 | 700.5415 | 430.6129 | | | +| | | 4 | 10816.8283 | 9205.3766 | 3781.082 | 2096.2441 | 1176.418 | 1470.0826 | +| | | 8 | 17693.705 | 13109.4437 | 6205.2658 | 3486.7891 | 1306.35 | 1639.2778 | +| | 5000/500 | 1 | 533.6128 | 125.4236 | | | | | +| | | 2 | 936.7014 | 886.6758 | 228.874 | 116.9529 | 76.1601 | | +| | | 4 | 1386.4827 | 1313.893 | 849.1091 | 362.9361 | 209.2045 | 236.117 | +| | | 8 | 2711.5057 | 2444.9643 | 1420.5163 | 670.3742 | 203.8008 | 230.3084 | +| LLaMA v3.1 8B | 1000/1000 | 1 | 16414.6988 | 14108.0361 | 7054.5156 | 3634.3886 | 3165.3542 | 3726.7552 | +| | 128/128 | 1 | 27778.8885 | 26933.1886 | 15571.6549 | 6701.7958 | 5338.0166 | 8639.7933 | +| | 128/2048 | 1 | 22948.5383 | 18995.2523 | 9150.7477 | 4963.4443 | 4250.6391 | 5101.6652 | +| | 128/4096 | 1 | 15583.3035 | 11815.449 | 5368.9227 | 3011.3335 | 2568.5398 | 2774.5363 | +| | 20000/2000 | 1 | 1649.5453 | 1301.4754 | 562.8735 | 316.533 | 291.4776 | 270.5404 | +| | 2048/128 | 1 | 3619.4309 | 3460.3545 | 1904.3259 | 795.389 | 611.8446 | 986.9134 | +| | 2048/2048 | 1 | 11032.9729 | 8777.6623 | 4159.6857 | 2264.9513 | 2011.1215 | 2018.303 | +| | 500/2000 | 1 | 19510.4015 | 14993.328 | 7498.3331 | 3945.1912 | 3374.7133 | 4065.3921 | +| | 5000/500 | 1 | 3787.6721 | 3258.2001 | 1708.0353 | 790.6631 | 703.56 | 855.9822 | +| Mistral 7B | 1000/1000 | 1 | 17739.1436 | 14986.7562 | 7697.1418 | 3804.5585 | 3333.4754 | 3981.4799 | +| | 128/128 | 1 | 30094.9137 | 29341.284 | 16238.937 | 6914.2184 | 5491.7418 | 9127.5052 | +| | 128/2048 | 1 | 24671.5477 | 20941.6631 | 9708.1161 | 5303.4318 | 4402.3044 | 5357.3405 | +| | 128/4096 | 1 | 16454.0833 | 12780.3724 | 5800.4957 | 3235.0678 | 2825.7896 | 2879.9833 | +| | 20000/2000 | 1 | 1676.0415 | 1317.9654 | 569.7589 | 324.5936 | 281.4751 | 286.353 | +| | 2048/128 | 1 | 3649.1462 | 3492.3042 | 1929.3126 | 800.9286 | 617.0932 | 1019.75 | +| | 2048/2048 | 1 | 11403.6968 | 8974.7383 | 4367.8733 | 2331.8112 | 1988.3496 | 2184.3861 | +| | 500/2000 | 1 | 20819.4592 | 15992.3357 | 7947.4257 | 4189.395 | 3603.4489 | 4286.3867 | +| | 5000/500 | 1 | 3840.0108 | 3340.7385 | 1707.2611 | 807.4561 | 722.8385 | 881.7336 | +| Mixtral 8x22B | 1000/1000 | 8 | 18557.43 | 16918.03 | 9759.888 | 4753.6273 | | 2128.4403 | +| | 128/128 | 8 | 25179.4765 | 23729.5293 | 16421.3182 | 6948.5923 | | 2488.6297 | +| | 128/2048 | 8 | 27492.4926 | 24556.7807 | 12303.4168 | 7246.7172 | | 3540.0067 | +| | 128/4096 | 8 | 19718.8648 | 17755.0018 | 7474.3817 | 4696.6123 | | 2568.3114 | +| | 20000/2000 | 8 | 2897.182 | 2189.606 | 
1118.8294 | 594.8509 | | 309.0799 | +| | 2048/128 | 8 | 3093.8418 | 2917.1362 | 1994.0127 | 825.3934 | | 294.7706 | +| | 2048/2048 | 8 | 13795.9827 | 12487.6502 | 5857.8831 | 3377.8371 | | 1694.6176 | +| | 500/2000 | 8 | 24637.473 | 19997.3914 | 10637.6598 | 6007.619 | | 2976.9633 | +| | 5000/500 | 8 | 3889.2745 | 3578.4843 | 2211.2377 | 1028.3843 | | 420.2156 | +| Mixtral 8x7B | 1000/1000 | 2 | 18712.2046 | 15931.8663 | 6052.876 | 3276.6186 | 1907.8817 | | +| | | 4 | 32834.0923 | 28015.1981 | 15509.1538 | 7357.1613 | 4737.0179 | 5060.8399 | +| | | 8 | 44410.7533 | 40573.0499 | 27684.9381 | 13948.1533 | 4970.9287 | 5725.9638 | +| | 128/128 | 2 | 24970.5594 | 24321.9927 | 15334.2103 | 5915.3897 | 3810.1846 | | +| | | 4 | 42500.5855 | 40182.7271 | 27718.9857 | 11328.7486 | 6026.9206 | 6769.9441 | +| | | 8 | 54304.0436 | 51030.9048 | 40119.3268 | 17918.1146 | 5573.7682 | 6422.4308 | +| | 128/2048 | 2 | 29314.1475 | 20945.7816 | 7409.9253 | 4284.3035 | 2248.1815 | | +| | | 4 | 52680.8353 | 40668.5928 | 21293.1761 | 10929.0182 | 7353.7405 | 7506.7612 | +| | | 8 | 70409.1968 | 64529.9982 | 40839.3077 | 21058.2144 | 8866.251 | 9907.6896 | +| | 128/4096 | 2 | 21520.4385 | 12070.6724 | 3928.6678 | 2302.964 | 1171.966 | | +| | | 4 | 32550.5267 | 29120.2002 | 11678.0071 | 6538.1511 | 5176.9632 | 4958.7004 | +| | | 8 | 40373.4857 | 36357.7861 | 21628.821 | 13565.7778 | 7209.2336 | 8271.7938 | +| | 20000/2000 | 2 | 2204.1378 | 1659.5907 | 622.2717 | 321.9839 | 185.6671 | | +| | | 4 | 4047.7473 | 3290.9457 | 1602.0208 | 778.7285 | 572.4282 | 587.1759 | +| | | 8 | 6561.6849 | 5328.5261 | 3113.2047 | 1645.8114 | 750.5372 | 828.8471 | +| | 2048/128 | 2 | 2958.0873 | 2883.5166 | 1796.5451 | 687.7251 | 465.1585 | | +| | | 4 | 5229.8744 | 4972.6818 | 3354.994 | 1351.7191 | 728.4943 | 812.0143 | +| | | 8 | 7030.9766 | 6532.721 | 5025.3047 | 2248.6418 | 677.9886 | 771.3656 | +| | 2048/2048 | 2 | 13842.834 | 9334.0732 | 3503.0218 | 1997.1923 | 1060.8946 | | +| | | 4 | 22389.4914 | 20185.8212 | 9143.2741 | 4963.8758 | 3520.3659 | 3453.8076 | +| | | 8 | 28975.322 | 26176.9163 | 19291.8278 | 10552.9732 | 4590.187 | 4929.7228 | +| | 500/2000 | 2 | 23459.0411 | 18185.6392 | 6023.3308 | 3438.6964 | 1817.11 | | +| | | 4 | 39971.0236 | 31693.8787 | 17087.037 | 8930.3495 | 6117.5624 | 6434.9178 | +| | | 8 | 60721.462 | 48842.8084 | 31358.2791 | 17034.706 | 7118.0767 | 8130.8026 | +| | 5000/500 | 2 | 3742.5293 | 3563.8228 | 1648.9041 | 733.1921 | 448.6716 | | +| | | 4 | 6602.3877 | 6020.6267 | 3543.6819 | 1603.8223 | 948.0567 | 1047.3212 | +| | | 8 | 8862.8164 | 8214.9445 | 5968.7734 | 2813.1531 | 969.817 | 1098.3081 | + *TP stands for Tensor Parallelism* ## Reproducing Benchmarked Results @@ -226,7 +180,7 @@ The performance numbers below were collected using the steps described in this d > [!NOTE] The only models supported in this workflow are those listed in the table above. The following tables are references for commands that are used as part of the benchmarking process. For a more detailed -description of this benchmarking workflow, see the [Benchmarking Suite README](../../../benchmarks/Suite.md). +description of this benchmarking workflow, see the [benchmarking suite documentation](https://nvidia.github.io/TensorRT-LLM/performance/perf-benchmarking.html). 
### Commands @@ -277,10 +231,7 @@ remain in the system longer and therefore require less requests to achieve stead | 128 | 4096 | 4224 | 1500 | | 2048 | 128 | 2176 | 3000 | | 2048 | 2048 | 4096 | 1500 | -| 5000 | 500 | 5500 | 1500 | -| 1000 | 1000 | 2000 | 3000 | -| 500 | 2000 | 2500 | 3000 | -| 20000 | 2000 | 22000 | 1000 | + ## Engine Building diff --git a/docs/source/quick-start-guide.md b/docs/source/quick-start-guide.md index ea62b081e..1db1b7ca2 100644 --- a/docs/source/quick-start-guide.md +++ b/docs/source/quick-start-guide.md @@ -16,6 +16,18 @@ This is the starting point to try out TensorRT-LLM. Specifically, this Quick Sta git clone https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct ``` +## LLM API +The LLM API is a Python API designed to facilitate setup and inference with TensorRT-LLM directly within Python. It enables model optimization by simply specifying a HuggingFace repository name or a model checkpoint. The LLM API streamlines the process by managing checkpoint conversion, engine building, engine loading, and model inference, all through a single Python object. + +Here is a simple example to show how to use the LLM API with TinyLlama. + +```{literalinclude} ../../examples/llm-api/quickstart_example.py + :language: python + :linenos: +``` + +To learn more about the LLM API, check out the [](llm-api/index) and [](llm-api-examples/index). + (quick-start-guide-compile)= ## Compile the Model into a TensorRT Engine @@ -68,19 +80,6 @@ cd tensorrtllm_backend 2. Refer to [End to end workflow to run llama 7b](https://github.com/triton-inference-server/tensorrtllm_backend/blob/main/docs/llama.md) in the TensorRT-LLM backend repository to deploy the model with Triton Inference Server. -## LLM API -The LLM API is a Python API to setup & infer with TensorRT-LLM directly in python.It allows for optimizing models by specifying a HuggingFace repo name or a model checkpoint. The LLM API handles checkpoint conversion, engine building, engine loading, and model inference, all from one python object. - -Note that these APIs are in incubation, they may change and supports the [following models](https://nvidia.github.io/TensorRT-LLM/llm-api-examples/index.html#supported-model), which will increase in coming release. We appreciate your patience and understanding as we improve this API. - -Here is a simple example to show how to use the LLM API with TinyLlama. - -```{literalinclude} ../../examples/llm-api/quickstart_example.py - :language: python - :linenos: -``` - -To learn more about the LLM API, check out the [](llm-api-examples/index) and [](llm-api/index). 
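For readers of this patch, the `literalinclude` above pulls in `examples/llm-api/quickstart_example.py`, which is not shown in the diff. Below is a minimal sketch of the kind of usage that example demonstrates, based on the LLM API calls that appear elsewhere in this change (`LLM`, `SamplingParams`, `generate`); the exact contents of the quickstart file may differ.

```python
# Minimal LLM API sketch (illustrative only; see examples/llm-api/quickstart_example.py
# in the repository for the actual quickstart code).
from tensorrt_llm import LLM, SamplingParams

# A HuggingFace repo name, or a local HF checkpoint / TensorRT-LLM engine directory.
llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0")

prompts = [
    "Hello, my name is",
    "The capital of France is",
]
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

# generate() covers checkpoint conversion, engine building, and inference end to end.
for output in llm.generate(prompts, sampling_params):
    print(f"Prompt: {output.prompt!r}, Generated text: {output.outputs[0].text!r}")
```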
## Next Steps diff --git a/docs/source/reference/support-matrix.md b/docs/source/reference/support-matrix.md index bb7071aab..e77270eaa 100644 --- a/docs/source/reference/support-matrix.md +++ b/docs/source/reference/support-matrix.md @@ -43,7 +43,7 @@ TensorRT-LLM optimizes the performance of a range of well-known models on NVIDIA - [Qwen/Qwen1.5/Qwen2](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/qwen) - [Qwen-VL](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/qwenvl) - [RecurrentGemma](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/recurrentgemma) -- [Replit Code](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/mpt) +- [Replit Code](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/mpt)[^ReplitCode] - [RoBERTa](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/bert) - [SantaCoder](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/gpt) - [Skywork](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/skywork) @@ -117,6 +117,7 @@ The following table shows the supported software for TensorRT-LLM. - Turing (SM75) - FP32, FP16, INT8[^smooth], INT4 - Volta (SM70) - FP32, FP16, INT8[^smooth], INT4[^smlt75] ``` +[^ReplitCode]:Replit Code is not supported with the transformers 4.45+. [^smooth]: INT8 SmoothQuant is not supported on SM70 and SM75. diff --git a/docs/source/reference/troubleshooting.md b/docs/source/reference/troubleshooting.md index 9fcffae71..43e053cb8 100644 --- a/docs/source/reference/troubleshooting.md +++ b/docs/source/reference/troubleshooting.md @@ -12,20 +12,6 @@ Then, we print the values at runtime. Many build errors can be resolved by simply deleting the build tree. Try running the build script with `--clean` or running `rm -r cpp/build`. -## cuDNN Linking Errors - -If you encounter errors such as "Entry Point Not Found" (see for example [#1062](https://github.com/NVIDIA/TensorRT-LLM/issues/1062)) the issue might be a mismatch in the `cuDNN` libraries shipped from `torch` and `tensorrt`. To rectify this, please try the following steps - -``` -python -m pip uninstall -y tensorrt_llm -python -m pip install --upgrade pip -python -m pip install nvidia-cudnn-cu11==8.9.4.25 --no-cache-dir -python -m pip install --pre --extra-index-url https://pypi.nvidia.com/ tensorrt==9.2.0.post12.dev5 --no-cache-dir -python -m pip uninstall -y nvidia-cudnn-cu11 -python -m pip install tensorrt_llm --extra-index-url https://pypi.nvidia.com/ --extra-index-url https://download.pytorch.org/whl/cu121 -``` - - ## Debug on Unit Tests 1. Register the intermediate tensors as the network outputs with `register_network_output` API. diff --git a/docs/source/release-notes.md b/docs/source/release-notes.md index 9dd351dd2..a319d0209 100644 --- a/docs/source/release-notes.md +++ b/docs/source/release-notes.md @@ -5,6 +5,52 @@ All published functionality in the Release Notes has been fully tested and verified with known limitations documented. To share feedback about this release, access our [NVIDIA Developer Forum](https://forums.developer.nvidia.com/). +## TensorRT-LLM Release 0.14.0 + +### Key Features and Enhancements + - Enhanced the `LLM` class in the [LLM API](https://nvidia.github.io/TensorRT-LLM/llm-api/index.html). + - Added support for calibration with offline dataset. + - Added support for Mamba2. + - Added support for `finish_reason` and `stop_reason`. + - Added FP8 support for CodeLlama. 
+ - Added `__repr__` methods for class `Module`, thanks to the contribution from @1ytic in #2191. + - Added BFloat16 support for fused gated MLP. + - Updated ReDrafter beam search logic to match Apple ReDrafter v1.1. + - Improved `customAllReduce` performance. + - Draft model now can copy logits directly over MPI to the target model's process in `orchestrator` mode. This fast logits copy reduces the delay between draft token generation and the beginning of target model inference. + - NVIDIA Volta GPU support is deprecated and will be removed in a future release. + +### API Changes + - [BREAKING CHANGE] The default `max_batch_size` of the `trtllm-build` command is set to `2048`. + - [BREAKING CHANGE] Remove `builder_opt` from the `BuildConfig` class and the `trtllm-build` command. + - Add logits post-processor support to the `ModelRunnerCpp` class. + - Added `isParticipant` method to the C++ `Executor` API to check if the current process is a participant in the executor instance. + +### Model Updates + - Added support for NemotronNas, see `examples/nemotron_nas/README.md`. + - Added support for Deepseek-v1, see `examples/deepseek_v1/README.md`. + - Added support for Phi-3.5 models, see `examples/phi/README.md`. + +### Fixed Issues + - Fixed a typo in `tensorrt_llm/models/model_weights_loader.py`, thanks to the contribution from @wangkuiyi in #2152. + - Fixed duplicated import module in `tensorrt_llm/runtime/generation.py`, thanks to the contribution from @lkm2835 in #2182. + - Enabled `share_embedding` for the models that have no `lm_head` in legacy checkpoint conversion path, thanks to the contribution from @lkm2835 in #2232. + - Fixed `kv_cache_type` issue in the Python benchmark, thanks to the contribution from @qingquansong in #2219. + - Fixed an issue with SmoothQuant calibration with custom datasets. Thanks to the contribution by @Bhuvanesh09 in #2243. + - Fixed an issue surrounding `trtllm-build --fast-build` with fake or random weights. Thanks to @ZJLi2013 for flagging it in #2135. + - Fixed missing `use_fused_mlp` when constructing `BuildConfig` from dict, thanks for the fix from @ethnzhng in #2081. + - Fixed lookahead batch layout for `numNewTokensCumSum`. (#2263) + +### Infrastructure Changes + - The dependent ModelOpt version is updated to v0.17. + +### Documentation + - @Sherlock113 added a [tech blog](https://www.bentoml.com/blog/tuning-tensor-rt-llm-for-optimal-serving-with-bentoml) to the latest news in #2169, thanks for the contribution. + +### Known Issues + - Replit Code is not supported with the transformers 4.45+ + + ## TensorRT-LLM Release 0.13.0 ### Key Features and Enhancements @@ -63,10 +109,6 @@ All published functionality in the Release Notes has been fully tested and verif - The dependent PyTorch version is updated to 2.4.0. - The dependent ModelOpt version is updated to v0.15. -### Known Issues - -- On Windows, installation of TensorRT-LLM may succeed, but you might hit `OSError: exception: access violation reading 0x0000000000000000` when importing the library in Python. See [Installing on Windows](https://nvidia.github.io/TensorRT-LLM/installation/windows.html) for workarounds. 
- ## TensorRT-LLM Release 0.12.0 @@ -440,11 +482,11 @@ All published functionality in the Release Notes has been fully tested and verif ### Key Features and Enhancements -- Chunked context support (see docs/source/gpt_attention.md#chunked-context) +- Chunked context support (see docs/source/advanced/gpt-attention.md#chunked-context) - LoRA support for C++ runtime (see docs/source/lora.md) - Medusa decoding support (see examples/medusa/README.md) - The support is limited to Python runtime for Ampere or newer GPUs with fp16 and bf16 accuracy, and the `temperature` parameter of sampling configuration should be 0 -- StreamingLLM support for LLaMA (see docs/source/gpt_attention.md#streamingllm) +- StreamingLLM support for LLaMA (see docs/source/advanced/gpt-attention.md#streamingllm) - Support for batch manager to return logits from context and/or generation phases - Include support in the Triton backend - Support AWQ and GPTQ for QWEN diff --git a/examples/apps/fastapi_server.py b/examples/apps/fastapi_server.py index c9e5d70fb..96b5f0d2f 100755 --- a/examples/apps/fastapi_server.py +++ b/examples/apps/fastapi_server.py @@ -2,6 +2,9 @@ import asyncio import json import logging +import signal +from contextlib import asynccontextmanager +from http import HTTPStatus from typing import AsyncGenerator, Optional import click @@ -9,6 +12,7 @@ from fastapi import FastAPI, Request from fastapi.responses import JSONResponse, Response, StreamingResponse +from tensorrt_llm.executor import CppExecutorError, RequestError from tensorrt_llm.hlapi import LLM, BuildConfig, KvCacheConfig, SamplingParams TIMEOUT_KEEP_ALIVE = 5 # seconds. @@ -16,11 +20,16 @@ class LlmServer: - def __init__(self, llm: LLM, kv_cache_config: KvCacheConfig): + def __init__(self, llm: LLM): self.llm = llm - self.kv_cache_config = kv_cache_config - self.app = FastAPI() + @asynccontextmanager + async def lifespan(app: FastAPI): + # terminate rank0 worker + yield + self.llm._shutdown() + + self.app = FastAPI(lifespan=lifespan) self.register_routes() def register_routes(self): @@ -50,20 +59,27 @@ async def generate(self, request: Request) -> Response: sampling_params = SamplingParams(**request_dict) - promise = self.llm.generate_async(prompt, - streaming=streaming, - sampling_params=sampling_params) + try: + promise = self.llm.generate_async(prompt, + streaming=streaming, + sampling_params=sampling_params) - async def stream_results() -> AsyncGenerator[bytes, None]: - async for output in promise: - yield output.outputs[0].text_diff.encode("utf-8") + async def stream_results() -> AsyncGenerator[bytes, None]: + async for output in promise: + yield output.outputs[0].text_diff.encode("utf-8") - if streaming: - return StreamingResponse(stream_results()) + if streaming: + return StreamingResponse(stream_results()) - # Non-streaming case - await promise.aresult() - return JSONResponse({"text": promise.outputs[0].text}) + # Non-streaming case + await promise.aresult() + return JSONResponse({"text": promise.outputs[0].text}) + except RequestError as e: + return JSONResponse(content=str(e), + status_code=HTTPStatus.BAD_REQUEST) + except CppExecutorError: + # If internal executor error is raised, shutdown the server + signal.raise_signal(signal.SIGINT) async def __call__(self, host, port): config = uvicorn.Config(self.app, @@ -82,28 +98,32 @@ async def __call__(self, host, port): @click.option("--max_beam_width", type=int, default=1) @click.option("--tp_size", type=int, default=1) @click.option("--pp_size", type=int, default=1) 
+@click.option("--kv_cache_free_gpu_memory_fraction", type=float, default=0.8) def entrypoint(model_dir: str, tokenizer: Optional[str] = None, host: Optional[str] = None, port: int = 8000, max_beam_width: int = 1, tp_size: int = 1, - pp_size: int = 1): + pp_size: int = 1, + kv_cache_free_gpu_memory_fraction: float = 0.8): host = host or "0.0.0.0" port = port or 8000 logging.info(f"Starting server at {host}:{port}") build_config = BuildConfig(max_batch_size=10, max_beam_width=max_beam_width) + kv_cache_config = KvCacheConfig( + free_gpu_memory_fraction=kv_cache_free_gpu_memory_fraction) + llm = LLM(model_dir, tokenizer, tensor_parallel_size=tp_size, pipeline_parallel_size=pp_size, - build_config=build_config) - - kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.8) + build_config=build_config, + kv_cache_config=kv_cache_config) - server = LlmServer(llm=llm, kv_cache_config=kv_cache_config) + server = LlmServer(llm=llm) asyncio.run(server(host, port)) diff --git a/examples/apps/openai_server.py b/examples/apps/openai_server.py index 5c502aa13..5323be8d4 100644 --- a/examples/apps/openai_server.py +++ b/examples/apps/openai_server.py @@ -1,6 +1,8 @@ #!/usr/bin/env python import asyncio import logging +import signal +from contextlib import asynccontextmanager from http import HTTPStatus from pathlib import Path from typing import (AsyncGenerator, AsyncIterator, List, Optional, Tuple, @@ -15,6 +17,7 @@ from transformers import AutoTokenizer, PreTrainedTokenizer # yapf: disable +from tensorrt_llm.executor import CppExecutorError from tensorrt_llm.hlapi import LLM, BuildConfig, KvCacheConfig from tensorrt_llm.hlapi.llm import RequestOutput from tensorrt_llm.hlapi.openai_protocol import ( @@ -66,10 +69,8 @@ class OpenaiServer: def __init__(self, llm: LLM, model: str, - kv_cache_config: KvCacheConfig, hf_tokenizer: PreTrainedTokenizer = None): self.llm = llm - self.kv_cache_config = kv_cache_config self.tokenizer = hf_tokenizer model_dir = Path(model) @@ -78,7 +79,13 @@ def __init__(self, else: self.model = model - self.app = FastAPI() + @asynccontextmanager + async def lifespan(app: FastAPI): + # terminate rank0 worker + yield + self.llm._shutdown() + + self.app = FastAPI(lifespan=lifespan) @self.app.exception_handler(RequestValidationError) async def validation_exception_handler(_, exc): @@ -326,7 +333,9 @@ async def create_chat_response(promise: RequestOutput) -> JSONResponse: else: response = await create_chat_response(promise) return JSONResponse(content=response.model_dump()) - + except CppExecutorError: + # If internal executor error is raised, shutdown the server + signal.raise_signal(signal.SIGINT) except Exception as e: return self.create_error_response(str(e)) @@ -432,6 +441,9 @@ async def create_completion_response(generator: AsyncIterator[Tuple[int, Request else: response = await create_completion_response(generator, num_choices) return JSONResponse(content=response.model_dump()) + except CppExecutorError: + # If internal executor error is raised, shutdown the server + signal.raise_signal(signal.SIGINT) except Exception as e: return self.create_error_response(str(e)) @@ -452,31 +464,34 @@ async def __call__(self, host, port): @click.option("--max_beam_width", type=int, default=1) @click.option("--tp_size", type=int, default=1) @click.option("--pp_size", type=int, default=1) +@click.option("--kv_cache_free_gpu_memory_fraction", type=float, default=0.8) def entrypoint(model_dir: str, tokenizer: Optional[str] = None, host: Optional[str] = None, port: int = 8000, max_beam_width: 
int = 1, tp_size: int = 1, - pp_size: int = 1): + pp_size: int = 1, + kv_cache_free_gpu_memory_fraction: float = 0.8): host = host or "0.0.0.0" port = port or 8000 logging.info(f"Starting server at {host}:{port}") build_config = BuildConfig(max_batch_size=10, max_beam_width=max_beam_width) + kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=kv_cache_free_gpu_memory_fraction) + llm = LLM(model_dir, tokenizer, tensor_parallel_size=tp_size, pipeline_parallel_size=pp_size, - build_config=build_config) + build_config=build_config, + kv_cache_config=kv_cache_config) - kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.8) hf_tokenizer = AutoTokenizer.from_pretrained(tokenizer or model_dir) server = OpenaiServer(llm=llm, model=model_dir, - kv_cache_config=kv_cache_config, hf_tokenizer=hf_tokenizer) asyncio.run(server(host, port)) diff --git a/examples/baichuan/requirements.txt b/examples/baichuan/requirements.txt index c6f02eff0..ea47f59f5 100644 --- a/examples/baichuan/requirements.txt +++ b/examples/baichuan/requirements.txt @@ -1,8 +1,8 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.13.0 +tensorrt_llm==0.14.0 datasets~=2.15.0 evaluate~=0.4.1 rouge_score~=0.1.2 -sentencepiece~=0.1.99 +sentencepiece>=0.1.99 cpm-kernels~=1.0.11 transformers_stream_generator~=0.0.4 diff --git a/examples/bindings/executor/README.md b/examples/bindings/executor/README.md index e3fdd5d3d..51a605c83 100644 --- a/examples/bindings/executor/README.md +++ b/examples/bindings/executor/README.md @@ -21,7 +21,7 @@ python3 example_basic.py --model_path=../llama/tmp/7B/trt_engines/fp16/1-gpu/ ### Debug example -This example shows how you can define which engine IO tensors should be dumped to numpy files. +This example shows how you can define which engine IO tensors should be kept or dumped to numpy files. Run `example_debug.py`, passing in the directory where the TensorRT engine was generated. For example: ``` diff --git a/examples/bindings/executor/example_advanced.py b/examples/bindings/executor/example_advanced.py index c6b7c31ea..6cd1303ed 100644 --- a/examples/bindings/executor/example_advanced.py +++ b/examples/bindings/executor/example_advanced.py @@ -124,7 +124,7 @@ def write_output_tokens(output_tokens_csv_file: str, request_ids: list[int], default=False, action="store_true", help= - "Exclude input token when writing output toekns. Only has effect for streaming=False since in streaming mode, input tokens are never included in output." + "Exclude input token when writing output tokens. Only has effect for streaming=False since in streaming mode, input tokens are never included in output." 
) parser.add_argument("--max_tokens", type=int, diff --git a/examples/bindings/executor/example_debug.py b/examples/bindings/executor/example_debug.py index 9fe817e27..f7c0669b1 100644 --- a/examples/bindings/executor/example_debug.py +++ b/examples/bindings/executor/example_debug.py @@ -16,14 +16,17 @@ type=str, required=True, help="Directory containing model engine") + parser.add_argument("--dump_tensors", + action="store_true", + help="Dump debug tensors to files") args = parser.parse_args() - # debug_config = trtllm.DebugConfig(dump_input_tensors=True, - # dump_output_tensors=True, - # debug_tensor_names=["test"]) + max_tokens = 2 - # Select which tensors should be dumped - debug_config = trtllm.DebugConfig(debug_tensor_names=["host_request_types"]) + # Select which tensors should be kept or dumped + debug_config = trtllm.DebugConfig( + debug_tensor_names=["sequence_length"], + debug_tensors_max_iterations=0 if args.dump_tensors else max_tokens) # Create the executor. executor = trtllm.Executor( @@ -32,7 +35,8 @@ if executor.can_enqueue_requests(): # Create the request. - request = trtllm.Request(input_token_ids=[1, 2, 3, 4], max_tokens=2) + request = trtllm.Request(input_token_ids=[1, 2, 3, 4], + max_tokens=max_tokens) # Enqueue the request. request_id = executor.enqueue_request(request) @@ -44,9 +48,20 @@ # Print tokens. print(output_tokens) - print("debug tensors:") - debug_dir = pl.Path("/tmp/tllm_debug/PP_1/TP_1") - for iter_dir in [x for x in debug_dir.iterdir() if x.is_dir()]: - print(iter_dir.name) - for file in [x for x in iter_dir.iterdir() if x.is_file()]: - print(file.name, np.load(file)) + if args.dump_tensors: + print("debug tensors from files:") + debug_dir = pl.Path("/tmp/tllm_debug/PP_1/TP_1") + if debug_dir.is_dir(): + for iter_dir in [x for x in debug_dir.iterdir() if x.is_dir()]: + print(iter_dir.name) + for file in [x for x in iter_dir.iterdir() if x.is_file()]: + print(file.name, np.load(file)) + else: + print("debug dir not found") + else: + print("debug tensors from queue:") + debug_tensors = executor.get_latest_debug_tensors() + for debug_iter in debug_tensors: + print(f"iteration {debug_iter.iter}") + for [name, tensor] in debug_iter.debug_tensors.items(): + print(name, tensor) diff --git a/examples/bloom/requirements.txt b/examples/bloom/requirements.txt index b795b5ca4..d4adbc087 100644 --- a/examples/bloom/requirements.txt +++ b/examples/bloom/requirements.txt @@ -1,6 +1,6 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.13.0 +tensorrt_llm==0.14.0 datasets~=2.14.5 evaluate~=0.4.1 rouge_score~=0.1.2 -sentencepiece~=0.1.99 +sentencepiece>=0.1.99 diff --git a/examples/chatglm/requirements.txt b/examples/chatglm/requirements.txt index 231831897..aebac704a 100644 --- a/examples/chatglm/requirements.txt +++ b/examples/chatglm/requirements.txt @@ -1,8 +1,10 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.13.0 +tensorrt_llm==0.14.0 datasets~=2.14.5 evaluate~=0.4.1 protobuf rouge_score~=0.1.2 sentencepiece tiktoken +# https://github.com/THUDM/ChatGLM3/issues/1324 +transformers<=4.43.0 diff --git a/examples/dbrx/convert_checkpoint.py b/examples/dbrx/convert_checkpoint.py index 41e19b309..2009bbac7 100644 --- a/examples/dbrx/convert_checkpoint.py +++ b/examples/dbrx/convert_checkpoint.py @@ -545,6 +545,7 @@ def execute(workers, func, hf_model): kv_cache_quant_algo = QuantAlgo.INT8 hf_config = None + if args.model_dir is not None: hf_config = AutoConfig.from_pretrained(args.model_dir, trust_remote_code=True) @@ -563,8 +564,10 @@ def 
execute(workers, func, hf_model):
         args.clip_qkv = hf_config.attn_config.clip_qkv
         args.hidden_act = 'swiglu'
         args.rotary_base = hf_config.attn_config.rope_theta
-        args.moe_config = MoeConfig(args.moe_num_experts, args.moe_top_k,
-                                    args.moe_renorm_mode).validate()
+        args.moe_config = MoeConfig(
+            num_experts=args.moe_num_experts,
+            top_k=args.moe_top_k,
+            normalization_mode=args.moe_renorm_mode).validate()
     config = {
         'architecture': hf_config.architectures[0],
         'dtype': args.dtype,
diff --git a/examples/dbrx/requirements.txt b/examples/dbrx/requirements.txt
index f4b32b625..c0790e6de 100644
--- a/examples/dbrx/requirements.txt
+++ b/examples/dbrx/requirements.txt
@@ -1,5 +1,5 @@
 --extra-index-url https://pypi.nvidia.com
-tensorrt_llm==0.13.0
+tensorrt_llm==0.14.0
 datasets~=2.14.5
 evaluate~=0.4.1
 rouge_score~=0.1.2
diff --git a/examples/deepseek_v1/README.md b/examples/deepseek_v1/README.md
new file mode 100644
index 000000000..6bca80df1
--- /dev/null
+++ b/examples/deepseek_v1/README.md
@@ -0,0 +1,77 @@
+# Deepseek-v1
+
+This document shows how to build and run the [deepseek-v1](https://arxiv.org/pdf/2401.06066) model in TensorRT-LLM.
+
+- [Deepseek-v1](#deepseek-v1)
+  - [Prerequisite](#prerequisite)
+  - [Hardware](#hardware)
+  - [Overview](#overview)
+  - [Support Matrix](#support-matrix)
+  - [Usage](#usage)
+    - [Build TensorRT engine(s)](#build-tensorrt-engines)
+
+## Prerequisite
+
+First, please download the Deepseek-v1 weights from HF: https://huggingface.co/deepseek-ai/deepseek-moe-16b-base.
+
+```bash
+git lfs install
+git clone https://huggingface.co/deepseek-ai/deepseek-moe-16b-base
+```
+
+## Hardware
+
+The Deepseek-v1 model requires a single GPU with 80 GB of memory.
+
+## Overview
+
+The TensorRT-LLM Deepseek-v1 implementation can be found in [tensorrt_llm/models/deepseek_v1/model.py](../../tensorrt_llm/models/deepseek_v1/model.py). The TensorRT-LLM Deepseek-v1 example code is located in [`examples/deepseek_v1`](./). There is one main file:
+
+* [`convert_checkpoint.py`](./convert_checkpoint.py) to convert the Deepseek-v1 model into the TensorRT-LLM checkpoint format.
+
+In addition, there are three shared files in the parent folder [`examples`](../) that can be used for inference and evaluation:
+
+* [`../run.py`](../run.py) to run inference with the model and print the output for a given input text.
+* [`../summarize.py`](../summarize.py) to summarize articles from the [cnn_dailymail](https://huggingface.co/datasets/cnn_dailymail) dataset; it can run summarization with both the HF model and the TensorRT-LLM model.
+* [`../mmlu.py`](../mmlu.py) to run the scoring script from https://github.com/declare-lab/instruct-eval and compare the HF model and the TensorRT-LLM model on the MMLU dataset.
+
+## Support Matrix
+
+- [x] FP16
+- [x] TENSOR PARALLEL
+- [ ] FP8
+
+## Usage
+
+The TensorRT-LLM Deepseek-v1 example code is located at [examples/deepseek_v1](./). It takes PyTorch weights as input and builds the corresponding TensorRT engines. The number of TensorRT engines depends on the number of GPUs used to run inference.
+
+### Build TensorRT engine(s)
+
+Below are the step-by-step instructions to run Deepseek-v1 with TensorRT-LLM.
+
+First, the checkpoint is converted to the TensorRT-LLM checkpoint format by applying [`convert_checkpoint.py`](./convert_checkpoint.py). After that, the TensorRT engine(s) can be built from the TensorRT-LLM checkpoint.
+
+```bash
+# Build the bfloat16 engine from Deepseek-v1 HF weights.
+python convert_checkpoint.py --model_dir ./deepseek_moe_16b/ \ + --output_dir ./trtllm_checkpoint_deepseek_v1_1gpu_bf16 \ + --dtype bfloat16 \ + --tp_size 1 +trtllm-build --checkpoint_dir ./trtllm_checkpoint_deepseek_v1_1gpu_bf16 \ + --output_dir ./trtllm_engines/deepseek_v1/bf16/tp1 \ + --gpt_attention_plugin bfloat16 \ + --gemm_plugin bfloat16 \ + --moe_plugin bfloat16 \ +``` + +Then, test the engine with [run.py](../run.py) script: + +```bash +python ../run.py --engine_dir ./trtllm_engines/deepseek_v1/bf16/tp1 \ + --tokenizer_dir ./deepseek_moe_16b/ \ + --max_output_len 32 \ + --top_p 0 \ + --input_text "The president of the United States is person who" +``` +## Credits +This Deepseek-v1 model example exists thanks to @akhoroshev(https://github.com/akhoroshev) community contribution! diff --git a/tensorrt_llm/models/deci/__init__.py b/examples/deepseek_v1/__init__.py similarity index 100% rename from tensorrt_llm/models/deci/__init__.py rename to examples/deepseek_v1/__init__.py diff --git a/examples/deepseek_v1/convert_checkpoint.py b/examples/deepseek_v1/convert_checkpoint.py new file mode 100644 index 000000000..b76ea1b18 --- /dev/null +++ b/examples/deepseek_v1/convert_checkpoint.py @@ -0,0 +1,215 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import argparse +import os +import time +import traceback +from concurrent.futures import ThreadPoolExecutor, as_completed + +import tensorrt_llm +from tensorrt_llm._utils import release_gc +from tensorrt_llm.layers import MoeConfig +from tensorrt_llm.mapping import Mapping +from tensorrt_llm.models import DeepseekForCausalLM +from tensorrt_llm.models.deepseek_v1.convert import load_hf_deepseek + + +def parse_arguments(): + parser = argparse.ArgumentParser() + parser.add_argument('--model_dir', type=str, default=None, required=True) + parser.add_argument('--tp_size', + type=int, + default=1, + help='N-way tensor parallelism size') + parser.add_argument('--pp_size', + type=int, + default=1, + help='N-way pipeline parallelism size') + parser.add_argument( + '--moe_tp_size', + type=int, + default=-1, + help= + 'N-way tensor parallelism size for MoE, default is tp_size, which will do tp-only for MoE' + ) + parser.add_argument( + '--moe_ep_size', + type=int, + default=-1, + help= + 'N-way expert parallelism size for MoE, default is 1, which will do tp-only for MoE' + ) + parser.add_argument('--dtype', + type=str, + default='float16', + choices=['float32', 'bfloat16', 'float16']) + parser.add_argument( + '--use_parallel_embedding', + action="store_true", + default=False, + help= + 'By default embedding parallelism is disabled. 
By setting this flag, embedding parallelism is enabled' + ) + parser.add_argument( + '--embedding_sharding_dim', + type=int, + default=0, + choices=[0, 1], + help= + 'By default the embedding lookup table is sharded along vocab dimension (embedding_sharding_dim=0)' + 'To shard it along hidden dimension, set embedding_sharding_dim=1' + 'Note: embedding sharing is only enabled when embedding_sharding_dim=0') + parser.add_argument( + '--use_embedding_sharing', + action="store_true", + default=False, + help= + 'Try to reduce the engine size by sharing the embedding lookup table between two layers' + 'Note: the flag might not take effect when the criteria are not met') + parser.add_argument('--output_dir', + type=str, + default='trtllm_checkpoint', + required=True, + help='The path to save the TensorRT-LLM checkpoint') + parser.add_argument( + '--workers', + type=int, + default=1, + help='The number of workers for converting checkpoint in parallel') + parser.add_argument( + '--moe_num_experts', + type=int, + default=0, + help='Specify the number of experts to use for MOE layers') + parser.add_argument( + '--moe_top_k', + type=int, + default=0, + help= + 'Specify the top_k value to use for MOE layers. Default to 1 if --moe_num_experts is set' + ) + parser.add_argument( + '--moe_renorm_mode', + type=int, + default=MoeConfig.ExpertScaleNormalizationMode.RENORMALIZE, + help= + 'Controls renormalization after gate logits. Check layers/moe.py for accepted values' + ) + parser.add_argument( + '--save_config_only', + action="store_true", + default=False, + help= + 'Only save the model config w/o read and converting weights, be careful, this is for debug only' + ) + parser.add_argument( + '--disable_weight_only_quant_plugin', + default=False, + action="store_true", + help= + 'By default, using plugin implementation for weight quantization. Enabling disable_weight_only_quant_plugin flag will use ootb implementation instead of plugin.' + 'You must also use --use_weight_only for that argument to have an impact' + ) + # Add quantization related feature later + args = parser.parse_args() + + return args + + +def args_to_build_options(args): + return { + 'use_parallel_embedding': args.use_parallel_embedding, + 'embedding_sharding_dim': args.embedding_sharding_dim, + 'share_embedding_table': args.use_embedding_sharing, + 'disable_weight_only_quant_plugin': + args.disable_weight_only_quant_plugin + } + + +def execute(workers, func, args): + if workers == 1: + for rank, f in enumerate(func): + f(args, rank) + else: + with ThreadPoolExecutor(max_workers=workers) as p: + futures = [p.submit(f, args, rank) for rank, f in enumerate(func)] + exceptions = [] + for future in as_completed(futures): + try: + future.result() + except Exception as e: + traceback.print_exc() + exceptions.append(e) + assert len( + exceptions + ) == 0, "Checkpoint conversion failed, please check error log." + + +def convert_and_save_hf(args): + model_dir = args.model_dir + world_size = args.tp_size * args.pp_size + # Need to convert the cli args to the kay-value pairs and override them in the generate config dict. + # Ideally these fields will be moved out of the config and pass them into build API, keep them here for compatibility purpose for now, + # before the refactor is done. 
+    override_fields = {}
+    override_fields.update(args_to_build_options(args))
+
+    hf_model = load_hf_deepseek(model_dir)
+
+    def convert_and_save_rank(args, rank):
+        mapping = Mapping(world_size=world_size,
+                          rank=rank,
+                          tp_size=args.tp_size,
+                          pp_size=args.pp_size,
+                          moe_tp_size=args.moe_tp_size,
+                          moe_ep_size=args.moe_ep_size)
+        deepseekv1 = DeepseekForCausalLM.from_hugging_face(
+            hf_model, args.model_dir, args.dtype, mapping, **override_fields)
+        deepseekv1.save_checkpoint(args.output_dir, save_config=(rank == 0))
+        del deepseekv1
+
+    execute(args.workers, [convert_and_save_rank] * world_size, args)
+    release_gc()
+
+
+def main():
+    print(tensorrt_llm.__version__)
+    args = parse_arguments()
+
+    args.tp_size * args.pp_size
+    if (args.moe_tp_size == -1 and args.moe_ep_size == -1):
+        # moe default to tp-only
+        args.moe_tp_size = args.tp_size
+        args.moe_ep_size = 1
+    elif (args.moe_tp_size == -1):
+        args.moe_tp_size = args.tp_size // args.moe_ep_size
+    elif (args.moe_ep_size == -1):
+        args.moe_ep_size = args.tp_size // args.moe_tp_size
+    assert (args.moe_tp_size * args.moe_ep_size == args.tp_size
+            ), "moe_tp_size * moe_ep_size must equal to tp_size"
+
+    tik = time.time()
+    if not os.path.exists(args.output_dir):
+        os.makedirs(args.output_dir)
+    assert args.model_dir is not None
+    convert_and_save_hf(args)
+
+    tok = time.time()
+    t = time.strftime('%H:%M:%S', time.gmtime(tok - tik))
+    print(f'Total time of converting checkpoints: {t}')
+
+
+if __name__ == '__main__':
+    main()
diff --git a/examples/deepseek_v1/requirements.txt b/examples/deepseek_v1/requirements.txt
new file mode 100644
index 000000000..0ad9cdb1e
--- /dev/null
+++ b/examples/deepseek_v1/requirements.txt
@@ -0,0 +1,5 @@
+--extra-index-url https://pypi.nvidia.com
+tensorrt_llm==0.11.0
+datasets~=2.14.6
+evaluate~=0.4.1
+rouge_score~=0.1.2
diff --git a/examples/draft_target_model/README.md b/examples/draft_target_model/README.md
new file mode 100644
index 000000000..debf1cf3b
--- /dev/null
+++ b/examples/draft_target_model/README.md
@@ -0,0 +1,86 @@
+# Draft-Target-Model Speculative Decoding
+
+This document shows how to build and run a model using Draft-Target-Model speculative decoding (also known as `Speculative-Sampling`, [`Paper`](https://arxiv.org/abs/2302.01318)) in TensorRT-LLM on a single GPU, or on a single node with multiple GPUs.
+
+## Overview
+
+The Draft-Target-Model approach involves two distinct models trained independently but sharing the same vocabulary: a smaller Draft model and a larger Target model. For example, GPT 125M / 6.7B models can serve as the Draft / Target model.
+
+There are currently two ways to use Draft-Target-Model in TensorRT-LLM. The first is through TensorRT-LLM-BLS in Triton; more information and detailed steps can be found in the [speculative decoding documentation](../../docs/source/speculative_decoding.md). The second is to use it directly in TensorRT-LLM; the steps are described in this document and the code can be found in [examples/run.py](../run.py).
+
+Draft-Target-Model has 4 additional hyperparameters that you need to specify to control the process of generation:
+- `draft_len`: the number of tokens the draft model generates in one iteration, typically in the range of 4 to 10. Empirically, a larger value yields a higher acceptance ratio but also higher overhead, so the right balance needs to be found for the given models and application scenario.
+- `draft_model_device_list`: the index list of device(s) to run the draft model. 
Its length must match the TP size of the draft model engine. For instance, `draft_model_device_list=[1]` means using tp_size=1 and GPU 1 for the draft model, and `draft_model_device_list=[4,5,6,7]` means using tp_size=4 and GPUs 4 to 7 for the draft model.
+- `target_model_device_list`: the index list of device(s) to run the target model. Its length must match the TP size of the target model engine. For instance, `target_model_device_list=[0]` means using tp_size=1 and GPU 0 for the target model, and `target_model_device_list=[2,3]` means using tp_size=2 and GPUs 2 to 3 for the target model.
+- `use_logits`: there are two methods to accept tokens proposed by the draft model. When `use_logits=True`, the draft tokens are accepted based on the ratio of the logits from the draft and target models (the modified rejection sampling method from the original paper); when `use_logits=False`, the draft tokens are accepted based on a per-token comparison with the target predictions, regardless of the logits.
+
+## Support Matrix
+  * GPU Compute Capability >= 8.0 (Ampere or newer)
+  * FP16 / BF16 / FP8 (both draft and target model)
+  * Paged KV Cache
+  * Tensor Parallel
+
+## Usage
+
+### Build draft and target engines
+
++ We use open-source `llama-v2-7B/13B` models as the draft and target models in this example.
++ `--use_paged_context_fmha=enable` must be specified since we need KV cache reuse for the draft / target model.
++ `--gather_generation_logits` is optional. In the original paper, tokens are accepted by comparing the logits of the draft and target models, so this parameter is needed. For simplicity, tokens can instead be accepted by comparing the output tokens directly, in which case this parameter can be skipped.
++ `--speculative_decoding_mode=draft_tokens_external` and `--max_draft_len` must be specified for the target model.
+
+```bash
+cd examples/llama
+
+python3 convert_checkpoint.py \
+    --model_dir= \
+    --output_dir=./ckpt-draft \
+    --dtype=float16
+
+python3 convert_checkpoint.py \
+    --model_dir= \
+    --output_dir=./ckpt-target \
+    --dtype=float16
+
+trtllm-build \
+    --checkpoint_dir ./ckpt-draft \
+    --output_dir=./draft-engine \
+    --gemm_plugin=float16 \
+    --use_paged_context_fmha=enable \
+    --gather_generation_logits \
+    --max_batch_size=4 \
+    --max_input_len=3200 \
+    --max_seq_len=4800
+
+trtllm-build \
+    --checkpoint_dir=./ckpt-target \
+    --output_dir=./target-engine \
+    --gemm_plugin=float16 \
+    --use_paged_context_fmha=enable \
+    --gather_generation_logits \
+    --speculative_decoding_mode=draft_tokens_external \
+    --max_draft_len=10 \
+    --max_batch_size=4 \
+    --max_input_len=3200 \
+    --max_seq_len=4800
+```
+
+### Run decoding
+
++ `--draft_engine_dir` and `--engine_dir` must be specified for the draft and target engines.
++ `--draft_target_model_config` is the corresponding Draft-Target-Model configuration; its definition can be found in [util.py](../util.py).
+  + As an example, `[4,[0],[1],False]` means `draft_len=4`, the draft model runs on `GPU0`, the target model runs on `GPU1`, and tokens (rather than logits) are used for acceptance.
++ Only the C++ session (using the executor as the low-level API) is supported; the Python session (`--use_py_session`) is not supported.
+
+```bash
+cd examples/llama
+
+python3 ../run.py \
+    --tokenizer_dir gpt2-medium \
+    --draft_engine_dir ./draft-engine \
+    --engine_dir ./target-engine \
+    --draft_target_model_config="[4,[0],[1],True]" \
+    --kv_cache_free_gpu_memory_fraction=0.4 \
+    --max_output_len=256 \
+    --input_text="How does Draft-Sampling work?" 
+``` diff --git a/examples/draft_target_model/requirements.txt b/examples/draft_target_model/requirements.txt new file mode 100644 index 000000000..179f92d66 --- /dev/null +++ b/examples/draft_target_model/requirements.txt @@ -0,0 +1,6 @@ +--extra-index-url https://pypi.nvidia.com +tensorrt_llm==0.14.0 +datasets~=2.14.5 +rouge_score~=0.1.2 +sentencepiece>=0.1.99 +evaluate~=0.4.1 diff --git a/examples/enc_dec/README.md b/examples/enc_dec/README.md index dfe68ed90..509de1255 100644 --- a/examples/enc_dec/README.md +++ b/examples/enc_dec/README.md @@ -219,6 +219,8 @@ For good usability, Python binding of the C++ runtime is provided. You can use t python3 ../run.py --engine_dir tmp/trt_engines/${MODEL_NAME}/${INFERENCE_PRECISION} --tokenizer_dir tmp/hf_models/${MODEL_NAME} --max_output_len 64 --num_beams=1 --input_text "translate English to German: The house is wonderful." ``` +You can specify `--kv_cache_free_gpu_memory_fraction` to control the percentage of free GPU memory to be used by KV cache (by default 0.9), and `--cross_kv_cache_fraction` to control the percentage of KV cache to be used by cross attention (by default 0.5, and rest of the KV cache will be used by self attention). + For pure C++ runtime, there is no example given yet. Please check the [`Executor`](../../cpp/include/tensorrt_llm/executor/executor.h) API to implement your own end-to-end workflow. It is highly recommended to leverage more encapsulated solutions such as the above C++ Python binding or [Triton backend](https://github.com/triton-inference-server/tensorrtllm_backend). #### Run with Triton Backend diff --git a/examples/falcon/requirements.txt b/examples/falcon/requirements.txt index 83ba1d6a5..cf320cc9d 100644 --- a/examples/falcon/requirements.txt +++ b/examples/falcon/requirements.txt @@ -1,8 +1,8 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.13.0 +tensorrt_llm==0.14.0 transformers>=4.31.0 datasets~=2.14.5 evaluate~=0.4.1 rouge_score~=0.1.2 -sentencepiece~=0.1.99 +sentencepiece>=0.1.99 tqdm diff --git a/examples/gemma/requirements.txt b/examples/gemma/requirements.txt index a296aa5b6..d1e107a9b 100644 --- a/examples/gemma/requirements.txt +++ b/examples/gemma/requirements.txt @@ -3,12 +3,12 @@ # WAR the new posting of "nvidia-cudnn-cu12~=9.0". # "jax[cuda12_pip]~=0.4.19" specifies "nvidia-cudnn-cu12>=8.9" but actually requires "nvidia-cudnn-cu12~=8.9". nvidia-cudnn-cu12~=8.9; platform_machine == "x86_64" -tensorrt_llm==0.13.0 +tensorrt_llm==0.14.0 flax~=0.8.0 # jax[cuda12_pip]~=0.4.19; platform_system != "Windows" jax~=0.4.19; platform_system == "Windows" safetensors~=0.4.1 -sentencepiece~=0.1.99 +sentencepiece>=0.1.99 h5py~=3.10.0 rouge_score nltk diff --git a/examples/gpt/README.md b/examples/gpt/README.md index 2c9699096..7b0b65507 100644 --- a/examples/gpt/README.md +++ b/examples/gpt/README.md @@ -425,8 +425,7 @@ Then, use `trtllm-build` to build engine(s). 
```bash trtllm-build --checkpoint_dir starcoder2/trt_ckpt/int8-sq/ \ - --output_dir starcoder2/trt_engine/int8-sq/ \ - --builder_opt 4 + --output_dir starcoder2/trt_engine/int8-sq/ ``` diff --git a/examples/gpt/requirements.txt b/examples/gpt/requirements.txt index 30e41f940..2714c5b5c 100644 --- a/examples/gpt/requirements.txt +++ b/examples/gpt/requirements.txt @@ -1,6 +1,6 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.13.0 +tensorrt_llm==0.14.0 datasets~=2.14.5 evaluate~=0.4.1 rouge_score~=0.1.2 -SentencePiece~=0.1.99 +SentencePiece>=0.1.99 diff --git a/examples/gptj/requirements.txt b/examples/gptj/requirements.txt index d7bf43bed..80c4473aa 100644 --- a/examples/gptj/requirements.txt +++ b/examples/gptj/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.13.0 +tensorrt_llm==0.14.0 datasets~=2.14.5 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git a/examples/gptneox/requirements.txt b/examples/gptneox/requirements.txt index b97a55fdb..4700fd4eb 100644 --- a/examples/gptneox/requirements.txt +++ b/examples/gptneox/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.13.0 +tensorrt_llm==0.14.0 datasets~=2.14.5 rouge_score~=0.1.2 evaluate~=0.4.1 diff --git a/examples/grok/requirements.txt b/examples/grok/requirements.txt index 3c4c9a922..66889c9a3 100644 --- a/examples/grok/requirements.txt +++ b/examples/grok/requirements.txt @@ -1,6 +1,6 @@ -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.13.0 +tensorrt_llm==0.14.0 datasets==2.14.6 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git a/examples/internlm/requirements.txt b/examples/internlm/requirements.txt index 1a932da58..9b782d0f9 100644 --- a/examples/internlm/requirements.txt +++ b/examples/internlm/requirements.txt @@ -1,6 +1,6 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.13.0 +tensorrt_llm==0.14.0 datasets==2.14.5 rouge_score~=0.1.2 -sentencepiece~=0.1.99 +sentencepiece>=0.1.99 evaluate~=0.4.1 diff --git a/examples/jais/requirements.txt b/examples/jais/requirements.txt index 30e41f940..2714c5b5c 100644 --- a/examples/jais/requirements.txt +++ b/examples/jais/requirements.txt @@ -1,6 +1,6 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.13.0 +tensorrt_llm==0.14.0 datasets~=2.14.5 evaluate~=0.4.1 rouge_score~=0.1.2 -SentencePiece~=0.1.99 +SentencePiece>=0.1.99 diff --git a/examples/llama/README.md b/examples/llama/README.md index 7c3ca77b7..b21c7f370 100644 --- a/examples/llama/README.md +++ b/examples/llama/README.md @@ -67,7 +67,7 @@ The TensorRT-LLM LLaMA example code locates at [examples/llama](./). It takes HF Please install required packages first to make sure the example uses matched `tensorrt_llm` version: ```bash -pip install -r requirements.txt +pip install --upgrade -r requirements.txt ``` Need to prepare the HF LLaMA checkpoint by following the guides here https://huggingface.co/docs/transformers/main/en/model_doc/llama. @@ -717,16 +717,19 @@ To run the GPTQ LLaMa example, the following steps are required: 1. 
Weight quantization: - Quantized weights for GPTQ are generated using [GPTQ-for-LLaMa](https://github.com/qwopqwop200/GPTQ-for-LLaMa.git) as follow: + Quantized weights for GPTQ are generated using [AutoGPTQ](https://github.com/AutoGPTQ/AutoGPTQ) as follow: ```bash - git clone https://github.com/qwopqwop200/GPTQ-for-LLaMa.git - cd GPTQ-for-LLaMa - pip install -r requirements.txt + git clone https://github.com/AutoGPTQ/AutoGPTQ + cd AutoGPTQ + pip install . + + # Download the quant_autogptq script + wget https://gist.githubusercontent.com/TheBloke/b47c50a70dd4fe653f64a12928286682/raw/ebcee019d90a178ee2e6a8107fdd7602c8f1192a/quant_autogptq.py # Quantize weights into INT4 and save as safetensors # Quantized weight with parameter "--act-order" is not supported in TRT-LLM - python llama.py ./tmp/llama/7B/ c4 --wbits 4 --true-sequential --groupsize 128 --save_safetensors ./llama-7b-4bit-gs128.safetensors + python quant_autogptq.py ./tmp/llama/7B ./llama-7b-4bit-gs128.safetensors wikitext --bits 4 --group_size 128 --desc_act 0 --damp 0.1 --dtype float16 --seqlen 4096 --num_samples 3 --use_fast ``` Let us build the TRT-LLM engine with the saved `./llama-7b-4bit-gs128.safetensors`. @@ -907,6 +910,24 @@ trtllm-build --checkpoint_dir ./tllm_checkpoint_1gpu_codellama \ --output_dir ./tmp/codellama/trt_engines/fp16/1-gpu/ \ --gemm_plugin auto ``` +The example below uses the NVIDIA ModelOpt (AlgorithMic Model Optimization) toolkit for the model quantization process. +First make sure Modelopt toolkit is installed (see [examples/quantization/README.md](/examples/quantization/README.md#preparation)) + +```bash +# Quantize HF CodeLlama 7B into FP8 and export trtllm checkpoint +python ../quantization/quantize.py --model_dir /tmp/CodeLlama-7b-Instruct-hf \ + --dtype float16 \ + --qformat fp8 \ + --kv_cache_dtype fp8 \ + --output_dir ./tllm_checkpoint_1gpu_fp8 \ + --calib_size 512 + +# Build trtllm engines from the trtllm checkpoint +trtllm-build --checkpoint_dir ./tllm_checkpoint_1gpu_fp8 \ + --output_dir ./engine_outputs \ + --gemm_plugin auto +``` + Use the following command to build `CodeLlama-34b-Instruct` for 4 GPUs (TP=4): ```bash python convert_checkpoint.py --model_dir /tmp/CodeLlama-34b-Instruct-hf \ diff --git a/examples/llama/convert_checkpoint.py b/examples/llama/convert_checkpoint.py index 17034568f..6443f8513 100644 --- a/examples/llama/convert_checkpoint.py +++ b/examples/llama/convert_checkpoint.py @@ -91,6 +91,19 @@ def parse_arguments(): help= "The huggingface dataset name or the local directory of the dataset for calibration." ) + parser.add_argument( + "--calib_size", + type=int, + default=512, + help= + "Number of samples for calibration. 
Set to -1 to use the whole dataset.", + ) + parser.add_argument( + "--calib_max_seq_length", + type=int, + default=512, + help="Max Sequence length for calibration", + ) parser.add_argument( "--smoothquant", "-sq", @@ -408,6 +421,8 @@ def convert_and_save_hf(args): quant_config=quant_config, device='cpu' if args.load_model_on_cpu else 'cuda', calib_dataset=args.calib_dataset, + calib_batches=args.calib_size, + calib_max_seq_length=args.calib_max_seq_length, **override_fields) else: # When not loading by shard, preload one complete model and then slice per rank weights from this diff --git a/examples/llama/requirements.txt b/examples/llama/requirements.txt index 30dd11d63..8cb0b9096 100644 --- a/examples/llama/requirements.txt +++ b/examples/llama/requirements.txt @@ -1,6 +1,7 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.13.0 +tensorrt_llm==0.14.0 +transformers>=4.43.0 datasets==2.14.6 evaluate~=0.4.1 rouge_score~=0.1.2 -sentencepiece~=0.1.99 +sentencepiece>=0.1.99 diff --git a/examples/llm-api/README.md b/examples/llm-api/README.md index 1dbba4f28..7e180d3af 100644 --- a/examples/llm-api/README.md +++ b/examples/llm-api/README.md @@ -1,329 +1,3 @@ -# High-level API -We are working on a Python high-level API(HLAPI) for LLM workflow, which is still in incubation and may change later. -Here we show you a preview of how it works and how to use it. +# LLM API Examples -Note that the APIs are not stable and we appreciate your patience and understanding as we improve this API. - -## HLAPI Supported Model -* LLaMA (including variants Mistral, Mixtral, InternLM) -* GPT (including variants Starcoder-1/2, Santacoder) -* Gemma-1/2 -* Phi-1/2/3 -* ChatGLM (including variants glm-10b, chatglm, chatglm2, chatglm3, glm4) -* QWen-1/1.5/2 -* Falcon -* Baichuan-1/2 -* GPT-J - -## Quick start - -Please install the required packages first: - -```bash -pip install -r requirements.txt -``` - -Here is a simple example to show how to use the HLAPI: - -Firstly, import the `LLM` and `SamplingParams` from the `tensorrt_llm` package, and create an LLM object with a HuggingFace (HF) model directly. Here we use the TinyLlama model as an example, `LLM` will download the model from the HuggingFace model hub automatically. You can also specify local models, either in HF format, TensorRT-LLM engine format or TensorRT-LLM checkpoint format. - -```python -from tensorrt_llm import LLM, SamplingParams - -llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0") -``` - -Secondly, generate text with the `generate` method of the `LLM` object directly with a batch of prompts, the `sampling_params` is optional, and you can customize the sampling strategy with it. - -```python -prompts = [ - "Hello, my name is", - "The president of the United States is", - "The capital of France is", - "The future of AI is", -] - -sampling_params = SamplingParams(temperature=0.8, top_p=0.95) - -outputs = llm.generate(prompts, sampling_params) - -# Print the outputs. -for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") -``` - -Please refer to the [LLM quickstart](./quickstart_example.py) for the complete example. - -## Examples - -You can refer to the scripts in the current directory for all of the examples. - -## Model preparation -The `LLM` class supports four kinds of model inputs: - -1. **HuggingFace model name**: triggers a download from the HuggingFace model hub, e.g. `TinyLlama/TinyLlama-1.1B-Chat-v1.0` in the quickstart. -1. 
**Local HuggingFace models**: uses a locally stored HuggingFace model. -2. **Local TensorRT-LLM engine**: built by `trtllm-build` tool or saved by the HLAPI -3. **Local TensorRT-LLM checkpoints**: converted by `convert_checkpoint.py` script in the examples - -All kinds of the model inputs can be seamlessly integrated with the HLAPI, and the `LLM(model=)` construcotr can accommodate models in any of the above formats. - -Let's delve into the preparation of the three kinds of local model formats. - -### Option 1: From HuggingFace models - -Given its popularity, the TRT-LLM HLAPI chooses to support HuggingFace format as one of the start points, to use the HLAPI on LLaMA models, you need to run the following conversion script provided in [transformers/llama](https://huggingface.co/docs/transformers/main/model_doc/llama) or [transformers/llama2](https://huggingface.co/docs/transformers/main/model_doc/llama2) to convert the Meta checkpoint to HuggingFace format. - -For instance, when targeting the LLaMA2 7B model, the official way to retrieve the model is to visit the [LLaMA2 model page](https://huggingface.co/docs/transformers/main/en/model_doc/llama2), normally you need to submit a request for the model file. - -To convert the checkpoint files, a script from transformers is required, thus please also clone the transformers repo with the following code: - -```sh -git clone https://github.com/huggingface/transformers.git -``` - -Finally, the command to convert the checkpoint files to HuggingFace format is as follows: - -``` sh -python /src/transformers/models/llama/convert_llama_weights_to_hf.py \ - --input_dir Llama-2-7b --model_size 7B --output_dir llama-hf-7b -``` - -That should produce a HuggingFace format model in `./llama-hf-7b`, which could be used by the HLAPI. - -### Option 2: From TensorRT-LLM engine -There are two ways to build the TensorRT-LLM engine: - -1. You can build the TensorRT-LLM engine from the HuggingFace model directly with the `trtllm-build` tool, and save the engine to disk for later use. Please consult the LLaMA's [README](../llama/README.md). -2. Use the HLAPI to save one: - -```python -llm = LLM() - -# Save engine to local disk -llm.save() -``` - -### Option 3: From TensorRT-LLM checkpoint -In each model example, there is a `convert_checkpoint.py` to convert third-party models to TensorRT-LLM checkpoint for further usage. -The HLAPI could seamlessly accept the checkpoint, and build the engine in the backend. -For step-by-step guidance on checkpoint conversion, please refer to the LLaMA's [README](../llama/README.md). - - -## Basic usage -To use the API, import the `LLM` from the `tensorrt_llm` package and create an LLM object with a HuggingFace model directly. -For example: - -``` python -from tensorrt_llm import LLM - -llm = LLM(model=) -``` - -It will trigger TRT-LLM engine building in the backend, and create a HuggingFace tokenizer by default to support an end-to-end generation. 
- -To generate text, use the `generate` method of the `LLM` object directly with a batch of prompts, for example: - -``` python -prompts = ["To tell a story"] -for output in llm.generate(prompts): - print(output) -``` - -The output might be something like: - -``` python -RequestOutput(request_id=2, prompt='To tell a story', prompt_token_ids=[1, 1763, 2649, 263, 5828], outputs=[CompletionOutput(index=0, text=', you need to have a beginning, a middle, and an end.\nThe beginning is the introduction of the characters and the setting.\nThe middle is', token_ids=[29892, 366, 817, 304, 505, 263, 6763, 29892, 263, 7256, 29892, 322, 385, 1095, 29889, 13, 1576, 6763, 338, 278, 18707, 310, 278, 4890, 322, 278, 4444, 29889, 13, 1576, 7256, 338], cumulative_logprob=None, logprobs=[])], finished=True) -``` - -You can also dump the runtime engine to disk, and load from the engine file directly in the next run to save the engine building time from the HuggingFace model. - -``` python -# dump the llm -llm.save() - -# next time -llm = LLM(model=) -``` - -In other words, the `model_dir` could accept either a HugggingFace model, a built TensorRT-LLM engine, or a TensorRT-LLM checkpoint, and the `LLM()` will do the rest work silently for end-to-end execution. - -## Quantization - -By simply setting several flags in the `LLM`, TensorRT-LLM can quantize the HuggingFace model automatically. For example, to perform an Int4 AWQ quantization, the following code will trigger the model quantization. - - -``` python -from tensorrt_llm.hlapi import QuantConfig, QuantAlgo - -quant_config = QuantConfig(quant_algo=QuantAlgo.W4A16_AWQ) - -llm = LLM(, quant_config=quant_config) -``` - -## Parallelism - -### Tensor Parallelism -It is easy to enable Tensor Parallelism in the HLAPI. For example, setting `parallel_config.tp_size=2` to perform a 2-way parallelism: - -```python -from tensorrt_llm.hlapi import LLM - -llm = LLM(, - tensor_parallel_size=2) -``` - -### Pipeline Parallelism -Similar to Tensor Parallelism, you can enable Pipeline Parallelism in the HLAPI with following code: - -```python -llm = LLM(, - pipeline_parallel_size=4) -``` - -### Automatic Parallelism (in preview) - -By simply enabling `auto_parallel` in the `LLM` class, TensorRT-LLM can parallelize the model automatically. For example, setting `world_size` to perform a 2-way parallelism: - -``` python -from tensorrt_llm import LLM - -llm = LLM(, auto_parallel=True, world_size=2) -``` - -## Generation -### `asyncio`-based generation -With the high-level API, you can also perform asynchronous generation with the `generate_async` method. For example: - -```python -llm = LLM(model=) - -async for output in llm.generate_async(, streaming=True): - print(output) -``` - -When the `streaming` flag is set to `True`, the `generate_async` method will return a generator that yields the token results as soon as they are available. Otherwise, it will return a generator that yields the final results only. - -### Future-style generation -The result of the `generate_async` method is a Future-like object, it doesn't block the thread unless the `.result()` is called. 
- -```python -# This will not block the main thread -generation = llm.generate_async() -# Do something else here -# call .result() to explicitly block the main thread and wait for the result when needed -output = generation.result() -``` - -The `.result()` method works like the [result](https://docs.python.org/zh-cn/3/library/asyncio-future.html#asyncio.Future.result) method in the Python Future, you can specify a timeout to wait for the result. - -```python -output = generation.result(timeout=10) -``` - -There is an async version, where the `.aresult()` is used. - -```python -generation = llm.generate_async() -output = await generation.aresult() -``` - -### Customizing sampling with `SamplingParams` -With SamplingParams, you can customize the sampling strategy, such as beam search, temperature, and so on. - -To enable beam search with a beam size of 4, set the `sampling_params` as follows: - -```python -from tensorrt_llm.hlapi import LLM, SamplingParams, BuildConfig - -build_config = BuildConfig() -build_config.max_beam_width = 4 - -llm = LLM(, build_config=build_config) -# Let the LLM object generate text with the default sampling strategy, or -# you can create a SamplingParams object as well with several fields set manually -sampling_params = SamplingParams(beam_width=4) # current limitation: beam_width should be equal to max_beam_width - -for output in llm.generate(, sampling_params=sampling_params): - print(output) -``` - -`SamplingParams` manages and dispatches fields to C++ classes including: -* [SamplingConfig](https://nvidia.github.io/TensorRT-LLM/_cpp_gen/runtime.html#_CPPv4N12tensorrt_llm7runtime14SamplingConfigE) -* [OutputConfig](https://nvidia.github.io/TensorRT-LLM/_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor12OutputConfigE) - -Please refer to these classes for more details. - -## LLM pipeline configuration - -### Build configuration -Apart from the arguments mentioned above, you can also customize the build configuration with the `build_config` class and other arguments borrowed from the lower-level APIs. For example: - -```python -llm = LLM(, - build_config=BuildConfig( - max_num_tokens=4096, - max_batch_size=128, - max_beam_width=4)) -``` - -### Runtime customization -Similar to `build_config`, you can also customize the runtime configuration with the `runtime_config`, `peft_cache_config` or other arguments borrowed from the lower-level APIs. For example: - - -```python -from tensorrt_llm.hlapi import LLM, KvCacheConfig - -llm = LLM(, - kv_cache_config=KvCacheConfig( - free_gpu_memory_fraction=0.8)) -``` - -### Tokenizer customization - -By default, the high-level API uses transformers’ `AutoTokenizer`. You can override it with your own tokenizer by passing it when creating the LLM object. For example: - -```python -llm = LLM(, tokenizer=) -``` - -The LLM() workflow should use your tokenizer instead. - -It is also possible to input token IDs directly without Tokenizers with the following code, note that the result will be also IDs without text since the tokenizer is not used. - -``` python -llm = LLM() - -for output in llm.generate([32, 12]): - ... -``` - -### Disabling tokenizer -For performance considerations, you can disable the tokenizer by passing `skip_tokenizer_init=True` when creating `LLM`. In this case, `LLM.generate` and `LLM.generate_async` will expect prompt token ids as input. 
For example: - -```python -llm = LLM() -for output in llm.generate([[32, 12]]): - print(output) -``` - -You will get something like: -```python -RequestOutput(request_id=1, prompt=None, prompt_token_ids=[1, 15043, 29892, 590, 1024, 338], outputs=[CompletionOutput(index=0, text='', token_ids=[518, 10858, 4408, 29962, 322, 306, 626, 263, 518, 10858, 20627, 29962, 472, 518, 10858, 6938, 1822, 306, 626, 5007, 304, 4653, 590, 4066, 297, 278, 518, 11947, 18527, 29962, 2602, 472], cumulative_logprob=None, logprobs=[])], finished=True) -``` - -Note that the `text` field in `CompletionOutput` is empty since the tokenizer is deactivated. - -### Build caching -Although the HLAPI runs the engine building in the background, you can also cache the built engine to disk and load it in the next run to save the engine building time. - -To enable the build cache, there are two ways to do it: - -1. Use the environment variable: `export TLLM_HLAPI_BUILD_CACHE=1` to enable the build cache globally, and optionally export `TLLM_HLAPI_BUILD_CACHE_ROOT` to specify the cache root directory. -2. Pass the `enable_build_cache` to the `LLM` constructor - -The build cache will reuse the built engine if all the building settings are the same, or it will rebuild the engine. - -NOTE: The build cache monitors the model path and build settings, if you change the weights while keeping the same model path, the build cache will not detect the change and reuse the old engine. +Please refer to the [official documentation](https://nvidia.github.io/TensorRT-LLM/llm-api/) and [examples](https://nvidia.github.io/TensorRT-LLM/llm-api-examples/) for detailed information and usage guidelines regarding the LLM API. diff --git a/examples/llm-api/llm_generate.py b/examples/llm-api/llm_inference.py similarity index 100% rename from examples/llm-api/llm_generate.py rename to examples/llm-api/llm_inference.py diff --git a/examples/llm-api/llm_generate_async.py b/examples/llm-api/llm_inference_async.py similarity index 100% rename from examples/llm-api/llm_generate_async.py rename to examples/llm-api/llm_inference_async.py diff --git a/examples/llm-api/llm_generate_async_streaming.py b/examples/llm-api/llm_inference_async_streaming.py similarity index 100% rename from examples/llm-api/llm_generate_async_streaming.py rename to examples/llm-api/llm_inference_async_streaming.py diff --git a/examples/llm-api/llm_inference_customize.py b/examples/llm-api/llm_inference_customize.py new file mode 100644 index 000000000..405a3aa40 --- /dev/null +++ b/examples/llm-api/llm_inference_customize.py @@ -0,0 +1,47 @@ +### Generate text +import tempfile + +from tensorrt_llm.hlapi import LLM, BuildConfig, KvCacheConfig, SamplingParams + +# The end user can customize the build configuration with the build_config class and other arguments borrowed from the lower-level APIs +build_config = BuildConfig() +build_config.max_batch_size = 128 +build_config.max_num_tokens = 2048 + +build_config.max_beam_width = 4 + +# Model could accept HF model name or a path to local HF model. + +llm = LLM( + model="TinyLlama/TinyLlama-1.1B-Chat-v1.0", + build_config=build_config, + kv_cache_config=KvCacheConfig( + free_gpu_memory_fraction=0.8 + ), # Similar to `build_config`, you can also customize the runtime configuration with the `kv_cache_config`, `runtime_config`, `peft_cache_config` or \ + # other arguments borrowed from the lower-level APIs. +) + +# You can save the engine to disk and load it back later, the LLM class can accept either a HF model or a TRT-LLM engine. 
+llm.save(tempfile.mkdtemp()) + +# Sample prompts. +prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", +] + +# With SamplingParams, you can customize the sampling strategy, such as beam search, temperature, and so on. +sampling_params = SamplingParams(temperature=0.8, top_p=0.95, beam_width=4) + +for output in llm.generate(prompts, sampling_params): + print( + f"Prompt: {output.prompt!r}, Generated text: {output.outputs[0].text!r}" + ) + +# Got output like +# Prompt: 'Hello, my name is', Generated text: '\n\nJane Smith. I am a student pursuing my degree in Computer Science at [university]. I enjoy learning new things, especially technology and programming' +# Prompt: 'The president of the United States is', Generated text: 'likely to nominate a new Supreme Court justice to fill the seat vacated by the death of Antonin Scalia. The Senate should vote to confirm the' +# Prompt: 'The capital of France is', Generated text: 'Paris.' +# Prompt: 'The future of AI is', Generated text: 'an exciting time for us. We are constantly researching, developing, and improving our platform to create the most advanced and efficient model available. We are' diff --git a/examples/llm-api/llm_generate_distributed.py b/examples/llm-api/llm_inference_distributed.py similarity index 91% rename from examples/llm-api/llm_generate_distributed.py rename to examples/llm-api/llm_inference_distributed.py index 2b0dd5c5b..6590f3916 100644 --- a/examples/llm-api/llm_generate_distributed.py +++ b/examples/llm-api/llm_inference_distributed.py @@ -6,8 +6,10 @@ def main(): # model could accept HF model name or a path to local HF model. llm = LLM( model="TinyLlama/TinyLlama-1.1B-Chat-v1.0", - # Distributed settings - tensor_parallel_size=2, + # Enable 2-way tensor parallelism + tensor_parallel_size=2 + # Enable 2-way pipeline parallelism if needed + # pipeline_parallel_size=2 ) # Sample prompts. diff --git a/examples/llm-api/llm_logits_processor.py b/examples/llm-api/llm_logits_processor.py new file mode 100644 index 000000000..33e46657a --- /dev/null +++ b/examples/llm-api/llm_logits_processor.py @@ -0,0 +1,51 @@ +### Control generated text using logits post processor +import typing as tp + +import torch + +from tensorrt_llm import LLM, SamplingParams + + +# Define the logits post-processor callback. This simple callback will output +# a specific token at each step irrespective of prompt. +# Refer to ../bindings/executor/example_logits_processor.py for a more +# sophisticated callback that generates JSON structured output. 
+def logits_post_processor(req_id: int, logits: torch.Tensor, + ids: tp.List[tp.List[int]], stream_ptr: int, + client_id: tp.Optional[int]): + target_token_id = 42 + with torch.cuda.stream(torch.cuda.ExternalStream(stream_ptr)): + logits[:] = float("-inf") + logits[..., target_token_id] = 0 + + +# Several callbacks can be specified when initializing LLM +llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0", + logits_post_processor_map={"my_logits_pp": logits_post_processor}) + +# Sample prompts +prompts = [ + "Hello, my name is", + "The president of the United States is", +] + +# Generate text +for prompt_id, prompt in enumerate(prompts): + # We will use logits post processor callback only for odd-numbered prompts + if prompt_id % 2 == 0: + sampling_params = SamplingParams(temperature=0.8, top_p=0.95) + else: + # Each prompt can use one callback from the choices that were provided to LLM + sampling_params = SamplingParams( + temperature=0.8, + top_p=0.95, + logits_post_processor_name='my_logits_pp') + + for output in llm.generate([prompt], sampling_params): + print( + f"Prompt: {output.prompt!r}, Generated text: {output.outputs[0].text!r}" + ) + +# Got output like +# Prompt: 'Hello, my name is', Generated text: '\n\nJane Smith. I am a student pursuing my degree in Computer Science at [university]. I enjoy learning new things, especially technology and programming' +# Prompt: 'The president of the United States is', Generated text: "''''''''''''''''''''''''''''''''" diff --git a/examples/llm-api/requirements.txt b/examples/llm-api/requirements.txt deleted file mode 100644 index 90de003d1..000000000 --- a/examples/llm-api/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ ---extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.13.0 diff --git a/examples/mamba/README.md b/examples/mamba/README.md index b784e123c..c454606c1 100644 --- a/examples/mamba/README.md +++ b/examples/mamba/README.md @@ -81,37 +81,31 @@ mamba-codestral-7B-v0.1 as an example. ```bash # mamba-2.8b python convert_checkpoint.py --model_dir ./mamba_model/mamba-2.8b/ \ - --ckpt_type hf \ --dtype bfloat16 \ --output_dir ./mamba_model/mamba-2.8b/trt_ckpt/bf16/1-gpu/ # mamba-130m python convert_checkpoint.py --model_dir ./mamba_model/mamba-130m/ \ - --ckpt_type hf \ --dtype float16 \ --output_dir ./mamba_model/mamba-130m/trt_ckpt/fp16/1-gpu/ # mamba2-2.7b python convert_checkpoint.py --model_dir ./mamba_model/mamba2-2.7b/ \ - --ckpt_type state_spaces \ --dtype float16 \ --output_dir ./mamba_model/mamba2-2.7b/trt_ckpt/fp16/1-gpu/ # mamba2-130m python convert_checkpoint.py --model_dir ./mamba_model/mamba2-130m/ \ - --ckpt_type state_spaces \ --dtype float16 \ --output_dir ./mamba_model/mamba2-130m/trt_ckpt/fp16/1-gpu/ # mamba-codestral-7B-v0.1 python convert_checkpoint.py --model_dir ./mamba_model/mamba-codestral-7B-v0.1/ \ - --ckpt_type mistral_inference \ --dtype float16 \ --output_dir ./mamba_model/mamba-codestral-7B-v0.1/trt_ckpt/fp16/1-gpu/ # mamba-codestral-7B-v0.1 with 2-way tensor parallelism. 
python convert_checkpoint.py --model_dir ./mamba_model/mamba-codestral-7B-v0.1/ \ - --ckpt_type mistral_inference \ --dtype float16 \ --world_size 2 \ --output_dir ./mamba_model/mamba-codestral-7B-v0.1/trt_ckpt/fp16/2-gpu/ diff --git a/examples/mamba/convert_checkpoint.py b/examples/mamba/convert_checkpoint.py index d0de466e2..0e48c97fb 100644 --- a/examples/mamba/convert_checkpoint.py +++ b/examples/mamba/convert_checkpoint.py @@ -1,336 +1,95 @@ import argparse -import copy import json import os -import re import time -from dataclasses import dataclass, field -from enum import Enum +from concurrent.futures import ThreadPoolExecutor, as_completed from pathlib import Path -from typing import List, Union - -import safetensors.torch -import torch -from transformers import AutoConfig, AutoModelForCausalLM import tensorrt_llm from tensorrt_llm import logger -from tensorrt_llm.models.convert_utils import (iterate_shard_files, - load_state_dict) - - -class CheckpointType(str, Enum): - mistral_inference = "mistral_inference" - state_spaces = "state_spaces" - hf = "hf" +from tensorrt_llm.mapping import Mapping +from tensorrt_llm.models import MambaForCausalLM +from tensorrt_llm.models.modeling_utils import QuantConfig +from tensorrt_llm.quantization import QuantAlgo def parse_arguments(): parser = argparse.ArgumentParser() - parser.add_argument("--ckpt_type", - type=CheckpointType, - choices=list(CheckpointType), - default=CheckpointType.hf, - help='Checkpoint type') + parser.add_argument('--model_dir', type=Path, default=None) parser.add_argument("--world_size", type=int, default=1, help="world size, only support tensor parallelism now") + parser.add_argument('--tp_size', + type=int, + default=1, + help='N-way tensor parallelism size') + parser.add_argument('--pp_size', + type=int, + default=1, + help='N-way pipeline parallelism size') parser.add_argument('--dtype', type=str, default='float16', choices=['float32', 'bfloat16', 'float16']) + parser.add_argument( + '--use_weight_only', + default=False, + action="store_true", + help='Quantize weights for the various GEMMs to INT4/INT8.' + 'See --weight_only_precision to set the precision') + parser.add_argument( + '--weight_only_precision', + const='int8', + type=str, + nargs='?', + default='int8', + choices=['int8', 'int4'], + help= + 'Define the precision for the weights when using weight-only quantization.' + 'You must also use --use_weight_only for that argument to have an impact.' 
+ ) parser.add_argument( '--output_dir', type=Path, default='mamba_tllm_checkpoint', help='The path to save the mamba TensorRT-LLM checkpoint') parser.add_argument('--log_level', type=str, default='info') + parser.add_argument( + '--workers', + type=int, + default=1, + help='The number of workers for converting checkpoint in parallel') args = parser.parse_args() return args -def get_weight(config, prefix, dtype): - return config[prefix + '.weight'].to(dtype).detach() - - -def get_bias(config, prefix, dtype): - if (prefix + '.bias') in config: - return config[prefix + '.bias'].to(dtype).detach() - return None - - -def get_weight_and_bias(config, prefix, dtype_w, dtype_b): - return get_weight(config, prefix, - dtype_w), get_bias(config, prefix, dtype_b) - - -def get_tllm_linear_weight(weight, prefix, bias=None): - results = {} - results[prefix + 'weight'] = weight.contiguous() - if bias is not None: - results[prefix + 'bias'] = bias - return results - - -def split(v, tp_size, idx, dim=0): - assert v.shape[dim] % tp_size == 0 - split_size = v.shape[dim] // tp_size - if tp_size == 1: - return v - return torch.split(v, split_size, dim=dim)[idx] - - -def convert_hf_mamba(hf_mamba, - rank=0, - dtype='float32', - mamba_version: str = 'Mamba1'): - weights = {} - tik = time.time() - - model_params = dict(hf_mamba.named_parameters()) - dtype = getattr(torch, dtype) - - # Parameter names in mamba block - for l in range(hf_mamba.config.num_hidden_layers): - # ssm layer - prefix = f'backbone.layers.{l}.mixer.' - tllm_prex = f'backbone.layers.{l}.ssm.' - for layer in ['conv1d', 'x_proj', 'dt_proj', 'out_proj']: - dtype_b = torch.float32 if layer == 'dt_proj' else dtype - weight, bias = get_weight_and_bias(model_params, prefix + layer, - dtype, dtype_b) - if layer == 'conv1d': - weight = weight.unsqueeze(3) - tllm_weight_name = tllm_prex + layer + '.weight' - tllm_bias_name = tllm_prex + ('dt_bias' if layer == 'dt_proj' else - layer + '.bias') - weights[tllm_weight_name] = weight - if bias is not None: - weights[tllm_bias_name] = bias - # in_proj - weight, bias = get_weight_and_bias(model_params, prefix + 'in_proj', - dtype, dtype) - in_proj_weights = torch.split(weight, weight.size(0) // 2, dim=0) - tllm_weight_name = tllm_prex + 'in_proj.weight' - weights[tllm_weight_name.replace('proj', 'proj_x')] = in_proj_weights[0] - weights[tllm_weight_name.replace('proj', 'proj_z')] = in_proj_weights[1] - if bias is not None: - in_proj_biases = torch.split(bias, bias.size(0) // 2, dim=0) - tllm_bias_name = tllm_prex + 'in_proj.bias' - weights[tllm_bias_name.replace('proj', - 'proj_x')] = in_proj_biases[0] - weights[tllm_bias_name.replace('proj', - 'proj_x')] = in_proj_biases[1] - - # A and D - Aparam = model_params[prefix + 'A_log'].float().detach() - Aparam = Aparam.permute(1, 0).contiguous() - weights[tllm_prex + 'A'] = -torch.exp(Aparam) - weights[tllm_prex + 'D'] = model_params[prefix + 'D'].float().detach() - # norm - prefix = f'backbone.layers.{l}.norm' - tllm_prex = f'backbone.layers.{l}.input_layernorm.' 
- weight, bias = get_weight_and_bias(model_params, prefix, dtype, dtype) - weights[tllm_prex + 'weight'] = weight - if bias is not None: - weights[tllm_prex + 'bias'] = bias - - # others - for layer in ['backbone.embeddings', 'backbone.norm_f']: - weight, bias = get_weight_and_bias(model_params, layer, dtype, dtype) - layer = layer.replace('embeddings', 'vocab_embedding') - layer = layer.replace('norm_f', 'ln_f') - weights[layer + '.weight'] = weight - if bias is not None: - weights[layer + '.bias'] = bias - weights['lm_head.weight'], _ = get_weight_and_bias(model_params, - 'backbone.embeddings', - dtype, dtype) - - tok = time.time() - t = time.strftime('%H:%M:%S', time.gmtime(tok - tik)) - print(f'Weights loaded. Total time: {t}') - return weights - - -def rename_hf_to_tllm(name: str): - """ Rename a HF parameter name by the corresponding TRT-LLM style name. """ - # remove model - if 'model.' in name: - name = name.replace('model.', '') - - # change layer name - if 'embeddings.' in name: - name = name.replace('embeddings', 'vocab_embedding') - elif 'embedding.' in name: - name = name.replace('embedding', 'vocab_embedding') - norm_pattern = r'\d\.norm\.' - if 'mixer.' in name: - name = name.replace('mixer.', 'ssm.') - elif re.search(norm_pattern, name): - name = name.replace('norm.', 'input_layernorm.') - elif 'norm_f.' in name: - name = name.replace('norm_f.', 'ln_f.') - - # Parameter names in ssm layers - if 'A_log' in name: - name = name.replace('A_log', 'A') - elif 'dt_proj.bias' in name: - name = name.replace('dt_proj.bias', 'dt_bias') - return name - - -def convert_from_hf_checkpoint(mamba_config: dict, - model_dir: Union[str, Path], - rank=0, - dtype: Union[str, torch.dtype] = torch.float32, - mamba_version: str = 'Mamba1'): - logger.info('Loading weights from HF Mamba...') - tik = time.time() - - tp_rank = rank - tp_size = mamba_config['mapping']['tp_size'] - d_inner = mamba_config['rnn_hidden_size'] - d_state = mamba_config['state_size'] - weights = {} - if isinstance(dtype, str): - dtype = tensorrt_llm.str_dtype_to_torch(dtype) - - for model_file in iterate_shard_files(model_dir, 0): - logger.debug(f'Loading file {str(model_file)}...') - model_params = load_state_dict(model_file, dtype=dtype) - for name, param in model_params.items(): - logger.debug(f'Converting weight {name}...') - tllm_name = rename_hf_to_tllm(name) - param = param.detach().cpu() - if 'A_log' in name: - param = -torch.exp(param.float()) - if mamba_version == 'Mamba1': - param = param.permute(1, 0).contiguous() - elif 'D' in name: - param = param.float() - elif 'dt_proj.bias' in name: - param = param.float() - elif 'dt_bias' in name: - param = param.float() - elif 'conv1d.weight' in name: - param = param.unsqueeze(3) - - # split in_proj in Mamba1 - if 'in_proj' in name and mamba_version == 'Mamba1': - in_proj_params = torch.split(param, param.size(0) // 2, dim=0) - weights[tllm_name.replace('proj', 'proj_x')] = in_proj_params[0] - weights[tllm_name.replace('proj', 'proj_z')] = in_proj_params[1] - elif 'in_proj' in name and mamba_version == 'Mamba2': - nheads = d_inner // mamba_config['rnn_head_size'] - ngroups = mamba_config['ngroups'] - in_proj_z, in_proj_x, in_proj_b, in_proj_c, in_proj_dt = torch.split( - param, [ - d_inner, d_inner, ngroups * d_state, ngroups * d_state, - nheads - ], - dim=0) - in_proj_z = split(in_proj_z, tp_size, tp_rank, dim=0) - in_proj_x = split(in_proj_x, tp_size, tp_rank, dim=0) - in_proj_b = split(in_proj_b, tp_size, tp_rank, dim=0) - in_proj_c = split(in_proj_c, tp_size, tp_rank, 
dim=0) - in_proj_dt = split(in_proj_dt, tp_size, tp_rank, dim=0) - in_proj = torch.concat( - [in_proj_z, in_proj_x, in_proj_b, in_proj_c, in_proj_dt]) - weights[tllm_name] = in_proj.contiguous() - elif 'conv1d' in name and mamba_version == 'Mamba2': - ngroups = mamba_config['ngroups'] - conv_x, conv_b, conv_c = torch.split( - param, [d_inner, ngroups * d_state, ngroups * d_state], - dim=0) - conv_x = split(conv_x, tp_size, tp_rank, dim=0) - conv_b = split(conv_b, tp_size, tp_rank, dim=0) - conv_c = split(conv_c, tp_size, tp_rank, dim=0) - conv = torch.concat([conv_x, conv_b, conv_c]) - weights[tllm_name] = conv.contiguous() - elif any(keyword in name for keyword in ( - 'mixer.norm.weight', - 'A_log', - 'D', - 'dt_proj.bias', - 'dt_bias', - )) and mamba_version == 'Mamba2': - weights[tllm_name] = split(param, tp_size, tp_rank, dim=0) - elif 'out_proj' in name and mamba_version == 'Mamba2': - weights[tllm_name] = split(param, tp_size, tp_rank, - dim=1).contiguous() - else: - weights[tllm_name] = param - del model_params - - # lm_head - emb = weights['backbone.vocab_embedding.weight'] - if 'lm_head.weight' not in weights or weights['lm_head.weight'].data_ptr( - ) == emb.data_ptr(): - weights['lm_head.weight'] = copy.deepcopy(emb) - if mamba_version == 'Mamba2': - weights['lm_head.weight'] = split(weights['lm_head.weight'], - tp_size, - tp_rank, - dim=0) - - tok = time.time() - t = time.strftime('%H:%M:%S', time.gmtime(tok - tik)) - tensorrt_llm.logger.info(f'Weights loaded. Total time: {t}') - return weights - - -def do_convert_from_ckpt(args): - return args.model_dir.exists() - - -def convert(worker_rank, args, convert_args): - convert_from_ckpt = do_convert_from_ckpt(args) - for rank in range(worker_rank, args.world_size): - if convert_from_ckpt: - weights = convert_from_hf_checkpoint(rank=rank, **convert_args) - else: - weights = convert_hf_mamba(rank=rank, **convert_args) - safetensors.torch.save_file(weights, - args.output_dir / f'rank{rank}.safetensors') - - -@dataclass -class MambaConfig: - - architectures: List[str] = field( - default_factory=lambda: ['MambaForCausalLM']) - d_intermediate: int = 0 - vocab_size: int = 50277 - attn_layer_idx: list = field(default_factory=list) - attn_cfg: dict = field(default_factory=dict) - rms_norm: bool = True - residual_in_fp32: bool = True - pad_vocab_size_multiple: int = 8 - hidden_size: int = 2560 - num_hidden_layers: int = 64 - intermediate_size: int = 0 - state_size: int = 128 - conv_kernel: int = 4 - use_bias: bool = False - head_dim: int = 64 - n_groups: int = 1 - chunk_size: int = 256 - ssm_rmsnorm: bool = True - - def update(self, data_dict): - self.__dict__.update(data_dict) - - -def load_config_hf(model_name, ckpt_type): +def load_config_hf(model_name, ckpt_type, dtype, mapping, quant_config, + output_dir): if ckpt_type == CheckpointType.hf: # transformer compatible models - hf_config = AutoConfig.from_pretrained(model_name, - trust_remote_code=True) - mamba_version = 'Mamba2' if hf_config.model_type == 'mamba2' else 'Mamba1' + override_fields = {} + mamba = MambaForCausalLM.from_hugging_face( + model_name, + dtype, + mapping=mapping, + quant_config=quant_config, + **override_fields, + ) + mamba.save_checkpoint(output_dir, save_config=True) + elif ckpt_type == CheckpointType.state_spaces: # state-spaces/mamba models config = json.load(open(os.path.join(model_name, 'config.json'))) + override_fields = {} + mamba = MambaForCausalLM.from_hugging_face( + model_name, + dtype, + mapping=mapping, + quant_config=quant_config, + **override_fields, 
+ ) + mamba.save_checkpoint(output_dir, save_config=True) + ssm_cfg = config.pop('ssm_cfg') cfg_to_mamba_cfg = { 'd_model': 'hidden_size', @@ -355,6 +114,7 @@ def load_config_hf(model_name, ckpt_type): for k in ssm_cfg_to_mamba_cfg: if k in ssm_cfg and ssm_cfg_to_mamba_cfg[k] is not None: config[ssm_cfg_to_mamba_cfg[k]] = ssm_cfg[k] + hf_config = MambaConfig(**config) if 'expand' in ssm_cfg: expand = ssm_cfg['expand'] @@ -362,6 +122,7 @@ def load_config_hf(model_name, ckpt_type): else: hf_config.intermediate_size = 2 * hf_config.hidden_size mamba_version = ssm_cfg.pop("layer", "Mamba1") + elif ckpt_type == CheckpointType.mistral_inference: # mistral inference format config = json.load(open(os.path.join(model_name, 'params.json'))) cfg_to_mamba_cfg = { @@ -384,90 +145,71 @@ def load_config_hf(model_name, ckpt_type): else: hf_config.intermediate_size = 2 * hf_config.hidden_size mamba_version = 'Mamba2' + return hf_config, mamba_version +def execute(workers, func, args): + if workers == 1: + for rank, f in enumerate(func): + f(args, rank) + else: + with ThreadPoolExecutor(max_workers=workers) as p: + futures = [p.submit(f, args, rank) for rank, f in enumerate(func)] + exceptions = [] + for future in as_completed(futures): + try: + future.result() + except Exception as e: + traceback.print_exc() + exceptions.append(e) + assert len( + exceptions + ) == 0, "Checkpoint conversion failed, please check error log." + + +def args_to_quant_config(args: argparse.Namespace) -> QuantConfig: + '''return config dict with quantization info based on the command line args + ''' + quant_config = QuantConfig() + if args.use_weight_only: + if args.weight_only_precision == 'int8': + quant_config.quant_algo = QuantAlgo.W8A16 + elif args.weight_only_precision == 'int4': + quant_config.quant_algo = QuantAlgo.W4A16 + + return quant_config + + def main(): print(tensorrt_llm.__version__) args = parse_arguments() logger.set_level(args.log_level) tik = time.time() + assert args.pp_size == 1, "Pipeline parallelism is not supported." 
+ world_size = args.tp_size * args.pp_size args.output_dir.mkdir(exist_ok=True, parents=True) - hf_config, mamba_version = load_config_hf(args.model_dir, args.ckpt_type) - - vocab_size = hf_config.vocab_size - pad_vocab_size_multiple = getattr(hf_config, "pad_vocab_size_multiple", 1) - if vocab_size % pad_vocab_size_multiple != 0: - vocab_size += pad_vocab_size_multiple - (vocab_size % - pad_vocab_size_multiple) - - config = { - 'architecture': 'MambaForCausalLM', - 'dtype': args.dtype, - 'logits_dtype': 'float32', - 'hidden_size': hf_config.hidden_size, - 'num_hidden_layers': hf_config.num_hidden_layers, - 'layer_types': ['recurrent'], - 'vocab_size': vocab_size, - 'rms_norm': hf_config.rms_norm, - 'residual_in_fp32': hf_config.residual_in_fp32, - 'pad_vocab_size_multiple': pad_vocab_size_multiple, - 'hidden_act': 'silu', - 'num_attention_heads': args.world_size, - 'rnn_hidden_size': hf_config.intermediate_size, - 'rnn_conv_dim_size': hf_config.intermediate_size, - 'state_size': hf_config.state_size, - 'conv_kernel': hf_config.conv_kernel, - 'use_bias': hf_config.use_bias, - 'mamba_version': mamba_version, - 'mapping': { - 'world_size': args.world_size, - 'tp_size': args.world_size, - 'pp_size': 1 - }, - } - if mamba_version == 'Mamba2': - conv_dim = hf_config.intermediate_size + 2 * hf_config.n_groups * hf_config.state_size - ssm_rmsnorm = getattr(hf_config, "ssm_rmsnorm", hf_config.rms_norm) - mamba2_cfg = { - 'rnn_head_size': hf_config.head_dim, - 'rnn_conv_dim_size': conv_dim, - 'ngroups': hf_config.n_groups, - 'chunk_size': hf_config.chunk_size, - 'ssm_rmsnorm': ssm_rmsnorm, - } - config.update(mamba2_cfg) - - with (args.output_dir / 'config.json').open('w') as f: - json.dump(config, f, indent=4) - - convert_from_ckpt = do_convert_from_ckpt(args) - # TODO: Add convert_hf_mamba support for Mamba2 when transformers can support Mamba2 models - assert convert_from_ckpt or mamba_version == 'Mamba2', "Mamba2 can only support convert from checkpoints." - assert args.world_size == 1 or mamba_version == 'Mamba2', "Mamba1 can not support tensor parallelism." 
- if not convert_from_ckpt: - logger.info(f'Convert by using model') - hf_mamba = AutoModelForCausalLM.from_pretrained(args.model_dir, - device_map="auto", - torch_dtype="auto", - trust_remote_code=True) - else: - logger.info(f'Convert by using checkpoint') - hf_mamba = None + quant_config = args_to_quant_config(args) - convert_args = dict(dtype=args.dtype, ) + def convert_and_save_rank(args, rank): + mapping = Mapping(world_size=world_size, + rank=rank, + tp_size=args.tp_size, + pp_size=args.pp_size) - if convert_from_ckpt: - convert_args['model_dir'] = args.model_dir - else: - convert_args['hf_mamba'] = hf_mamba - convert_args['mamba_version'] = mamba_version - convert_args['mamba_config'] = config + mamba = MambaForCausalLM.from_hugging_face( + args.model_dir, + args.dtype, + mapping=mapping, + quant_config=quant_config, + ) + mamba.save_checkpoint(args.output_dir, save_config=(rank == 0)) + del mamba - convert(0, args, convert_args) + execute(args.workers, [convert_and_save_rank] * world_size, args) tok = time.time() t = time.strftime('%H:%M:%S', time.gmtime(tok - tik)) diff --git a/examples/mamba/requirements.txt b/examples/mamba/requirements.txt index 844ae8978..f265ca761 100644 --- a/examples/mamba/requirements.txt +++ b/examples/mamba/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.13.0 +tensorrt_llm==0.14.0 transformers>=4.39.0 datasets~=2.14.5 evaluate diff --git a/examples/medusa/requirements.txt b/examples/medusa/requirements.txt index db7c7cd52..179f92d66 100644 --- a/examples/medusa/requirements.txt +++ b/examples/medusa/requirements.txt @@ -1,6 +1,6 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.13.0 +tensorrt_llm==0.14.0 datasets~=2.14.5 rouge_score~=0.1.2 -sentencepiece~=0.1.99 +sentencepiece>=0.1.99 evaluate~=0.4.1 diff --git a/examples/mixtral/README.md b/examples/mixtral/README.md index 20f8aa644..2aef5bd51 100644 --- a/examples/mixtral/README.md +++ b/examples/mixtral/README.md @@ -94,7 +94,7 @@ In TP+EP mode, both strategies are used simultaneously. This means each GPU hand You can enable Expert Parallel or hybrid parallel by setting `--moe_tp_size` and `--moe_ep_size` when calling `convert_coneckpoint.py`. If only `--moe_tp_size` is provided, TRT-LLM will use Tensor Parallel for the MoE model; if only `--moe_ep_size` is provided, TRT-LLM will use Expert Parallel; if both are provided, the hybrid parallel will be used. -Be sure that the product of `moe_tp_size` and `moe_ep_size` should equal to `tp_size`, since the total number of MoE paralleism across all GPUs must match the total number of parallelism in other parts of the model. +Be sure that the product of `moe_tp_size` and `moe_ep_size` should equal to `tp_size`, since the total number of MoE parallelism across all GPUs must match the total number of parallelism in other parts of the model. 
```bash # Build Mixtral8x7B with Expert Parallelism diff --git a/examples/mixtral/requirements.txt b/examples/mixtral/requirements.txt index e7d9d8c71..3e391c645 100644 --- a/examples/mixtral/requirements.txt +++ b/examples/mixtral/requirements.txt @@ -1,4 +1,4 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.13.0 +tensorrt_llm==0.14.0 transformers==4.38.2 accelerate==0.25.0 diff --git a/examples/model_api/README.md b/examples/model_api/README.md index 065a81847..258801153 100644 --- a/examples/model_api/README.md +++ b/examples/model_api/README.md @@ -42,7 +42,7 @@ python ./llama_quantize.py --hf_model_dir --cache_dir ./llama.aw ## AutoModelForCausalLM -The API `tensorrt_llm.AutoModelForCausalLM` can read from a Hugging Face model directory, find the correct TRT-LLM model class and dispatch the `from_hugging_face` mothod to the correct TRT-LLM class. +The API `tensorrt_llm.AutoModelForCausalLM` can read from a Hugging Face model directory, find the correct TRT-LLM model class and dispatch the `from_hugging_face` method to the correct TRT-LLM class. The following code snippets demonstrated the usage of the `AutoModelForCausalLM` class. diff --git a/examples/model_api/llama.py b/examples/model_api/llama.py index c699ee192..5ca11b98a 100644 --- a/examples/model_api/llama.py +++ b/examples/model_api/llama.py @@ -43,8 +43,6 @@ def main(): build_config = BuildConfig(max_input_len=256, max_seq_len=276, max_batch_size=1) - # just for fast build, not best for production - build_config.builder_opt = 0 build_config.plugin_config.gemm_plugin = 'auto' if args.clean_build or not args.engine_dir.exists(): diff --git a/examples/model_api/llama_multi_gpu.py b/examples/model_api/llama_multi_gpu.py index 388ad3a7f..fe35cbe9a 100644 --- a/examples/model_api/llama_multi_gpu.py +++ b/examples/model_api/llama_multi_gpu.py @@ -28,7 +28,6 @@ def build_and_run_llama(hf_model_dir, engine_dir, tp_size, rank): build_config = BuildConfig(max_input_len=256, max_seq_len=512, max_batch_size=8) - build_config.builder_opt = 0 # fast build for demo, pls avoid using this in production, since inference might be slower build_config.plugin_config.gemm_plugin = 'auto' # for fast build, tune inference perf based on your needs mapping = Mapping(world_size=tp_size, rank=rank, tp_size=tp_size) llama = LLaMAForCausalLM.from_hugging_face(hf_model_dir, mapping=mapping) diff --git a/examples/mpt/README.md b/examples/mpt/README.md index 093fd123c..4b54e7f18 100644 --- a/examples/mpt/README.md +++ b/examples/mpt/README.md @@ -19,10 +19,6 @@ This document explains how to build the [MPT](https://huggingface.co/mosaicml/mp - [1. Convert weights from HF Transformers to TRTLLM format](#1-convert-weights-from-hf-transformers-to-trtllm-format) - [2. Build TensorRT engine(s)](#2-build-tensorrt-engines) - [3. Run TRT engine to check if the build was correct](#3-run-trt-engine-to-check-if-the-build-was-correct) - - [Replit Code V-1.5 3B](#replit-code-v-15-3b) - - [1. Convert weights from HF Transformers to TRTLLM format](#1-convert-weights-from-hf-transformers-to-trtllm-format-1) - - [2. Build TensorRT engine(s)](#2-build-tensorrt-engines-1) - - [3. 
Run TRT engine to check if the build was correct](#3-run-trt-engine-to-check-if-the-build-was-correct-1) ## Overview @@ -178,55 +174,3 @@ mpirun -n 4 --allow-run-as-root \ --engine_dir ./trt_engines/mpt-30b/fp16/4-gpu/ \ --tokenizer_dir mosaicml/mpt-30b ``` - -### Replit Code V-1.5 3B -Same commands can be changed to convert [Replit Code V-1.5 3B](https://huggingface.co/replit/replit-code-v1_5-3b) to TRT LLM format. Below is an example to build Replit Code V-1.5 3B fp16 2-way tensor parallelized TRT engine. - -#### 1. Convert weights from HF Transformers to TRTLLM format - -The [`convert_checkpoint.py`](./convert_checkpoint.py) script allows you to convert weights from HF Transformers format to TRTLLM format. - -```bash -python convert_checkpoint.py --model_dir ./replit-code-v1_5-3b --output_dir ./ckpts/replit-code-v1_5-3b/bf16_tp2/ --tp_size 2 --dtype bfloat16 -``` - -#### 2. Build TensorRT engine(s) - -Examples of build invocations: - -```bash -# Build 2-GPU Replit Code V-1.5 3B bfloat16 engines -trtllm-build --checkpoint_dir ./ckpts/replit-code-v1_5-3b/bf16_tp2 \ - --max_batch_size 32 \ - --max_input_len 1024 \ - --max_seq_len 1536 \ - --gpt_attention_plugin bfloat16 \ - --gemm_plugin bfloat16 \ - --workers 2 \ - --output_dir ./trt_engines/replit-code-v1_5-3b/bf16_tp2 -``` - -#### 3. Run TRT engine to check if the build was correct - -```bash -# Run 2-GPU Replit Code V-1.5 3B TRT engine on a sample input prompt -mpirun -n 2 --allow-run-as-root \ - python ../run.py --max_output_len 64 \ - --input_text "def fibonacci" \ - --engine_dir ./trt_engines/replit-code-v1_5-3b/bf16_tp2 \ - --tokenizer_dir ./replit-code-v1_5-3b/ -``` - -Here is the output of above command. -```bash -Input: "def fibonacci" -Output: "(n): - if n == 0: - return 0 - elif n == 1: - return 1 - else: - return fibonacci(n-1) + fibonacci(n-2) - -print(fibonacci(10))" -``` diff --git a/examples/mpt/convert_checkpoint.py b/examples/mpt/convert_checkpoint.py index e58f76bde..4708c4146 100644 --- a/examples/mpt/convert_checkpoint.py +++ b/examples/mpt/convert_checkpoint.py @@ -14,8 +14,7 @@ import torch import torch.nn as nn from tqdm import tqdm -from transformers import (AutoConfig, AutoModelForCausalLM, AutoTokenizer, - MptForCausalLM) +from transformers import AutoTokenizer, MptConfig, MptForCausalLM from transformers.pytorch_utils import Conv1D import tensorrt_llm @@ -675,7 +674,7 @@ def convert_hf_mpt_legacy(hf_model, def convert_hf_mpt(hf_model: MptForCausalLM, - hf_config: AutoConfig, + hf_config: MptConfig, mapping: Mapping, dtype: str = 'float32', use_parallel_embedding: bool = False, @@ -691,8 +690,8 @@ def convert_hf_mpt(hf_model: MptForCausalLM, dtype = getattr(torch, dtype) num_hidden_layers = hf_config.n_layers num_head = hf_config.n_heads - num_kv_heads = hf_config.attn_config['kv_n_heads'] if 'kv_n_heads' in hf_config.attn_config \ - else hf_config.n_heads + num_kv_heads = getattr(hf_config.attn_config, 'kv_n_heads', + hf_config.n_heads) hidden_size = hf_config.d_model vocab_size = hf_config.vocab_size @@ -816,10 +815,10 @@ def convert_hf_mpt(hf_model: MptForCausalLM, else: kv_cache_quant_algo = None - hf_config = AutoConfig.from_pretrained(args.model_dir, - trust_remote_code=True) - num_kv_heads = hf_config.attn_config['kv_n_heads'] if 'kv_n_heads' in hf_config.attn_config \ - else hf_config.n_heads + hf_config = MptConfig.from_pretrained(args.model_dir, + trust_remote_code=True) + num_kv_heads = getattr(hf_config.attn_config, 'kv_n_heads', + hf_config.n_heads) config = { 'architecture': 
hf_config.architectures[0], 'dtype': args.dtype, @@ -845,18 +844,17 @@ def convert_hf_mpt(hf_model: MptForCausalLM, 'pp_size': args.pp_size, }, 'bias': (not hf_config.no_bias), - 'clip_qkv': hf_config.attn_config['clip_qkv'], - 'alibi_bias_max': hf_config.attn_config['alibi_bias_max'] + 'clip_qkv': hf_config.attn_config.clip_qkv, + 'alibi_bias_max': hf_config.attn_config.alibi_bias_max } with open(os.path.join(args.output_dir, 'config.json'), 'w') as f: json.dump(config, f, indent=4) - hf_model = AutoModelForCausalLM.from_pretrained(args.model_dir, - trust_remote_code=True, - device_map="auto", - torch_dtype=getattr( - torch, args.dtype)) + hf_model = MptForCausalLM.from_pretrained(args.model_dir, + device_map="auto", + torch_dtype=getattr( + torch, args.dtype)) act_range = {} mpt_qkv_para = {} diff --git a/examples/mpt/requirements.txt b/examples/mpt/requirements.txt index d7bf43bed..80c4473aa 100644 --- a/examples/mpt/requirements.txt +++ b/examples/mpt/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.13.0 +tensorrt_llm==0.14.0 datasets~=2.14.5 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git a/examples/multimodal/README.md b/examples/multimodal/README.md index a2e85a42e..79b28d294 100644 --- a/examples/multimodal/README.md +++ b/examples/multimodal/README.md @@ -621,7 +621,7 @@ Currently, CogVLM only support bfloat16 precision. 1. Download Huggingface weights ```bash - export MODEL_NAME="Phi-3-vision-128k-instruct" + export MODEL_NAME="Phi-3-vision-128k-instruct" # or Phi-3.5-vision-instruct git clone https://huggingface.co/microsoft/${MODEL_NAME} tmp/hf_models/${MODEL_NAME} ``` diff --git a/examples/nemotron/requirements.txt b/examples/nemotron/requirements.txt index a054119f4..88ffbef37 100644 --- a/examples/nemotron/requirements.txt +++ b/examples/nemotron/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.13.0 +tensorrt_llm==0.14.0 nemo-toolkit[all]==2.0.0rc1 megatron-core==0.8.0 datasets~=2.14.5 diff --git a/examples/nemotron_nas/README.md b/examples/nemotron_nas/README.md new file mode 100644 index 000000000..d1e3a898c --- /dev/null +++ b/examples/nemotron_nas/README.md @@ -0,0 +1,102 @@ +# Nemotron-NAS + +This document shows how to convert and build a model generated by Nemotron-NAS, such as Llama-3_1-Nemotron-51B-Instruct, in TensorRT-LLM. + +- [Nemotron-NAS](#nemotron-nas) + - [Overview](#overview) + - [Support Matrix](#support-matrix) + - [Custom Layers](#custom-layers) + - [Usage](#usage) + - [Build TensorRT engine(s)](#build-tensorrt-engines) + - [Runtime](#runtime) + +## Overview + +The TensorRT-LLM Nemotron-NAS implementation can be found in [tensorrt_llm/models/nemotron_nas/model.py](../../tensorrt_llm/models/nemotron_nas/model.py). The TensorRT-LLM Nemotron-NAS example code is located in [`examples/nemotron_nas`](./). There is one main file: + +* [`convert_checkpoint.py`](./convert_checkpoint.py) to convert the model into the TensorRT-LLM checkpoint format. + +## Support Matrix + + * FP16 + * BF16 + * Tensor parallelism + * Pipeline parallelism + +## Custom Layers + +Nemotron-NAS offers the ability to replace both `attention` and `FFN` layers with either `Linear` or `NoOp` layers. +The `attention` layers can be replaced with `LinearAttention` (which eventually calls `tensorrt_llm/layers/Linear`). +Additionally, `attention` layers can also be replaced with `NoOpAttention` (which essentially returns 0, thus implementing a no-op operation).
+The `LinearAttention` and `NoOpAttention` layers require no KV-cache. +Likewise, `FFN` layers can be replaced with either `LinearFFN` or `NoOpFFN`. + +Different attention layers of the model can have a different number of key-value attention heads, and different MLP layers can have different hidden sizes. + +## About Pipeline Parallelism + +Due to the non-uniform architecture of the model, different pipeline parallelism ranks might run different types of layers, which can result in an unbalanced load across GPUs during inference. + +## Usage + +The TensorRT-LLM example code is located at [examples/nemotron_nas](./). +The `convert_checkpoint.py` script accepts Hugging Face weights as input and converts them into a TensorRT-LLM checkpoint, from which the corresponding TensorRT engines are built. +The number of TensorRT engines depends on the number of GPUs used to run inference. + +### Build TensorRT Engines + +To build a TensorRT engine, you first need to obtain a Nemotron-NAS checkpoint in Hugging Face format. For example, [Llama-3_1-Nemotron-51B-Instruct](https://huggingface.co/nvidia/Llama-3_1-Nemotron-51B-Instruct). + +The `trtllm-build` command builds TensorRT engines from a TensorRT-LLM checkpoint. +If no checkpoint directory is specified, TensorRT-LLM builds the engines with dummy weights. + +The `trtllm-build` command has a variety of options. +In particular, note the plugin-related options: + +* For plugin options that require a data type, such as `gpt_attention_plugin`, you can: + * explicitly specify `float16`, `bfloat16`, or `float32`, so that the plugins are enabled with the specified precision + * implicitly specify `auto`, so that the plugins are enabled with the precision that is automatically inferred from the model dtype, i.e. the dtype specified in weight conversion + +```bash +# Optional: prepare environment variables +export MODEL_DIR=... +export TRT_CHECKPOINT_DIR=... +export TRT_ENGINE_DIR=... +export TP_SIZE=... +export PP_SIZE=... + +# create a local copy of the model checkpoint +git clone https://huggingface.co/nvidia/Llama-3_1-Nemotron-51B-Instruct $MODEL_DIR + +# Convert the model to a TRT-LLM BF16 checkpoint +# Note: the --trust_remote_code flag is currently required +python convert_checkpoint.py --model_dir $MODEL_DIR \ + --dtype bfloat16 \ + --output_dir $TRT_CHECKPOINT_DIR \ + --tp_size=$TP_SIZE --pp_size=$PP_SIZE \ + --trust_remote_code + +# Build the model engines from the BF16 checkpoint +trtllm-build --checkpoint_dir $TRT_CHECKPOINT_DIR \ + --output_dir $TRT_ENGINE_DIR \ + --gemm_plugin auto \ + --kv_cache_type paged +``` + +The conversion script supports additional models with variable GQA, such as [DeciLM-7B](https://huggingface.co/Deci/DeciLM-7B). + +## Runtime + +After you build the engine, you can use it with any TensorRT-LLM entrypoint or API. +For example, you can run inference with [examples/run.py](../run.py): + +```bash +export MODEL_DIR=... +export TRT_ENGINE_DIR=... + +python run.py --engine_dir $TRT_ENGINE_DIR --tokenizer_dir $MODEL_DIR --max_output_len 1024 ... + +# for multi-GPU inference (engine must be built with either tp_size>1, pp_size>1, or both) +export NUM_GPUS=... +mpirun -n $NUM_GPUS --allow-run-as-root python run.py ... +``` diff --git a/examples/nemotron_nas/calibration_utils.py b/examples/nemotron_nas/calibration_utils.py new file mode 100644 index 000000000..42b4382fa --- /dev/null +++ b/examples/nemotron_nas/calibration_utils.py @@ -0,0 +1,39 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +DATASET = "Magpie-Align/Magpie-Pro-MT-300K-v0.1" + + +def create_trtllm_magpie_calibration_dataset(output_dir: str, + calib_size: int = 512) -> None: + from datasets import load_dataset + + dataset = load_dataset(DATASET, split="train") + + def transform(conversation): + value = '\n'.join(turn['value'] + for turn in conversation['conversations']) + return {"text": value} + + dataset = dataset.select(range(calib_size)).map( + transform, remove_columns=dataset.column_names) + # https://github.com/huggingface/datasets/issues/6703#issuecomment-1974766332 + dataset.to_parquet(output_dir + "/data.parquet") + + +if __name__ == "__main__": + import sys + output_dir = sys.argv[1] + create_trtllm_magpie_calibration_dataset(output_dir) diff --git a/examples/nemotron_nas/convert_checkpoint.py b/examples/nemotron_nas/convert_checkpoint.py new file mode 100644 index 000000000..f2fcc04e8 --- /dev/null +++ b/examples/nemotron_nas/convert_checkpoint.py @@ -0,0 +1,162 @@ +import argparse +import time +import traceback +from concurrent.futures import ThreadPoolExecutor, as_completed +from pathlib import Path + +from tensorrt_llm._utils import release_gc +from tensorrt_llm.mapping import Mapping +from tensorrt_llm.models import DeciLMForCausalLM + + +def parse_arguments(): + parser = argparse.ArgumentParser() + parser.add_argument('--model_dir', type=str, required=True) + + parser.add_argument('--tp_size', + type=int, + default=1, + help='N-way tensor parallelism size') + parser.add_argument('--pp_size', + type=int, + default=1, + help='N-way pipeline parallelism size') + parser.add_argument('--dtype', + type=str, + default='bfloat16', + choices=['float32', 'bfloat16', 'float16']) + + parser.add_argument('--load_by_shard', + action='store_true', + help='Load a pretrained model shard-by-shard.') + + parser.add_argument("--load_model_on_cpu", action="store_true") + parser.add_argument( + '--use_parallel_embedding', + action="store_true", + default=False, + help= + 'By default embedding parallelism is disabled. By setting this flag, embedding parallelism is enabled' + ) + parser.add_argument( + '--embedding_sharding_dim', + type=int, + default=0, + choices=[0, 1], + help= + 'By default the embedding lookup table is sharded along vocab dimension (embedding_sharding_dim=0). ' + 'To shard it along hidden dimension, set embedding_sharding_dim=1' + 'Note: embedding sharing is only enabled when embedding_sharding_dim = 0' + ) + parser.add_argument( + '--use_embedding_sharing', + action="store_true", + default=False, + help= + 'Try to reduce the engine size by sharing the embedding lookup table between two layers.' 
+ 'Note: the flag might not take effect when the criteria are not met.') + parser.add_argument('--output_dir', + type=str, + default='tllm_checkpoint', + help='The path to save the TensorRT-LLM checkpoint') + parser.add_argument( + '--workers', + type=int, + default=1, + help='The number of workers for converting checkpoint in parallel') + + parser.add_argument( + '--save_config_only', + action="store_true", + default=False, + help= + 'Only save the model config w/o read and converting weights, be careful, this is for debug only' + ) + + parser.add_argument( + "--trust_remote_code", + action="store_true", + help="Pass trust_remote_code=True to HF loading functions as needed") + + args = parser.parse_args() + return args + + +def args_to_build_options(args): + return { + 'use_parallel_embedding': args.use_parallel_embedding, + 'embedding_sharding_dim': args.embedding_sharding_dim, + 'share_embedding_table': args.use_embedding_sharing, + } + + +def convert_and_save_hf(args): + model_dir = args.model_dir + load_by_shard = args.load_by_shard + world_size = args.tp_size * args.pp_size + # Need to convert the cli args to the kay-value pairs and override them in the generate config dict. + # Ideally these fields will be moved out of the config and pass them into build API, keep them here for compatibility purpose for now, + # before the refactor is done. + override_fields = {} + override_fields.update(args_to_build_options(args)) + + def convert_and_save_rank(args, rank): + mapping = Mapping(world_size=world_size, + rank=rank, + tp_size=args.tp_size, + pp_size=args.pp_size) + model = DeciLMForCausalLM.from_hugging_face( + model_dir, + args.dtype, + mapping=mapping, + load_by_shard=load_by_shard, + load_model_on_cpu=args.load_model_on_cpu, + trust_remote_code=args.trust_remote_code, + **override_fields, + ) + model.save_checkpoint(args.output_dir, save_config=(rank == 0)) + del model + + execute(args.workers, [convert_and_save_rank] * world_size, args) + release_gc() + + +def execute(workers, func, args): + if workers == 1: + for rank, f in enumerate(func): + f(args, rank) + else: + with ThreadPoolExecutor(max_workers=workers) as p: + futures = [p.submit(f, args, rank) for rank, f in enumerate(func)] + exceptions = [] + for future in as_completed(futures): + try: + future.result() + except Exception as e: + traceback.print_exc() + exceptions.append(e) + assert len( + exceptions + ) == 0, "Checkpoint conversion failed, please check error log." 
+ + +def main(): + args = parse_arguments() + tik = time.time() + + output_dir = Path(args.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + + # TODO(oargov): all deci checkpoints require trust_remote_code=True at the moment, remove this when this changes + # NOTE: we opt not to make this the default since users should be made aware of this in-case they don't want to trust remote code + assert args.trust_remote_code, "Nemotron NAS checkpoint require --trust_remote_code" + + convert_and_save_hf(args) + + tok = time.time() + t = time.strftime('%H:%M:%S', time.gmtime(tok - tik)) + print(f'Total time of converting checkpoints: {t}') + + +if __name__ == '__main__': + main() diff --git a/examples/opt/requirements.txt b/examples/opt/requirements.txt index d7bf43bed..80c4473aa 100644 --- a/examples/opt/requirements.txt +++ b/examples/opt/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.13.0 +tensorrt_llm==0.14.0 datasets~=2.14.5 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git a/examples/phi/README.md b/examples/phi/README.md index c6c25ebb5..802ee282a 100644 --- a/examples/phi/README.md +++ b/examples/phi/README.md @@ -1,18 +1,16 @@ # Phi -This document explains how to build the [phi-2](https://huggingface.co/microsoft/phi-2), [Phi-3-mini-4k-instruct](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct), -[Phi-3-mini-128k-instruct](https://huggingface.co/microsoft/Phi-3-mini-128k-instruct), [Phi-3-small-8k-instruct](https://huggingface.co/microsoft/Phi-3-small-8k-instruct), [Phi-3-small-128k-instruct](https://huggingface.co/microsoft/Phi-3-small-128k-instruct), [Phi-3-medium-4k-instruct](https://huggingface.co/microsoft/Phi-3-medium-4k-instruct/) and [Phi-3-medium-128k-instruct](https://huggingface.co/microsoft/Phi-3-medium-128k-instruct/) -models using TensorRT-LLM and run on a single GPU. - -- [Phi](#phi) - - [Overview](#overview) - - [Support Matrix](#support-matrix) - - [Usage](#usage) - - [1. Convert weights from HF Transformers to TensorRT-LLM format](#1-convert-weights-from-hf-transformers-to-tensorrt-llm-format) - - [2. Build TensorRT engine(s)](#2-build-tensorrt-engines) - - [3. Summarization using the Phi model](#3-summarization-using-the-phi-model) - - [4. Quantization](#4-quantization) - - [5. Run Phi-3 with LoRA](#5-run-phi-3-with-lora) +This document explains how to build Phi-2, Phi-3 and Phi-3.5 family of models using TensorRT-LLM and run on a single or multiple GPUs. +For multimodal models (Phi-3-vision-128k-instruct and Phi-3.5-vision-instruct), see `../multimodal/README.md`. + +- [Overview](#overview) +- [Support Matrix](#support-matrix) +- [Usage](#usage) + - [1. Convert weights from HF Transformers to TensorRT-LLM format](#1-convert-weights-from-hf-transformers-to-tensorrt-llm-format) + - [2. Build TensorRT engine(s)](#2-build-tensorrt-engines) + - [3. Summarization using the Phi model](#3-summarization-using-the-phi-model) + - [4. Quantization](#4-quantization) + - [5. 
Run Phi-3 with LoRA](#5-run-phi-3-with-lora) ## Overview @@ -29,13 +27,15 @@ In addition, there are two shared files in the parent folder [`examples`](../) f | Model Name | FP16 | BF16 | FP8 | INT8 | TP | | :--------------: | :---: | :---: | :---: | :---: | :---: | -| phi-2 | Y | Y | | | Y | +| Phi-2 | Y | Y | | | Y | | Phi-3-mini-4k-instruct | Y | Y | Y | Y | | Phi-3-mini-128k-instruct | Y | Y | Y | Y | | Phi-3-small-8k-instruct | Y | Y | Y | Y | Y | | Phi-3-small-128k-instruct | Y | Y | Y | Y | Y | | Phi-3-medium-8k-instruct | Y | Y | Y | Y | | Phi-3-medium-128k-instruct | Y | Y | Y | Y | +| Phi-3.5-mini-instruct | Y | Y | Y | Y | +| Phi-3.5-MoE-instruct | Y | Y | Y | Y | Y | * Model Name: the name of the model, the same as the name on HuggingFace * TP: Tensor Parallel @@ -57,6 +57,11 @@ python ./convert_checkpoint.py \ --dtype float16 ``` +If a model supports tensor-parallelism, number of tensor parallel ranks to split the model into can be specified as `--tp_size` argument to `convert_checkpoint.py`. + +For Phi-3.5-MoE-instruct model, expert parallelism can be enabled using `--moe_tp_size` and `--moe_ep_size` arguments. +The section on Parallelism Modes in `../mixtral/README.md` discusses tensor and expert parallelism for Mixture of Experts models in detail. + ### 2. Build TensorRT engine(s) TensorRT-LLM builds TensorRT engine(s) using a HF checkpoint. If no checkpoint directory is specified, TensorRT-LLM will build engine(s) using dummy weights. diff --git a/examples/phi/convert_checkpoint.py b/examples/phi/convert_checkpoint.py index cddb110b0..249dae2f2 100644 --- a/examples/phi/convert_checkpoint.py +++ b/examples/phi/convert_checkpoint.py @@ -59,6 +59,20 @@ def parse_arguments(): 'Define the precision for the weights when using weight-only quantization.' 'You must also use --use_weight_only for that argument to have an impact.' ) + parser.add_argument( + '--moe_tp_size', + type=int, + default=-1, + help= + 'N-way tensor parallelism size for MOE, default is tp_size, which will do tp-only for MoE' + ) + parser.add_argument( + '--moe_ep_size', + type=int, + default=-1, + help= + 'N-way expert parallelism size for MOE, default is 1, which will do tp-only for MoE' + ) parser.add_argument('--output_dir', type=str, default='tllm_checkpoint', @@ -110,6 +124,18 @@ def args_to_quant_config(args: argparse.Namespace) -> QuantConfig: args = parse_arguments() assert args.pp_size == 1, "Pipeline parallelism is not supported." 
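For Phi-3.5-MoE-instruct, the hunk that follows resolves the MoE parallelism sizes from `--tp_size`. A standalone sketch of those defaulting rules (the helper name is hypothetical; the logic mirrors the code shown below):

```python
# Hypothetical helper mirroring the --moe_tp_size/--moe_ep_size defaulting shown below.
def resolve_moe_parallelism(tp_size: int, moe_tp_size: int = -1, moe_ep_size: int = -1):
    if moe_tp_size == -1 and moe_ep_size == -1:
        # Default: tensor parallelism only for the MoE layers.
        moe_tp_size, moe_ep_size = tp_size, 1
    elif moe_tp_size == -1:
        moe_tp_size = tp_size // moe_ep_size
    elif moe_ep_size == -1:
        moe_ep_size = tp_size // moe_tp_size
    assert moe_tp_size * moe_ep_size == tp_size, \
        "moe_tp_size * moe_ep_size must equal tp_size"
    return moe_tp_size, moe_ep_size


# 8 GPUs with 4-way expert parallelism -> 2-way MoE tensor parallelism per expert group.
assert resolve_moe_parallelism(tp_size=8, moe_ep_size=4) == (2, 4)
# No MoE flags given -> MoE falls back to plain tensor parallelism.
assert resolve_moe_parallelism(tp_size=8) == (8, 1)
```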
+ world_size = args.tp_size * args.pp_size + if (args.moe_tp_size == -1 and args.moe_ep_size == -1): + # moe default to tp-only + args.moe_tp_size = args.tp_size + args.moe_ep_size = 1 + elif (args.moe_tp_size == -1): + args.moe_tp_size = args.tp_size // args.moe_ep_size + elif (args.moe_ep_size == -1): + args.moe_ep_size = args.tp_size // args.moe_tp_size + assert (args.moe_tp_size * args.moe_ep_size == args.tp_size + ), "moe_tp_size * moe_ep_size must equal to tp_size" + tik = time.time() if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) @@ -119,39 +145,35 @@ def args_to_quant_config(args: argparse.Namespace) -> QuantConfig: model_type = model_config.architectures[0] supported_models = [ 'PhiForCausalLM', 'Phi3ForCausalLM', 'Phi3VForCausalLM', - 'Phi3SmallForCausalLM' + 'Phi3SmallForCausalLM', 'PhiMoEForCausalLM' ] if model_type not in supported_models: assert False, "Invalid model type" - phi_model = Phi3ForCausalLM if model_type.find( - 'Phi3') != -1 else PhiForCausalLM - - hf_model = None + is_phi3 = 'Phi3' in model_type or 'MoE' in model_type + phi_model = Phi3ForCausalLM if is_phi3 else PhiForCausalLM - override_fields = {} - # override_fields.update(args_to_build_options(args)) quant_config = args_to_quant_config(args) def convert_and_save_rank(args, rank): - mapping = Mapping(world_size=args.tp_size * args.pp_size, + mapping = Mapping(world_size=world_size, rank=rank, tp_size=args.tp_size, - pp_size=args.pp_size) + pp_size=args.pp_size, + moe_tp_size=args.moe_tp_size, + moe_ep_size=args.moe_ep_size) phi = phi_model.from_hugging_face( - args.model_dir if hf_model is None else hf_model, + args.model_dir, args.dtype, mapping=mapping, quant_config=quant_config, - **override_fields, ) phi.save_checkpoint(args.output_dir, save_config=(rank == 0)) del phi - execute(args.workers, [convert_and_save_rank] * args.tp_size * args.pp_size, - args) + execute(args.workers, [convert_and_save_rank] * world_size, args) tok = time.time() t = time.strftime('%H:%M:%S', time.gmtime(tok - tik)) diff --git a/examples/phi/requirements.txt b/examples/phi/requirements.txt index c985fe088..c44bb977e 100644 --- a/examples/phi/requirements.txt +++ b/examples/phi/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.13.0 +tensorrt_llm==0.14.0 datasets~=2.14.5 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git a/examples/quantization/quantize.py b/examples/quantization/quantize.py index f2fe6f3c9..c1380efa3 100644 --- a/examples/quantization/quantize.py +++ b/examples/quantization/quantize.py @@ -55,8 +55,13 @@ help="Quantization format.", default="full_prec", choices=[ - "fp8", "int8_sq", "int4_awq", "w4a8_awq", "int8_wo", "int4_wo", - "full_prec" + "fp8", + "int8_sq", + "int4_awq", + "w4a8_awq", + "int8_wo", + "int4_wo", + "full_prec", ], ) parser.add_argument( @@ -101,15 +106,43 @@ action='store_true', help="whether to quantize the weights of medusa heads") + # auto quantization + parser.add_argument( + '--autoq_format', + default=None, + type=str, + help= + "Specific quantization algorithms will be searched in auto quantization." + "The algorithm must in ['fp8', 'int4_awq', 'w4a8_awq', 'int8_sq']." + "You can use ',' to separate more than one quantization algorithms(e.g. --autoq_format fp8,int4_awq,w4a8_awq)." + "Notice: fp8 and int8_sq can't be used at the same time.") + parser.add_argument( + '--weight_compression', + type=float, + default=None, + help="Percent of compression size when using mix precision quantization." 
+ "The range is [0.0, 1.0], if you only indicate the autoq_format, it will be default to the lowest possible value." + ) + args = parser.parse_args() + # weight_compression check + if args.autoq_format: + lower_bound = 0.25 if '4' in args.autoq_format else 0.5 + if args.weight_compression is None or args.weight_compression < lower_bound: + print( + f"invalid weight_compression value, will be set to {lower_bound}" + ) + args.weight_compression = lower_bound + if args.model_dir is not None: quantize_and_export( model_dir=args.model_dir, device=args.device, calib_dataset=args.calib_dataset, dtype=args.dtype, - qformat=args.qformat, + qformat=args.qformat + if args.weight_compression is None else args.autoq_format, kv_cache_dtype=args.kv_cache_dtype, calib_size=args.calib_size, batch_size=args.batch_size, @@ -125,7 +158,8 @@ max_draft_len=args.max_draft_len, medusa_hidden_act=args.medusa_hidden_act, medusa_model_dir=args.medusa_model_dir, - quant_medusa_head=args.quant_medusa_head) + quant_medusa_head=args.quant_medusa_head, + weight_compression=args.weight_compression) elif args.nemo_ckpt_path is not None: quantize_nemo_and_export(nemo_ckpt_path=args.nemo_ckpt_path, decoder_type=args.decoder_type, diff --git a/examples/quantization/requirements.txt b/examples/quantization/requirements.txt index 7fd6da926..2f84ad8b5 100644 --- a/examples/quantization/requirements.txt +++ b/examples/quantization/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.13.0 +tensorrt_llm==0.14.0 datasets>=2.14.4 nemo-toolkit[all]<=1.20.0,>=1.18.0 rouge_score~=0.1.2 diff --git a/examples/qwen/requirements.txt b/examples/qwen/requirements.txt index 38672c8e0..32ceb5db1 100644 --- a/examples/qwen/requirements.txt +++ b/examples/qwen/requirements.txt @@ -1,11 +1,11 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.13.0 +tensorrt_llm==0.14.0 datasets~=2.16.0 evaluate~=0.4.1 rouge_score~=0.1.2 transformers>=4.40.1 transformers-stream-generator -sentencepiece~=0.1.99 +sentencepiece>=0.1.99 tiktoken einops diff --git a/examples/qwenvl/requirements.txt b/examples/qwenvl/requirements.txt index 3db61b20c..23f381886 100644 --- a/examples/qwenvl/requirements.txt +++ b/examples/qwenvl/requirements.txt @@ -1,10 +1,10 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.13.0 +tensorrt_llm==0.14.0 datasets~=2.16.0 evaluate~=0.4.1 rouge_score~=0.1.2 transformers-stream-generator -sentencepiece~=0.1.99 +sentencepiece>=0.1.99 tiktoken einops auto-gptq diff --git a/examples/recurrentgemma/requirements.txt b/examples/recurrentgemma/requirements.txt index ee554d62b..80c16b1dc 100644 --- a/examples/recurrentgemma/requirements.txt +++ b/examples/recurrentgemma/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.13.0 +tensorrt_llm==0.14.0 git+https://github.com/google-deepmind/recurrentgemma.git flax>=0.8.2 jax~=0.4.23 diff --git a/examples/redrafter/requirements.txt b/examples/redrafter/requirements.txt index db7c7cd52..179f92d66 100644 --- a/examples/redrafter/requirements.txt +++ b/examples/redrafter/requirements.txt @@ -1,6 +1,6 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.13.0 +tensorrt_llm==0.14.0 datasets~=2.14.5 rouge_score~=0.1.2 -sentencepiece~=0.1.99 +sentencepiece>=0.1.99 evaluate~=0.4.1 diff --git a/examples/run.py b/examples/run.py index 0baa00b35..bcbe7ecce 100644 --- a/examples/run.py +++ b/examples/run.py @@ -41,6 +41,11 @@ def parse_arguments(args=None): parser = argparse.ArgumentParser() 
parser.add_argument('--max_input_length', type=int, default=923) parser.add_argument('--max_output_len', type=int, required=True) + parser.add_argument( + '--draft_engine_dir', + type=str, + default=None, + help='Path to engine of draft model in Draft-Target-Model mode.') parser.add_argument( '--input_text', type=str, @@ -55,6 +60,19 @@ def parse_arguments(args=None): parser.add_argument('--multimodal_input_file', type=str, help='Path to multimodal input file.') + parser.add_argument( + '--input_token_extra_ids', + type=int, + nargs='+', + help= + 'Input token extra ids for using p-tuning and KV Cache reuse together (only available with cpp session).', + default=None) + parser.add_argument( + '--input_token_extra_ids_file', + type=str, + help= + 'CSV or Numpy file containing input token extra ids file. Alternative to text input (only available with cpp session).', + default=None) parser.add_argument('--output_csv', type=str, help='CSV file where the tokenized output is stored.', @@ -155,9 +173,41 @@ def parse_input(tokenizer, batch_input_ids = [ torch.tensor(x, dtype=torch.int32) for x in batch_input_ids ] + + logger.debug(f"Input token ids (batch_size = {len(batch_input_ids)}):") + for i, input_ids in enumerate(batch_input_ids): + logger.debug(f"Request {i}: {input_ids.tolist()}") + return batch_input_ids +def parse_input_token_extra_ids(prompt_table_path, kv_cache_enable_block_reuse, + input_token_extra_ids, + input_token_extra_ids_file, max_input_length): + batch_extra_ids = None + if prompt_table_path and kv_cache_enable_block_reuse: + assert input_token_extra_ids or input_token_extra_ids_file, \ + "Input token extra ids must be provided when p-tuning and KV Cache reuse are both enabled" + batch_extra_ids = [] + if input_token_extra_ids_file: + if input_token_extra_ids_file.endswith('.csv'): + with open(input_token_extra_ids_file, 'r') as csv_file: + csv_reader = csv.reader(csv_file, delimiter=',') + for line in csv_reader: + extra_ids = [int(num) for num in line] + batch_extra_ids.append(extra_ids[-max_input_length:]) + elif input_token_extra_ids_file.endswith('.npy'): + inputs = np.load(input_token_extra_ids_file) + for extra_ids in inputs: + batch_extra_ids.append(extra_ids[-max_input_length:]) + else: + print('Input file format not supported.') + raise SystemExit + else: + batch_extra_ids.append(input_token_extra_ids) + return batch_extra_ids + + def print_output(tokenizer, output_ids: torch.Tensor, input_lengths: List[int], @@ -193,6 +243,7 @@ def print_output(tokenizer, if num_return_sequences > 1 else f'Text {batch_idx} Beam {beam}') print(f'Output [{index_str}]: \"{output_text}\"') + logger.debug(str(outputs)) output_ids = output_ids.reshape((-1, output_ids.size(2))) @@ -248,10 +299,230 @@ def print_output(tokenizer, np.save(log_probs_file, log_probs_outputs) +def run_draft_target_model(batch_input_ids, args, runtime_rank, end_id, pad_id, + stop_words_list, bad_words_list, vocab_size): + draft_len, draft_device_list, target_device_list, use_logits = ast.literal_eval( + args.draft_target_model_config) + logger.info(f"draft_len: {draft_len}") + logger.info(f"Device(s) for draft model: {draft_device_list}") + logger.info(f"Device(s) for target model: {target_device_list}") + logger.info(f"Use logits to accept tokens: {use_logits}") + # Variables keeping constant during decoding + input_batch_size = len(batch_input_ids) # Note as `BS` + beam_width = args.num_beams # Note as `BW` + is_compute_acceptance_ratio = logger.level == 'verbose' # Only enable in verbose mode + input_lengths 
= [len(p) for p in batch_input_ids] + max_seq_lengths = [i + args.max_output_len for i in input_lengths] + # Variables changing during decoding + n_iteration = 0 + prefix = batch_input_ids # Input for draft model + batch_slot = list(range(input_batch_size)) # Index of requests + if is_compute_acceptance_ratio: + n_draft_token = [0 for _ in range(input_batch_size)] + n_accept_token = [0 for _ in range(input_batch_size)] + + # Repack the output like the output of function `generate` + outputs = {} + outputs["output_ids"] = torch.full( + [input_batch_size, beam_width, + max(max_seq_lengths)], + end_id, + dtype=torch.int32) + for bs in range(input_batch_size): + outputs["output_ids"][bs, :, :input_lengths[bs]] = batch_input_ids[bs] + outputs["sequence_lengths"] = torch.full([input_batch_size, beam_width], + 0, + dtype=torch.int32) + outputs["context_logits"] = None + outputs["generation_logits"] = torch.full( + [input_batch_size, beam_width, + max(max_seq_lengths), vocab_size], + 0, + dtype=torch.float16) + outputs['cum_log_probs'] = None + outputs['log_probs'] = None + + # Model runners + common_kwargs = dict( + lora_dir=args.lora_dir, + rank=runtime_rank, + debug_mode=args.debug_mode, + lora_ckpt_source=args.lora_ckpt_source, + gpu_weights_percent=args.gpu_weights_percent, + max_output_len=args.max_output_len, + is_enc_dec=False, + max_batch_size=input_batch_size, + max_input_len=max(input_lengths) + args.max_output_len, + max_beam_width=beam_width, + max_attention_window_size=args.max_attention_window_size, + sink_token_length=args.sink_token_length, + max_tokens_in_paged_kv_cache=args.max_tokens_in_paged_kv_cache, + kv_cache_enable_block_reuse=args.kv_cache_enable_block_reuse, + kv_cache_free_gpu_memory_fraction=args. + kv_cache_free_gpu_memory_fraction, + enable_chunked_context=args.enable_chunked_context, + multi_block_mode=args.multi_block_mode, + cuda_graph_mode=args.cuda_graph_mode, + enable_context_fmha_fp32_acc=args.enable_context_fmha_fp32_acc, + ) + draft_runner_kwargs = common_kwargs.copy() + draft_runner_kwargs.update( + engine_dir=args.draft_engine_dir, + device_ids=draft_device_list, + ) + draft_runner = ModelRunnerCpp.from_dir(**draft_runner_kwargs) + target_runner_kwargs = common_kwargs.copy() + target_runner_kwargs.update( + engine_dir=args.engine_dir, + device_ids=target_device_list, + ) + target_runner = ModelRunnerCpp.from_dir(**target_runner_kwargs) + + common_gen_kwargs = dict( + max_attention_window_size=args.max_attention_window_size, + sink_token_length=args.sink_token_length, + end_id=end_id, + pad_id=pad_id, + temperature=args.temperature, + top_k=args.top_k, + top_p=args.top_p, + num_beams=beam_width, + num_return_sequences=args.num_return_sequences, + length_penalty=args.length_penalty, + early_stopping=args.early_stopping, + repetition_penalty=args.repetition_penalty, + presence_penalty=args.presence_penalty, + frequency_penalty=args.frequency_penalty, + stop_words_list=stop_words_list, + bad_words_list=bad_words_list, + random_seed=args.random_seed, + streaming=False, + output_sequence_lengths=True, + return_dict=True, + ) + + while True: + n_iteration += 1 + batch_size = len(prefix) + prefix_len = [len(prefix[i]) for i in range(batch_size)] + # Run draft model + draft_generation_kwargs = common_gen_kwargs.copy() + draft_generation_kwargs.update( + batch_input_ids=prefix, + max_new_tokens=draft_len, + streaming=False, + output_sequence_lengths=True, + return_dict=True, + ) + draft = draft_runner.generate(**draft_generation_kwargs) + torch.cuda.synchronize() + 
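The rest of the loop (below) verifies the draft tokens against the target model and, in verbose mode, tallies how many of them matched. A toy, runner-free sketch of that acceptance bookkeeping; the token values are made up:

```python
# Toy sketch (hypothetical tokens) of the acceptance-ratio bookkeeping used below:
# draft tokens are compared position-by-position against the target model's output
# over the overlapping range, and the ratio of matches is reported in verbose mode.
def count_accepted(draft_ids, target_ids, budget: int) -> int:
    """Number of positions where draft and target agree, clipped to `budget`."""
    n = min(len(draft_ids), len(target_ids), budget)
    return sum(draft_ids[i] == target_ids[i] for i in range(n))


draft = [11, 42, 7, 99]        # 4 draft tokens proposed this iteration
target = [11, 42, 8, 100, 5]   # target returns up to draft_len + 1 tokens
accepted = count_accepted(draft, target, budget=len(draft))
print(f"accepted {accepted}/{len(draft)} -> {accepted / len(draft):.0%}")  # 2/4 -> 50%
```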
+ # draft["output_ids"].shape -> [BS, BW, maxSL] + # draft["sequence_lengths"].shape -> [BS, BW] + # draft["generation_logits"].shape -> [BS, BW, draft_len, vocab_size] + # `d_*` means variables from draft model + # Value of `d_seq_len` includes input part, but `draft_len` doesn't + d_seq_len = draft["sequence_lengths"][:, 0].tolist() + d_len = [d_seq_len[bs] - prefix_len[bs] for bs in range(batch_size)] + d_ids = [None] * batch_size + if use_logits: + assert "generation_logits" in draft.keys( + ), "`--gather_generation_logits` must be specified when building TRT engine." + d_logits = [None] * batch_size + else: + d_logits = None + + for bs in range(batch_size): + l = prefix_len[bs] + r = d_seq_len[bs] + d_ids[bs] = draft["output_ids"][bs, 0, l:r].tolist() + if use_logits: + d_logits[bs] = draft["generation_logits"][bs, 0, :, :] + + # Run target model + target_generation_kwargs = common_gen_kwargs.copy() + target_generation_kwargs.update( + batch_input_ids=prefix, + max_new_tokens=draft_len + 1, + draft_tokens_list=d_ids, + draft_logits_list=d_logits, + ) + target = target_runner.generate(**target_generation_kwargs) + torch.cuda.synchronize() + + # `t_*` means variables from target model + # Value of `t_seq_len` and `t_seq_ids` includes input part, but `t_len` or `t_ids` doesn't + t_seq_len = target["sequence_lengths"][:, 0].tolist() + # t_len = [t_seq_len[bs] - prefix_len[bs] for bs in range(batch_size)] + t_seq_ids = [None] * batch_size + t_ids = [None] * batch_size + + # Update output and tokens for next iteration + for bs in range(batch_size): + index = batch_slot[bs] # Get original index in the input batch + l = prefix_len[bs] + r = min(t_seq_len[bs], max_seq_lengths[index]) + t_ids[bs] = target["output_ids"][bs, 0, l:r].tolist() + t_seq_ids[bs] = target["output_ids"][bs, 0, :r] + outputs["output_ids"][index, 0, l:r] = torch.IntTensor(t_ids[bs]) + outputs["sequence_lengths"][index, 0] = r + if use_logits: + outputs["generation_logits"][index, 0, (l - input_lengths[bs]):(r - input_lengths[bs])] = \ + target["generation_logits"][bs][0,:(r-l)].detach().cpu() + if is_compute_acceptance_ratio: + n_draft_token[index] += len(d_ids[bs]) + n_accept_token[index] += sum(d_ids[bs][i] == t_ids[bs][i] \ + for i in range(min(d_len[bs], t_seq_len[bs] - prefix_len[bs], max_seq_lengths[index] - prefix_len[bs]))) + + # yield output if using streaming + if args.streaming and not n_iteration % args.streaming_interval: + yield outputs + + # Evaluate stop criteria and prepare inputs for next iteration + prefix_next = [] + batch_slot_next = [] + for bs in range(batch_size): + # Stop due to output length + if len(t_seq_ids[bs]) >= max_seq_lengths[batch_slot[bs]]: + continue # No need to update for the stopped requests + # Stop due to the same output. Normally target should return 1 more token. 
+ # if (d_ids is not None and np.array_equal(d_ids[bs], t_ids[bs])): + # continue + # Stop due to no change (hit early stopping) + if np.array_equal(t_seq_ids[bs], prefix[bs]): + continue + # Stop due to end words + if end_id in t_seq_ids[bs]: + continue + # TODO: Check bad words and stop words criteria + prefix_next.append(t_seq_ids[bs]) + batch_slot_next.append(bs) + prefix = prefix_next + batch_slot = batch_slot_next + if len(prefix) == 0: # Leave while loop if no request remained + break + + if is_compute_acceptance_ratio: + logger.debug(f"Count of iteration(s): {n_iteration}") + logger.debug(f"Acceptance ratio:") + for i, (a, d) in enumerate(zip(n_accept_token, n_draft_token)): + logger.debug(f"Request {i}: {a / d * 100 :6.2f}%") + + # Return runner in No-Streaming mode + if args.streaming: + yield outputs + else: + yield outputs, target_runner + + def main(args): runtime_rank = tensorrt_llm.mpi_rank() logger.set_level(args.log_level) + if args.draft_target_model_config is not None: + assert args.draft_engine_dir is not None, "Path to draft engine (--draft_engine_dir) must be specified." + assert args.engine_dir is not None, "Path to target engine (--engine_dir) must be specified." + # different handling if encoder-decoder models is_enc_dec = {'encoder', 'decoder'}.issubset({ name @@ -321,6 +592,11 @@ def main(args): batch_input_ids, model_name, args.engine_dir, args.multimodal_input_file) + input_token_extra_ids = parse_input_token_extra_ids( + args.prompt_table_path, args.kv_cache_enable_block_reuse, + args.input_token_extra_ids, args.input_token_extra_ids_file, + args.max_input_length) + input_lengths = [x.size(0) for x in decoder_input_ids ] if is_enc_dec else [x.size(0) for x in batch_input_ids] @@ -358,86 +634,109 @@ def main(args): "WARNING: using this option may increase network usage significantly (quadratically w.r.t output length)." 
) args.return_all_generated_tokens = True - runner_cls = ModelRunner if args.use_py_session else ModelRunnerCpp - runner_kwargs = dict( - engine_dir=args.engine_dir, - lora_dir=args.lora_dir, - rank=runtime_rank, - debug_mode=args.debug_mode, - lora_ckpt_source=args.lora_ckpt_source, - gpu_weights_percent=args.gpu_weights_percent, - max_output_len=args.max_output_len, - ) - if not args.use_py_session: - runner_kwargs.update(is_enc_dec=is_enc_dec) - if args.medusa_choices is not None: - args.medusa_choices = ast.literal_eval(args.medusa_choices) - assert args.temperature == 1.0, "Medusa should use temperature == 1.0" - assert args.num_beams == 1, "Medusa should use num_beams == 1" - runner_kwargs.update(medusa_choices=args.medusa_choices) - if args.lookahead_config is not None: - args.lookahead_config = ast.literal_eval(args.lookahead_config) - assert len( - args.lookahead_config - ) == 3, "Lookahead needs [max_window_size, max_ngram_size, max_verification_set_size]" - runner_kwargs.update(lookahead_config=args.lookahead_config) - if not args.use_py_session: + + logger.info(f"Using {'Python' if args.use_py_session else 'C++'} session") + + if args.draft_target_model_config is None: # Normal run except Draft-Target-Model + runner_cls = ModelRunner if args.use_py_session else ModelRunnerCpp + runner_kwargs = dict( + engine_dir=args.engine_dir, + lora_dir=args.lora_dir, + rank=runtime_rank, + debug_mode=args.debug_mode, + lora_ckpt_source=args.lora_ckpt_source, + gpu_weights_percent=args.gpu_weights_percent, + max_output_len=args.max_output_len, + ) + if args.medusa_choices is not None: + args.medusa_choices = ast.literal_eval(args.medusa_choices) + assert args.temperature == 1.0, "Medusa should use temperature == 1.0" + assert args.num_beams == 1, "Medusa should use num_beams == 1" + runner_kwargs.update(medusa_choices=args.medusa_choices) + if args.lookahead_config is not None: + args.lookahead_config = ast.literal_eval(args.lookahead_config) + assert len( + args.lookahead_config + ) == 3, "Lookahead needs [max_window_size, max_ngram_size, max_verification_set_size]" + runner_kwargs.update(lookahead_config=args.lookahead_config) + if not args.use_py_session: + runner_kwargs.update( + is_enc_dec=is_enc_dec, + max_batch_size=len(batch_input_ids), + max_input_len=max( + encoder_input_lengths if is_enc_dec else input_lengths), + max_beam_width=args.num_beams, + max_attention_window_size=args.max_attention_window_size, + sink_token_length=args.sink_token_length, + max_tokens_in_paged_kv_cache=args.max_tokens_in_paged_kv_cache, + kv_cache_enable_block_reuse=args.kv_cache_enable_block_reuse, + kv_cache_free_gpu_memory_fraction=args. + kv_cache_free_gpu_memory_fraction, + cross_kv_cache_fraction=args.cross_kv_cache_fraction + if is_enc_dec else None, + enable_chunked_context=args.enable_chunked_context, + multi_block_mode=args.multi_block_mode, + cuda_graph_mode=args.cuda_graph_mode) runner_kwargs.update( - max_batch_size=len(batch_input_ids), - max_input_len=max( - encoder_input_lengths if is_enc_dec else input_lengths), - max_beam_width=args.num_beams, - max_attention_window_size=args.max_attention_window_size, - sink_token_length=args.sink_token_length, - max_tokens_in_paged_kv_cache=args.max_tokens_in_paged_kv_cache, - kv_cache_enable_block_reuse=args.kv_cache_enable_block_reuse, - kv_cache_free_gpu_memory_fraction=args. 
- kv_cache_free_gpu_memory_fraction, - enable_chunked_context=args.enable_chunked_context, - multi_block_mode=args.multi_block_mode) - runner_kwargs.update( - enable_context_fmha_fp32_acc=args.enable_context_fmha_fp32_acc) - runner = runner_cls.from_dir(**runner_kwargs) - - with torch.no_grad(): - outputs = runner.generate( - batch_input_ids=decoder_input_ids - if is_enc_dec else batch_input_ids, - encoder_input_ids=encoder_input_ids if is_enc_dec else None, - encoder_input_features=encoder_input_features - if is_enc_dec else None, - encoder_output_lengths=encoder_output_lengths - if is_enc_dec else None, - max_new_tokens=args.max_output_len, - max_attention_window_size=args.max_attention_window_size, - sink_token_length=args.sink_token_length, - end_id=end_id, - pad_id=pad_id, - temperature=args.temperature, - top_k=args.top_k, - top_p=args.top_p, - num_beams=args.num_beams, - num_return_sequences=args.num_return_sequences, - length_penalty=args.length_penalty, - early_stopping=args.early_stopping, - repetition_penalty=args.repetition_penalty, - presence_penalty=args.presence_penalty, - frequency_penalty=args.frequency_penalty, - stop_words_list=stop_words_list, - bad_words_list=bad_words_list, - output_cum_log_probs=(args.output_cum_log_probs_npy != None), - output_log_probs=(args.output_log_probs_npy != None), - random_seed=args.random_seed, - lora_uids=args.lora_task_uids, - prompt_table=args.prompt_table_path, - prompt_tasks=args.prompt_tasks, - streaming=args.streaming, - output_sequence_lengths=True, - no_repeat_ngram_size=args.no_repeat_ngram_size, - return_dict=True, - medusa_choices=args.medusa_choices, - return_all_generated_tokens=args.return_all_generated_tokens) - torch.cuda.synchronize() + enable_context_fmha_fp32_acc=args.enable_context_fmha_fp32_acc) + runner = runner_cls.from_dir(**runner_kwargs) + + with torch.no_grad(): + outputs = runner.generate( + batch_input_ids=decoder_input_ids + if is_enc_dec else batch_input_ids, + encoder_input_ids=encoder_input_ids if is_enc_dec else None, + encoder_input_features=encoder_input_features + if is_enc_dec else None, + encoder_output_lengths=encoder_output_lengths + if is_enc_dec else None, + max_new_tokens=args.max_output_len, + max_attention_window_size=args.max_attention_window_size, + sink_token_length=args.sink_token_length, + end_id=end_id, + pad_id=pad_id, + temperature=args.temperature, + top_k=args.top_k, + top_p=args.top_p, + num_beams=args.num_beams, + num_return_sequences=args.num_return_sequences, + length_penalty=args.length_penalty, + early_stopping=args.early_stopping, + repetition_penalty=args.repetition_penalty, + presence_penalty=args.presence_penalty, + frequency_penalty=args.frequency_penalty, + stop_words_list=stop_words_list, + bad_words_list=bad_words_list, + output_cum_log_probs=(args.output_cum_log_probs_npy != None), + output_log_probs=(args.output_log_probs_npy != None), + random_seed=args.random_seed, + lora_uids=args.lora_task_uids, + prompt_table=args.prompt_table_path, + prompt_tasks=args.prompt_tasks, + streaming=args.streaming, + output_sequence_lengths=True, + no_repeat_ngram_size=args.no_repeat_ngram_size, + return_dict=True, + medusa_choices=args.medusa_choices, + return_all_generated_tokens=args.return_all_generated_tokens, + input_token_extra_ids=input_token_extra_ids) + torch.cuda.synchronize() + + else: # For Draft-Target-Model + if not args.kv_cache_enable_block_reuse: + logger.warning( + "`--kv_cache_enable_block_reuse` must be specified in Draft-Target-Model." 
+ ) + assert not args.use_py_session, "Only CPP session is supported in Draft-Target-Model." + assert not is_enc_dec, "Only decoder model is supported in Draft-Target-Model." + assert args.num_beams == 1, "Beam width > 1 is not supported in Draft-Target-Model." + + outputs = run_draft_target_model(batch_input_ids, args, runtime_rank, + end_id, pad_id, stop_words_list, + bad_words_list, tokenizer.vocab_size) + + if not args.streaming: # Unpack runner from the return value in No-Streaming mode + outputs, runner = list(outputs)[0] if args.streaming: for curr_outputs in throttle_generator(outputs, @@ -525,8 +824,9 @@ def main(args): streaming=args.streaming, output_sequence_lengths=True, return_dict=True, - return_all_generated_tokens=args.return_all_generated_tokens - ) + return_all_generated_tokens=args. + return_all_generated_tokens, + input_token_extra_ids=input_token_extra_ids) torch.cuda.synchronize() tensorrt_llm.profiler.start("tmp") @@ -559,8 +859,9 @@ def main(args): streaming=args.streaming, output_sequence_lengths=True, return_dict=True, - return_all_generated_tokens=args.return_all_generated_tokens - ) + return_all_generated_tokens=args. + return_all_generated_tokens, + input_token_extra_ids=input_token_extra_ids) torch.cuda.synchronize() tensorrt_llm.profiler.stop("tmp") diff --git a/examples/skywork/requirements.txt b/examples/skywork/requirements.txt index 8ea5798fc..ceee140b0 100644 --- a/examples/skywork/requirements.txt +++ b/examples/skywork/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.13.0 +tensorrt_llm==0.14.0 datasets~=2.16.1 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git a/examples/smaug/requirements.txt b/examples/smaug/requirements.txt index 30dd11d63..5f7d98fe2 100644 --- a/examples/smaug/requirements.txt +++ b/examples/smaug/requirements.txt @@ -1,6 +1,6 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.13.0 +tensorrt_llm==0.14.0 datasets==2.14.6 evaluate~=0.4.1 rouge_score~=0.1.2 -sentencepiece~=0.1.99 +sentencepiece>=0.1.99 diff --git a/examples/summarize.py b/examples/summarize.py index e54eb0a52..faa3412d2 100644 --- a/examples/summarize.py +++ b/examples/summarize.py @@ -487,7 +487,8 @@ def eval_hf(datapoint, kv_cache_free_gpu_memory_fraction=args. 
kv_cache_free_gpu_memory_fraction, enable_chunked_context=args.enable_chunked_context, - multi_block_mode=args.multi_block_mode) + multi_block_mode=args.multi_block_mode, + cuda_graph_mode=args.cuda_graph_mode) runner_kwargs.update( enable_context_fmha_fp32_acc=args.enable_context_fmha_fp32_acc) runner = runner_cls.from_dir(**runner_kwargs) @@ -739,6 +740,7 @@ def eval_hf(datapoint, if __name__ == '__main__': + # see `add_common_args` for extended list of arguments parser = argparse.ArgumentParser() parser.add_argument('--test_hf', action='store_true') parser.add_argument('--test_trt_llm', action='store_true') diff --git a/examples/utils.py b/examples/utils.py index 03eea2d3a..6d43fe41f 100644 --- a/examples/utils.py +++ b/examples/utils.py @@ -289,7 +289,14 @@ def add_common_args(parser): parser.add_argument('--enable_context_fmha_fp32_acc', action='store_true', help="Enable FMHA runner FP32 accumulation.") - parser.add_argument('--log_level', type=str, default='info') + parser.add_argument('--cuda_graph_mode', + action='store_true', + help="Enable cuda graphs in the inference.") + parser.add_argument( + '--log_level', + type=str, + choices=['verbose', 'info', 'warning', 'error', 'internal_error'], + default='info') parser.add_argument( '--no_prompt_template', dest='use_prompt_template', @@ -340,19 +347,26 @@ def add_common_args(parser): help="Number of (default) virtual tokens to prepend to each sentence." " For example, '--num_prepend_vtokens=10' will prepend the tokens" " [vocab_size, vocab_size + 1, ..., vocab_size + 9] to the sentence.") + parser.add_argument( + '--draft_target_model_config', + type=str, + default=None, + help= + "Configuration of Draft-Target-Model decoding, see `examples/draft_target_model/README.md` for more information." + " E.g.: [4, [0], [1], False] for [draft_len, draft_model_device_list, target_model_device_list, use_logits]." + ) parser.add_argument( '--medusa_choices', type=str, default=None, - help="Medusa choice to use, if not none, will use Medusa decoding." + help="Configuration of Medusa decoding." " E.g.: [[0, 0, 0, 0], [0, 1, 0], [1, 0], [1, 1]] for 9 medusa tokens." ) parser.add_argument( '--lookahead_config', type=str, default=None, - help= - "executor and request lookahead config to use, if not none, will use lookahead decoding." + help="Configuration of executor and request lookahead decoding." " E.g.: [5, 6, 7] for [max_window_size, max_ngram_size, max_verification_set_size]." ) # model arguments @@ -402,6 +416,13 @@ def add_common_args(parser): type=float, help='Specify the free gpu memory fraction.', ) + parser.add_argument( + '--cross_kv_cache_fraction', + default=0.5, + type=float, + help= + 'Specify the kv cache fraction reserved for cross attention. Only applicable for encoder-decoder models. 
By default 0.5 for self and 0.5 for cross.', + ) parser.add_argument( '--enable_chunked_context', action='store_true', diff --git a/examples/whisper/requirements.txt b/examples/whisper/requirements.txt index dd123dc96..9f28078c7 100644 --- a/examples/whisper/requirements.txt +++ b/examples/whisper/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.13.0 +tensorrt_llm==0.14.0 tiktoken datasets kaldialign diff --git a/requirements-dev.txt b/requirements-dev.txt index c4529370a..3bf5a427d 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -11,6 +11,7 @@ pytest-cov pytest-forked pytest-xdist pytest-timeout +pytest-split rouge_score cloudpickle typing-extensions==4.8.0 @@ -18,3 +19,4 @@ bandit==1.7.7 jsonlines==4.0.0 jieba==0.42.1 rouge==1.0.1 +pytest-rerunfailures diff --git a/requirements-windows.txt b/requirements-windows.txt index e3fc56308..dd2dbf8c8 100644 --- a/requirements-windows.txt +++ b/requirements-windows.txt @@ -1,10 +1,10 @@ --extra-index-url https://pypi.nvidia.com --extra-index-url https://download.pytorch.org/whl/cu124 -accelerate==0.25.0 +accelerate>=0.25.0 build colored cuda-python==12.5.0 -diffusers==0.27.0 +diffusers>=0.27.0 numpy<2 onnx>=1.12.0 polygraphy==0.49.9 @@ -15,13 +15,14 @@ pandas h5py==3.10.0 pywin32 StrEnum -sentencepiece>=0.1.99 +sentencepiece>=0.2.0 tensorrt~=10.4.0 tokenizers>=0.14 # Default torch is CPU-only on Windows, so need to specify a torch version with GPU support -torch==2.4.0+cu124 -nvidia-modelopt~=0.15.0 -transformers>=4.38.2 +torch==2.4.1+cu124 +torchvision==0.19.1+cu124 +nvidia-modelopt[torch]~=0.17.0 +transformers>=4.38.2,<=4.45.1 wheel optimum evaluate diff --git a/requirements.txt b/requirements.txt index 698662167..cfaca19bb 100644 --- a/requirements.txt +++ b/requirements.txt @@ -18,11 +18,12 @@ h5py==3.10.0 StrEnum sentencepiece>=0.1.99 tensorrt~=10.4.0 -# https://github.com/pytorch/pytorch/blob/v2.4.0/version.txt uses 2.4.0a0. # https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-07.html#rel-24-07 uses 2.4.0a0. torch>=2.4.0a0,<=2.4.0 -nvidia-modelopt~=0.15.0 -transformers>=4.38.2,<=4.42.4 +torchvision +nvidia-modelopt[torch]~=0.17.0 +transformers>=4.38.2,<=4.45.1 +pydantic>=2.9.1 pillow==10.3.0 wheel optimum diff --git a/scripts/build_wheel.py b/scripts/build_wheel.py index 0f7e4c750..441c81463 100755 --- a/scripts/build_wheel.py +++ b/scripts/build_wheel.py @@ -76,6 +76,7 @@ def main(*, trt_root: str = None, nccl_root: str = None, clean: bool = False, + clean_wheel: bool = False, configure_cmake: bool = False, use_ccache: bool = False, fast_build: bool = False, @@ -86,11 +87,23 @@ def main(*, benchmarks: bool = False, micro_benchmarks: bool = False, nvtx: bool = False): + + if clean: + clean_wheel = True + project_dir = get_project_dir() os.chdir(project_dir) build_run = partial(run, shell=True, check=True) - if not (project_dir / "3rdparty/cutlass/.git").exists(): + # Get all submodules and check their folder exists. 
If not, + # invoke git submodule update + with open(project_dir / ".gitmodules", "r") as submodules_f: + submodules = [ + l.split("=")[1].strip() for l in submodules_f.readlines() + if "path = " in l + ] + if any(not (project_dir / submodule / ".git").exists() + for submodule in submodules): build_run('git submodule update --init --recursive') on_windows = platform.system() == "Windows" requirements_filename = "requirements-dev-windows.txt" if on_windows else "requirements-dev.txt" @@ -303,14 +316,23 @@ def get_pybind_lib(): print(f"Failed to build pybind11 stubgen: {ex}", file=sys.stderr) - if dist_dir is None: - dist_dir = project_dir / "build" - else: - dist_dir = Path(dist_dir) - - if not dist_dir.exists(): - dist_dir.mkdir(parents=True) if not skip_building_wheel: + if dist_dir is None: + dist_dir = project_dir / "build" + else: + dist_dir = Path(dist_dir) + + if not dist_dir.exists(): + dist_dir.mkdir(parents=True) + + if clean_wheel: + # For incremental build, the python build module adds + # the new files but does not remove the deleted files. + # + # This breaks the Windows CI/CD pipeline when building + # and validating python changes in the whl. + clear_folder(dist_dir) + build_run( f'\"{sys.executable}\" -m build {project_dir} --skip-dependency-check --no-isolation --wheel --outdir "{dist_dir}"' ) @@ -327,6 +349,9 @@ def add_arguments(parser: ArgumentParser): parser.add_argument("--cuda_architectures", "-a") parser.add_argument("--install", "-i", action="store_true") parser.add_argument("--clean", "-c", action="store_true") + parser.add_argument("--clean_wheel", + action="store_true", + help="Clear dist_dir folder creating wheel") parser.add_argument("--configure_cmake", action="store_true", help="Always configure cmake before building") diff --git a/tensorrt_llm/_utils.py b/tensorrt_llm/_utils.py index 3e5029249..3f64b1bc3 100644 --- a/tensorrt_llm/_utils.py +++ b/tensorrt_llm/_utils.py @@ -518,3 +518,31 @@ def supports_inflight_batching(engine_dir): json_config = GptJsonConfig.parse_file(config_path) model_config = json_config.model_config return model_config.supports_inflight_batching + + +class QuantModeWrapper: + + def __init__(self, objs): + self.objs = objs + + def __getattr__(self, name): + + def method_wrapper(*args, **kwargs): + result = False + for obj in self.objs: + attr = getattr(obj, name) + if callable(attr): + result = result | attr(*args, **kwargs) + return result + + return method_wrapper + + def __repr__(self): + return f"QuantModeWrapper: ({self.objs})" + + def __str__(self): + obj_strs = [str(obj) for obj in self.objs] + return f"[{', '.join(obj_strs)}]" + + def __getitem__(self, index): + return self.objs[index] diff --git a/tensorrt_llm/auto_parallel/tensor_parallel/plugin_nodes/gpt_attention_node.py b/tensorrt_llm/auto_parallel/tensor_parallel/plugin_nodes/gpt_attention_node.py index 2d7df1e75..35282ce28 100644 --- a/tensorrt_llm/auto_parallel/tensor_parallel/plugin_nodes/gpt_attention_node.py +++ b/tensorrt_llm/auto_parallel/tensor_parallel/plugin_nodes/gpt_attention_node.py @@ -26,6 +26,7 @@ class IdxEntry(Enum): KV_CACHE_BLOCK_OFFSETS = auto() HOST_KV_CACHE_BLOCK_OFFSETS = auto() HOST_KV_CACHE_POOL_POINTERS = auto() + HOST_KV_CACHE_POOL_MAPPING = auto() PAST_KEY_VALUE = auto() KV_CACHE_QUANTIZATION_SCALE = auto() KV_CACHE_DEQUANTIZATION_SCALE = auto() @@ -101,6 +102,8 @@ def is_entry_used(self, entry: IdxEntry) -> bool: return self.use_cache and self.paged_kv_cache elif entry == IdxEntry.HOST_KV_CACHE_POOL_POINTERS: return self.use_cache and 
self.paged_kv_cache + elif entry == IdxEntry.HOST_KV_CACHE_POOL_MAPPING: + return self.use_cache and self.paged_kv_cache elif entry == IdxEntry.PAST_KEY_VALUE: return self.use_cache and not self.paged_kv_cache elif entry == IdxEntry.KV_CACHE_QUANTIZATION_SCALE: diff --git a/tensorrt_llm/bench/run/__init__.py b/tensorrt_llm/bench/benchmark/__init__.py similarity index 100% rename from tensorrt_llm/bench/run/__init__.py rename to tensorrt_llm/bench/benchmark/__init__.py diff --git a/tensorrt_llm/bench/run/dataclasses.py b/tensorrt_llm/bench/benchmark/dataclasses.py similarity index 62% rename from tensorrt_llm/bench/run/dataclasses.py rename to tensorrt_llm/bench/benchmark/dataclasses.py index e507f0bdb..9a020a740 100644 --- a/tensorrt_llm/bench/run/dataclasses.py +++ b/tensorrt_llm/bench/benchmark/dataclasses.py @@ -2,13 +2,19 @@ from importlib.util import find_spec from pathlib import Path -from typing import Any, List, Optional +from typing import Any, List, Optional, Union from pydantic import (BaseModel, Field, PositiveFloat, computed_field, - model_validator) + field_validator, model_validator) import tensorrt_llm.bindings.executor as trtllm from tensorrt_llm.bench.enums import IFBSchedulingPolicy +from tensorrt_llm.models.modeling_utils import SpeculativeDecodingMode + +SPECULATIVE_MAP = { + SpeculativeDecodingMode.NONE: lambda *args: None, + SpeculativeDecodingMode.MEDUSA: trtllm.DecodingMode.Medusa, +} class RuntimeConfig(BaseModel): @@ -17,6 +23,8 @@ class RuntimeConfig(BaseModel): sw_version: str settings_config: ExecutorSettingsConfig world_config: ExecutorWorldConfig + decoding_config: DecodingConfig + performance_options: PerformanceOptions def get_config(self) -> trtllm.ExecutorConfig: return trtllm.ExecutorConfig( @@ -29,8 +37,60 @@ def get_config(self) -> trtllm.ExecutorConfig: max_batch_size=self.settings_config.max_batch_size, max_num_tokens=self.settings_config.max_num_tokens, enable_chunked_context=self.settings_config.chunking, + extended_runtime_perf_knob_config=self.performance_options. + get_perf_config(), + decoding_config=self.decoding_config.get_decoding_config(), ) + @model_validator(mode="after") + def validate_full_config(self) -> RuntimeConfig: + # TODO: Check engine to make sure it can support Medusa. + return self + + +class PerformanceOptions(BaseModel): + cuda_graphs: bool = False + multi_block_mode: bool = False + cuda_graph_cache_size: int = 1000 + + def get_perf_config(self) -> trtllm.ExtendedRuntimePerfKnobConfig: + config = trtllm.ExtendedRuntimePerfKnobConfig() + config.cuda_graph_mode = self.cuda_graphs + config.multi_block_mode = self.multi_block_mode + config.cuda_graph_cache_size = self.cuda_graph_cache_size + + return config + + +class DecodingConfig(BaseModel): + medusa_choices: Optional[List[List[int]]] = None + decoding_mode: SpeculativeDecodingMode = SpeculativeDecodingMode.NONE + + @field_validator("decoding_mode") + @classmethod + def decoding_mode_validator( + cls, value: Union[str, int, + SpeculativeDecodingMode]) -> SpeculativeDecodingMode: + return SpeculativeDecodingMode(value) + + @model_validator(mode="after") + def validate_speculative_decoding(self) -> DecodingConfig: + if self.medusa_choices and self.decoding_mode != SpeculativeDecodingMode.MEDUSA: + raise RuntimeError( + "Attempting to use set Medusa choices with a non-Medusa engine." 
+ " Verify that you are using a Medusa engine.") + + return self + + def get_decoding_config(self) -> trtllm.DecodingConfig: + """Create a populated TRT-LLM DecodingConfig.""" + kwargs = {"decoding_mode": SPECULATIVE_MAP[self.decoding_mode]()} + + if self.medusa_choices is not None: + kwargs["medusa_choices"] = self.medusa_choices + + return trtllm.DecodingConfig(**kwargs) + class ExecutorWorldConfig(BaseModel): pp_size: int = 1 @@ -101,9 +161,10 @@ class RequestRecord(BaseModel): start_timestamp: int = -1 first_token_timestamp: int = -1 end_timestamp: int = -1 + decode_iteration: int = 0 def register_event(self, is_error: bool, is_final: bool, timestamp: int, - tokens: List[int]) -> None: + decoding_iter: int, tokens: List[int]) -> None: if is_final: self.end_timestamp = timestamp elif self.first_token_timestamp == -1: @@ -113,6 +174,7 @@ def register_event(self, is_error: bool, is_final: bool, timestamp: int, self.error_tokens += 1 self.tokens += tokens + self.decode_iteration = decoding_iter @computed_field def num_output_tokens(self) -> int: @@ -124,16 +186,18 @@ def num_generated_tokens(self) -> int: @computed_field def generation_time(self) -> int: - return self.end_timestamp - self.time_to_first_token + return self.end_to_end_latency - self.time_to_first_token @computed_field def time_to_first_token(self) -> int: - return self.first_token_timestamp - self.start_timestamp + return (self.first_token_timestamp - + self.start_timestamp if self.first_token_timestamp > 0 else 0.0) @computed_field def intertoken_latency(self) -> float: - return (self.end_timestamp - - self.first_token_timestamp) / self.num_generated_tokens + return ((self.end_timestamp - self.first_token_timestamp) / + self.num_generated_tokens + if self.num_generated_tokens > 0 else 0.0) @computed_field def end_to_end_latency(self) -> int: @@ -145,7 +209,7 @@ def total_token_throughput(self) -> float: @computed_field def output_token_throughput(self) -> float: - return self.num_output_tokens / self.generation_time + return (self.num_generated_tokens / self.generation_time) class PercentileStats(BaseModel): @@ -171,21 +235,41 @@ def from_iterable(cls, values: List[Any]) -> PercentileStats: class BenchmarkStatistics(BaseModel): + # Time-related Properties total_latency_ns: float + + # Token-related Properties total_output_tokens: int total_input_tokens: int + + # General Information num_requests: int issue_rate_ns: float - request_percentiles: Optional[PercentileStats] = None + # Speculative Information + acceptance_rate: float + + # Percentile-related Statistics + request_latency_percentiles: Optional[PercentileStats] = None token_percentiles: Optional[PercentileStats] = None itl_percentiles: Optional[PercentileStats] = None ttft_percentiles: Optional[PercentileStats] = None + generation_tp_percentiles: Optional[PercentileStats] = None + generation_latency_percentiles: Optional[PercentileStats] = None + acceptance_percentiles: Optional[PercentileStats] = None + + @computed_field + def generation_tokens(self) -> int: + return int(self.total_output_tokens - self.num_requests) @computed_field def token_throughput_ns(self) -> float: return float(self.total_output_tokens) / self.total_latency_ns + @computed_field + def generation_token_throughput_ns(self) -> float: + return self.generation_tp_percentiles.average + @computed_field def request_throughput_ns(self) -> float: return float(self.num_requests) / self.total_latency_ns diff --git a/tensorrt_llm/bench/benchmark/low_latency.py b/tensorrt_llm/bench/benchmark/low_latency.py 
new file mode 100644 index 000000000..8b0ea5612 --- /dev/null +++ b/tensorrt_llm/bench/benchmark/low_latency.py @@ -0,0 +1,336 @@ +from __future__ import annotations + +import json +import os +from copy import deepcopy +from pathlib import Path +from time import monotonic_ns, sleep +from typing import List + +import click +import yaml +from click_option_group import optgroup + +from tensorrt_llm.models.modeling_utils import SpeculativeDecodingMode + +os.environ["TLLM_LOG_LEVEL"] = "WARNING" + +import tensorrt_llm.bindings.executor as trtllm +from tensorrt_llm.bench.benchmark.dataclasses import (BenchmarkStatistics, + RuntimeConfig) +from tensorrt_llm.bench.benchmark.utils import (StatsKeeper, + get_executor_requests, + get_settings_from_engine) +from tensorrt_llm.bench.dataclasses import BenchmarkEnvironment +from tensorrt_llm.bench.enums import IFBSchedulingPolicy +from tensorrt_llm.bench.utils.data import (create_dataset_from_stream, + initialize_tokenizer) +from tensorrt_llm.logger import logger + + +@click.command(name="latency") +@optgroup.group("Engine run configuration", + help="Runtime settings for executing a TensorRT-LLM engine.") +@optgroup.option( + "--engine_dir", + type=click.Path(exists=True, + readable=True, + path_type=Path, + resolve_path=True), + required=True, + help="Path to a serialized TRT-LLM engine.", +) +@optgroup.option( + "--kv_cache_free_gpu_mem_fraction", + type=float, + default=.90, + help="The percentage of memory to use for KV Cache after model load.", +) +@optgroup.group( + "Engine Input Configuration", + help="Input configuration for driving the engine.", +) +@optgroup.option( + "--dataset", + type=click.Path(exists=True, + readable=True, + path_type=Path, + resolve_path=True), + default=None, + help="Pass in a dataset file for parsing instead of stdin.", +) +@optgroup.option( + "--num_requests", + type=int, + default=0, + help="Number of requests to cap benchmark run at. 
Minimum between value and" + "length of dataset.", +) +@optgroup.group("Speculative Decode Options", + help="Runtime settings for executing a TensorRT-LLM engine.") +@optgroup.option( + "--medusa_choices", + type=click.Path(exists=True, + readable=True, + path_type=Path, + resolve_path=True), + default=None, + required=False, + help="Path to a YAML file that defines the Medusa tree.", +) +@click.pass_obj +def latency_command( + bench_env: BenchmarkEnvironment, + **params, +) -> None: + """Run a throughput test on a TRT-LLM engine.""" + + logger.set_level("info") + logger.info("Preparing to run latency benchmark...") + # Parameters from CLI + # Model, experiment, and engine params + dataset_path: Path = params.pop("dataset") + num_requests: int = params.pop("num_requests") + model: str = bench_env.model + engine_dir: Path = params.pop("engine_dir") + # Engine configuration parsing + exec_settings, build_cfg = get_settings_from_engine(engine_dir) + exec_settings["model"] = model + engine_bs = exec_settings["settings_config"]["max_batch_size"] + engine_tokens = exec_settings["settings_config"]["max_num_tokens"] + engine_max_seq_len = build_cfg["max_seq_len"] + + # Runtime Options + kv_cache_percent = params.pop("kv_cache_free_gpu_mem_fraction") + medusa_choices = params.pop("medusa_choices") + + # Update configuration with runtime options + exec_settings["settings_config"]["kv_cache_percent"] = kv_cache_percent + exec_settings["settings_config"]["max_batch_size"] = engine_bs + exec_settings["settings_config"]["max_num_tokens"] = engine_tokens + exec_settings["settings_config"]["beam_width"] = 1 + exec_settings["settings_config"]["chunking"] = False + exec_settings["settings_config"][ + "scheduler_policy"] = IFBSchedulingPolicy.NO_EVICT + + # Performance options + exec_settings["performance_options"]["cuda_graphs"] = True + exec_settings["performance_options"]["multi_block_mode"] = True + + # Decoding Options + if medusa_choices is not None: + with open(medusa_choices, "r") as medusa_yml: + exec_settings["decoding_config"]["medusa_choices"] = \ + yaml.load(medusa_yml, Loader=yaml.SafeLoader) + + # Construct the runtime configuration dataclass. + runtime_config = RuntimeConfig(**exec_settings) + + # Initialize the HF tokenizer for the specified model. + ignore_eos = True if runtime_config.decoding_config.decoding_mode == SpeculativeDecodingMode.NONE else False + tokenizer = initialize_tokenizer(bench_env.model) + eos_id = tokenizer.eos_token_id if not ignore_eos else -1 + pad_id = tokenizer.pad_token_id if not ignore_eos else -1 + + # Dataset Loading and Preparation + with open(dataset_path, "r") as dataset: + metadata, requests = create_dataset_from_stream( + tokenizer, dataset, num_requests=num_requests) + + if metadata.max_sequence_length > engine_max_seq_len: + raise RuntimeError( + f"Engine supports a max sequence of {engine_max_seq_len}. Provided " + "dataset contains a maximum sequence of " + f"{metadata.max_sequence_length}. Please rebuild a new engine to" + "support this dataset.") + + # Dataset Loading and Preparation + executor_requests = get_executor_requests( + requests, + True, + eos_id=eos_id, + pad_id=pad_id, + ) + del requests + + # Instantiate the low latency benchmark. + benchmark = LatencyBenchmark( + executor_requests, + runtime_config, + ) + + try: + logger.info("Ready to start benchmark.") + benchmark.start_benchmark() + benchmark.report_statistics() + except KeyboardInterrupt: + logger.info("Benchmark interrupted! 
Shutting down...") + finally: + benchmark.stop_benchmark() + + +class LatencyBenchmark: + """Latency benchmark utility class.""" + + def __init__( + self, + dataset: List[trtllm.Request], + runtime_cfg: RuntimeConfig, + ) -> None: + """Initialize the throughput benchmark. + + Args: + dataset (List[trtllm.Request]): A dataset of TRT-LLM requests to + benchmark against. + runtime_cfg (RuntimeConfig): Runtime configuration. + """ + # Dataset and input properties. + self.requests = dataset + self.runtime_config = deepcopy(runtime_cfg) + self.streaming = True + + # Benchmark stats and time tracking. + self.start_time = None + self.end_time = None + self.submitted_requests = 0 + self.statistics = StatsKeeper() + + logger.info("Starting Executor backend...") + self.executor = None + logger.info("Executor started.") + + def _setup_environment(self) -> None: + # TODO: Once passing of variables is fixed, these should work + # when using MPI in C++ runtime. + os.environ["TRTLLM_ENABLE_MMHA_MULTI_BLOCK_DEBUG"] = "1" + os.environ["TRTLLM_MMHA_KERNEL_BLOCK_SIZE"] = "256" + os.environ["TRTLLM_MMHA_KERNEL_BLOCK_SIZE"] = "32" + os.environ["FORCE_MULTI_BLOCK_MODE"] = "1" + os.environ["TRTLLM_ENABLE_PDL"] = "1" + + def start_benchmark(self) -> None: + """Start the benchmark.""" + logger.info("Initializing backend...") + self._setup_environment() + self.executor = trtllm.Executor( + self.runtime_config.engine_dir, + trtllm.ModelType.DECODER_ONLY, + executor_config=self.runtime_config.get_config()) + + logger.info("WAITING ON EXECUTOR...") + while not self.executor.can_enqueue_requests(): + logger.info("Waiting for executor to stand up...") + sleep(1) + + logger.info("Low latency benchmark started.") + self.start_time = monotonic_ns() + while len(self.requests) > 0: + final = False + request = self.requests.pop(0) + + req_id = self.executor.enqueue_request(request) + self.statistics.register_request(req_id, monotonic_ns(), + len(request.input_token_ids)) + + while not final: + responses = self.executor.await_responses(req_id) + now = monotonic_ns() + for resp in responses: + self.statistics.register_response( + req_id, now, resp.result.is_final, resp.has_error(), + resp.result.decoding_iter, + resp.result.output_token_ids[0]) + final = resp.result.is_final + + self.end_time = monotonic_ns() + logger.info("Low latency benchmark finished.") + + def stop_benchmark(self) -> None: + """Stop the benchmark and clean up backend and threads.""" + logger.info("Benchmark Shutdown called!") + if self.executor is not None: + self.executor.shutdown() + logger.info("Executor shutdown.") + + def report_statistics(self) -> BenchmarkStatistics: + """Report internal statistics about benchmark.""" + + config_path = self.runtime_config.engine_dir / "config.json" + with open(config_path, "r") as config: + engine_config = json.load(config) + + stats = self.statistics.generate_statistics_summary() + rt_cfg = self.runtime_config + build_cfg = engine_config["build_config"] + pretrain_cfg = engine_config["pretrained_config"] + + logging_info = ( + "\n\n===========================================================\n" + "= ENGINE DETAILS\n" + "===========================================================\n" + f"Model:\t\t\t{rt_cfg.model}\n" + f"Engine Directory:\t{rt_cfg.engine_dir}\n" + f"TensorRT-LLM Version:\t{rt_cfg.sw_version}\n" + f"Dtype:\t\t\t{pretrain_cfg['dtype']}\n" + f"KV Cache Dtype:\t\t{pretrain_cfg['quantization']['kv_cache_quant_algo']}\n" + f"Quantization:\t\t{pretrain_cfg['quantization']['quant_algo']}\n" + f"Max Input 
Length:\t{build_cfg['max_input_len']}\n" + f"Max Sequence Length:\t{build_cfg['max_seq_len']}\n" + f"\n" + "===========================================================\n" + "= WORLD + RUNTIME INFORMATION \n" + "===========================================================\n" + f"TP Size:\t\t{rt_cfg.world_config.tp_size}\n" + f"PP Size:\t\t{rt_cfg.world_config.pp_size}\n" + f"Max Runtime Batch Size:\t{rt_cfg.settings_config.max_batch_size}\n" + f"Max Runtime Tokens:\t{rt_cfg.settings_config.max_num_tokens}\n" + f"Scheduling Policy:\t{rt_cfg.settings_config.scheduler_policy.values[1]}\n" + f"KV Memory Percentage:\t{rt_cfg.settings_config.kv_cache_percent * 100.0:.2f}%\n" + f"\n" + "===========================================================\n" + "= GENERAL OVERVIEW \n" + "===========================================================\n" + f"Number of requests:\t\t{stats.num_requests}\n" + f"Average Input Length (tokens):\t{stats.average_input_length:.4f}\n" + f"Average Output Length (tokens):\t{stats.average_output_length:.4f}\n" + f"Average request latency (ms):\t{stats.request_latency_percentiles.average * 1.0e-6:.4f}\n" + f"\n" + "===========================================================\n" + "= THROUGHPUT OVERVIEW \n" + "===========================================================\n" + f"Request Throughput (req/sec):\t\t {stats.request_throughput_ns * 1.0e9:.4f}\n" + f"Total Token Throughput (tokens/sec):\t {stats.token_throughput_ns * 1.0e9:.4f}\n" + f"Generation Token Throughput (tokens/sec): {stats.generation_tp_percentiles.average * 1.0e9:.4f}\n" + f"\n" + "===========================================================\n" + "= LATENCY OVERVIEW \n" + "===========================================================\n" + f"Total Latency (ms):\t\t {stats.total_latency_ns * 1.0e-6:.4f}\n" + f"Average time-to-first-token (ms): {stats.ttft_percentiles.average * 1.0e-6:.4f}\n" + f"Average inter-token latency (ms): {stats.itl_percentiles.average * 1.0e-6:.4f}\n" + f"Acceptance Rate (Speculative):\t {stats.acceptance_rate:.2f}\n" + f"\n" + "===========================================================\n" + "= GENERATION LATENCY BREAKDOWN \n" + "===========================================================\n" + f"MIN (ms): {stats.generation_latency_percentiles.minimum * 1.0e-6:.4f}\n" + f"MAX (ms): {stats.generation_latency_percentiles.maximum * 1.0e-6:.4f}\n" + f"AVG (ms): {stats.generation_latency_percentiles.average * 1.0e-6:.4f}\n" + f"P90 (ms): {stats.generation_latency_percentiles.p50 * 1.0e-6:.4f}\n" + f"P95 (ms): {stats.generation_latency_percentiles.p95 * 1.0e-6:.4f}\n" + f"P99 (ms): {stats.generation_latency_percentiles.p99 * 1.0e-6:.4f}\n" + f"\n" + "===========================================================\n" + "= ACCEPTANCE BREAKDOWN \n" + "===========================================================\n" + f"MIN: {stats.acceptance_percentiles.minimum:.2f}\n" + f"MAX: {stats.acceptance_percentiles.maximum:.2f}\n" + f"AVG: {stats.acceptance_percentiles.average:.2f}\n" + f"P90: {stats.acceptance_percentiles.p50:.2f}\n" + f"P95: {stats.acceptance_percentiles.p95:.2f}\n" + f"P99: {stats.acceptance_percentiles.p99:.2f}\n" + f"\n" + "===========================================================\n") + + logger.info(logging_info) + return stats diff --git a/tensorrt_llm/bench/run/run.py b/tensorrt_llm/bench/benchmark/throughput.py similarity index 89% rename from tensorrt_llm/bench/run/run.py rename to tensorrt_llm/bench/benchmark/throughput.py index 4cf22d836..eeb5d5764 100644 --- 
a/tensorrt_llm/bench/run/run.py +++ b/tensorrt_llm/bench/benchmark/throughput.py @@ -13,14 +13,15 @@ from click_option_group import optgroup import tensorrt_llm.bindings.executor as trtllm +from tensorrt_llm.bench.benchmark.dataclasses import (BenchmarkStatistics, + RuntimeConfig) +from tensorrt_llm.bench.benchmark.utils import (ResponseTuple, StatsKeeper, + get_executor_requests, + get_settings_from_engine) from tensorrt_llm.bench.dataclasses import BenchmarkEnvironment from tensorrt_llm.bench.enums import IFBSchedulingPolicy -from tensorrt_llm.bench.run.dataclasses import (BenchmarkStatistics, - RuntimeConfig) -from tensorrt_llm.bench.run.utils import (ResponseTuple, StatsKeeper, - get_executor_request, - get_settings_from_engine) -from tensorrt_llm.bench.utils.data import generate_dataset_from_stream +from tensorrt_llm.bench.utils.data import (create_dataset_from_stream, + initialize_tokenizer) from tensorrt_llm.logger import logger @@ -92,7 +93,7 @@ help="Enable streaming mode for requests.", ) @click.pass_obj -def run_command( +def throughput_command( bench_env: BenchmarkEnvironment, **params, ) -> None: @@ -114,6 +115,13 @@ def run_command( engine_tokens = exec_settings["settings_config"]["max_num_tokens"] engine_max_seq_len = build_cfg["max_seq_len"] + # Check that we are not using a low latency engine + # Right now, this is based on max batch size. + if engine_bs == 1: + raise ValueError( + "An engine with a batch size greater than 1 should be used for " + "throughput benchmarking. Exiting.") + # Runtime Options runtime_max_bs = params.pop("max_batch_size") runtime_max_bs = runtime_max_bs if runtime_max_bs else engine_bs @@ -133,9 +141,13 @@ def run_command( # Construct the runtime configuration dataclass. runtime_config = RuntimeConfig(**exec_settings) + # Initialize the HF tokenizer for the specified model. + tokenizer = initialize_tokenizer(bench_env.model) + # Dataset Loading and Preparation - metadata, requests = generate_dataset_from_stream(dataset_path, model, - num_requests) + with open(dataset_path, "r") as dataset: + metadata, requests = create_dataset_from_stream( + tokenizer, dataset, num_requests=num_requests) # TODO: Verify that the engine can handle the max/min ISL/OSL. if metadata.max_sequence_length > engine_max_seq_len: raise RuntimeError( @@ -143,20 +155,19 @@ def run_command( "dataset contains a maximum sequence of " f"{metadata.max_sequence_length}. 
Please rebuild a new engine to" "support this dataset.") - executor_requests = [] - while requests: - request = requests.pop() - executor_requests.append( - get_executor_request(request, - pad_id=-1, - eos_id=-1, - streaming=streaming)) - del request + + # Dataset Loading and Preparation + executor_requests = get_executor_requests( + requests, + streaming, + eos_id=-1, + pad_id=-1, + ) + del requests logger.info("Setting up benchmarker and infrastructure.") new_request_queue = mp.Queue() response_queue = mp.Queue() - logger.set_level("error") benchmark = ThroughputBenchmark( dataset=executor_requests, request_rate=request_rate, @@ -165,7 +176,7 @@ def run_command( response_queue=response_queue, streaming=streaming, ) - logger.set_level("info") + try: logger.info("Ready to start benchmark.") benchmark.start_benchmark() @@ -173,10 +184,8 @@ def run_command( benchmark.stop_benchmark() benchmark.report_statistics() except KeyboardInterrupt: - logger.set_level("error") benchmark.stop_benchmark() finally: - logger.set_level("error") benchmark.shutdown() @@ -195,15 +204,22 @@ def __init__(self, runtime_cfg: RuntimeConfig, logger.info("Initializing Executor.") # Runtime related properties. self.runtime_config: RuntimeConfig = runtime_cfg + # Runtime tracking and multiprocessing. + self.responses = response_queue + self._shutdown = Event() + self.backend_ready = Event() + self._resp_daemon_finished = Event() self.executor = trtllm.Executor( self.runtime_config.engine_dir, trtllm.ModelType.DECODER_ONLY, executor_config=self.runtime_config.get_config()) - # Runtime tracking and multiprocessing. - self.responses = response_queue - self._shutdown = Event() - self._resp_daemon_finished = Event() + logger.info("WAITING ON EXECUTOR...") + while not self.executor.can_enqueue_requests(): + logger.info("Waiting for executor to stand up...") + sleep(1) + + self.backend_ready.set() self.response_thread = Thread(target=self.response_daemon) self.response_thread.start() @@ -245,8 +261,8 @@ def _process_response() -> None: if len(responses) > 0: self.responses.put([ ResponseTuple(now, r.request_id, r.result.is_final, - r.has_error(), r.result.output_token_ids[0]) - for r in responses + r.has_error(), r.result.output_token_ids[0], + r.result.decoding_iter) for r in responses ]) while not self._shutdown.is_set(): @@ -282,7 +298,8 @@ def __init__( response_queue (mp.Queue): Process-safe queue for passing request responses to main process. """ - logger.info(f"Initializing Throughput Benchmark. [rate=%d req/s]") + logger.info( + f"Initializing Throughput Benchmark. [rate={request_rate} req/s]") # Dataset and input properties. self.requests = dataset self.delay_func = lambda x: sleep( @@ -313,8 +330,9 @@ def __init__( def enqueue_process(self) -> None: """Method for starting enqueueing requests.""" + logger.info("WAITING ON BACKEND TO BE READY...") + self.executor.backend_ready.wait() logger.info("Request serving started.") - request_generator = self.executor.enqueue(*self.requests) # Iterate the generator until we run out of requests. # Note the walrus operator. 
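The executor wrapper above gates request submission on a `backend_ready` event that is set only after `can_enqueue_requests()` returns true, and it drains responses on a daemon thread into a process-safe queue as `ResponseTuple`s that now carry the decoding iteration. The following is a minimal sketch of that readiness/drain pattern, with `ExecutorManagerSketch` and its `backend` argument standing in for the real `trtllm.Executor` bindings rather than reproducing them:

from collections import namedtuple
from threading import Event, Thread
from time import monotonic_ns, sleep
import multiprocessing as mp

# Field layout mirrors the ResponseTuple used above (decoding_iteration added).
ResponseTuple = namedtuple("ResponseTuple", [
    "timestamp", "request_id", "final", "error", "tokens", "decoding_iteration"
])

class ExecutorManagerSketch:
    """Illustrative wrapper; `backend` stands in for the trtllm.Executor bindings."""

    def __init__(self, backend, response_queue) -> None:
        self.backend = backend
        self.responses = response_queue  # typically an mp.Queue shared with the main process
        self.backend_ready = Event()
        self._shutdown = Event()

        # Gate producers until the backend can actually accept requests.
        while not self.backend.can_enqueue_requests():
            sleep(1)
        self.backend_ready.set()

        # Drain responses on a daemon thread into the process-safe queue.
        self.response_thread = Thread(target=self._response_daemon, daemon=True)
        self.response_thread.start()

    def _response_daemon(self) -> None:
        while not self._shutdown.is_set():
            responses = self.backend.await_responses()  # assumed to block briefly, as above
            now = monotonic_ns()
            if responses:
                self.responses.put([
                    ResponseTuple(now, r.request_id, r.result.is_final,
                                  r.has_error(), r.result.output_token_ids[0],
                                  r.result.decoding_iter) for r in responses
                ])

    def stop(self) -> None:
        self._shutdown.set()
        self.response_thread.join()

Any object exposing `can_enqueue_requests()` and `await_responses()`, with responses carrying `request_id`, `has_error()`, and a `result`, reproduces the ordering the benchmark relies on: enqueueing never starts before the backend is up, and timestamps are taken as responses arrive.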
@@ -378,13 +396,14 @@ def _process_requests() -> None: while not self.response_queue.empty(): responses: Tuple[ int, - List[trtllm.Response]] = self.response_queue.get_nowait() + List[ResponseTuple]] = self.response_queue.get_nowait() for response in responses: self.statistics.register_response( response.request_id, response.timestamp, response.final, response.error, + response.decoding_iteration, response.tokens, ) @@ -457,7 +476,7 @@ def report_statistics(self) -> BenchmarkStatistics: "===========================================================\n" "= STREAMING STATISTICS \n" "===========================================================\n" - f"Average request latency (ms):\t\t{stats.request_percentiles.average * 1.0e-6:.4f}\n" + f"Average request latency (ms):\t\t{stats.request_latency_percentiles.average * 1.0e-6:.4f}\n" f"Average time-to-first-token (ms):\t{stats.ttft_percentiles.average * 1.0e-6:.4f}\n" f"Average inter-token latency (ms):\t{stats.itl_percentiles.average * 1.0e-6:.4f}\n" ) diff --git a/tensorrt_llm/bench/run/utils.py b/tensorrt_llm/bench/benchmark/utils.py similarity index 60% rename from tensorrt_llm/bench/run/utils.py rename to tensorrt_llm/bench/benchmark/utils.py index ad69d160e..d2fad8ca8 100644 --- a/tensorrt_llm/bench/run/utils.py +++ b/tensorrt_llm/bench/benchmark/utils.py @@ -6,12 +6,33 @@ from typing import Dict, List, Tuple, Union import tensorrt_llm.bindings.executor as trtllm -from tensorrt_llm.bench.run.dataclasses import (BenchmarkStatistics, - PercentileStats, RequestRecord) +from tensorrt_llm.bench.benchmark.dataclasses import (BenchmarkStatistics, + PercentileStats, + RequestRecord) from tensorrt_llm.bindings import InferenceRequest -ResponseTuple = namedtuple( - "ResponseTuple", ["timestamp", "request_id", "final", "error", "tokens"]) +ResponseTuple = namedtuple("ResponseTuple", [ + "timestamp", "request_id", "final", "error", "tokens", "decoding_iteration" +]) + + +def get_executor_requests( + requests: List[InferenceRequest], + streaming: bool, + eos_id: int, + pad_id: int, +) -> List[trtllm.Request]: + executor_requests = [] + while requests: + request = requests.pop() + executor_requests.append( + get_executor_request(request, + pad_id=pad_id, + eos_id=eos_id, + streaming=streaming)) + del request + + return executor_requests def get_executor_request(request: InferenceRequest, @@ -62,6 +83,10 @@ def get_settings_from_engine( "world_config": world_config, }) + runtime_config["performance_options"] = {} + runtime_config["decoding_config"] = { + "decoding_mode": engine_build_cfg["speculative_decoding_mode"] + } return runtime_config, engine_build_cfg @@ -74,7 +99,7 @@ def __init__(self) -> None: def register_request( self, request_id: int, - timestamp: float, + timestamp: int, num_tokens: int, ) -> None: record = self.requests[request_id] @@ -82,9 +107,10 @@ def register_request( record.start_timestamp = timestamp def register_response(self, request_id: int, timestamp: int, final: bool, - error: bool, tokens: List[int]) -> None: + error: bool, decode_iter: int, + tokens: List[int]) -> None: record = self.requests[request_id] - record.register_event(error, final, timestamp, tokens) + record.register_event(error, final, timestamp, decode_iter, tokens) if final: self.num_complete = self.num_complete + 1 @@ -96,7 +122,11 @@ def generate_statistics_summary(self) -> None: end_time = -1 request_latencies = [] + generation_latencies = [] + generation_throughputs = [] intertoken_avg_latencies = [] + request_acceptance = [] + total_decoding_iterations = 0 
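The accumulators initialized here feed a speculative-decoding acceptance calculation further down: tokens produced per decoder iteration, computed per request as `num_generated_tokens / decode_iteration` and globally as `total_output_tokens / total_decoding_iterations`, with the per-request values summarized into percentiles. A self-contained sketch of that aggregation, using a hypothetical `RequestRecordSketch` and an inline percentile helper in place of the benchmark's `PercentileStats`:

from dataclasses import dataclass
from statistics import mean
from typing import Dict, List

@dataclass
class RequestRecordSketch:
    # Hypothetical minimal record; the real RequestRecord also tracks timestamps.
    num_generated_tokens: int
    decode_iteration: int

def acceptance_summary(records: List[RequestRecordSketch]) -> Dict[str, float]:
    """Tokens produced per decoder iteration, per request and globally."""
    per_request = sorted(r.num_generated_tokens / r.decode_iteration for r in records)
    total_tokens = sum(r.num_generated_tokens for r in records)
    total_iters = sum(r.decode_iteration for r in records)
    pick = lambda p: per_request[min(len(per_request) - 1, int(p * len(per_request)))]
    return {
        "global_acceptance": total_tokens / total_iters,
        "average": mean(per_request),
        "p50": pick(0.50),
        "p95": pick(0.95),
        "p99": pick(0.99),
    }

# 1.0 means one new token per iteration (no speculative gain); >1.0 means drafts were accepted.
print(acceptance_summary([RequestRecordSketch(128, 64),
                          RequestRecordSketch(96, 96),
                          RequestRecordSketch(120, 40)]))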
ttft_times = [] last_queue_time = 0.0 queue_time_total = 0.0 @@ -104,26 +134,42 @@ def generate_statistics_summary(self) -> None: for entry in self.requests.values(): start_time = min(entry.start_timestamp, start_time) end_time = max(entry.end_timestamp, end_time) - queue_time_total += entry.start_timestamp - last_queue_time - last_queue_time = entry.start_timestamp + last_queue_time = max(entry.start_timestamp, last_queue_time) + request_ar = entry.num_generated_tokens / entry.decode_iteration request_latencies.append(entry.end_to_end_latency) + generation_latencies.append(entry.generation_time) + generation_throughputs.append(entry.output_token_throughput) ttft_times.append(entry.time_to_first_token) intertoken_avg_latencies.append(entry.intertoken_latency) + request_acceptance.append(request_ar) + total_decoding_iterations += entry.decode_iteration total_output_tokens += entry.num_output_tokens total_input_tokens += entry.num_input_tokens + global_acceptance_rate = total_output_tokens / total_decoding_iterations + queue_time_total = last_queue_time - start_time + percentile_request_accept = PercentileStats.from_iterable( + request_acceptance) if request_acceptance else None + stats = BenchmarkStatistics( num_requests=num_requests, total_latency_ns=end_time - start_time, total_output_tokens=total_output_tokens, total_input_tokens=total_input_tokens, - request_percentiles=PercentileStats.from_iterable( + acceptance_rate=global_acceptance_rate, + request_latency_percentiles=PercentileStats.from_iterable( request_latencies), itl_percentiles=PercentileStats.from_iterable( intertoken_avg_latencies), ttft_percentiles=PercentileStats.from_iterable(ttft_times), - issue_rate_ns=queue_time_total / num_requests) + generation_tp_percentiles=PercentileStats.from_iterable( + generation_throughputs), + generation_latency_percentiles=PercentileStats.from_iterable( + generation_latencies), + issue_rate_ns=queue_time_total / num_requests, + acceptance_percentiles=percentile_request_accept, + ) return stats diff --git a/tensorrt_llm/bench/build/benchmark_config.yml b/tensorrt_llm/bench/build/benchmark_config.yml index ab432c85d..1f28fb492 100644 --- a/tensorrt_llm/bench/build/benchmark_config.yml +++ b/tensorrt_llm/bench/build/benchmark_config.yml @@ -44,7 +44,7 @@ meta-llama/Meta-Llama-3-8B: general: max_batch_size: 2048 max_num_tokens: 8192 -meta-llama/Meta-Llama-3.1-8B: +meta-llama/Llama-3.1-8B: &llama_3_1_8b tp1_pp1: general: max_batch_size: 2048 @@ -69,7 +69,7 @@ meta-llama/Meta-Llama-3-70B: general: max_batch_size: 8192 max_num_tokens: 16384 -meta-llama/Meta-Llama-3.1-70B: +meta-llama/Llama-3.1-70B: &llama_3_1_70b tp1_pp1: general: max_batch_size: 2048 @@ -89,27 +89,36 @@ meta-llama/Meta-Llama-3.1-70B: general: max_batch_size: 8192 max_num_tokens: 16384 -meta-llama/Meta-Llama-3.1-405B: +meta-llama/Llama-3.1-405B: &llama_3_1_405b tp8_pp1: general: - max_batch_size: 320 - max_num_tokens: 5440 + max_batch_size: 1024 + max_num_tokens: 4096 256: max_batch_size: 2048 max_num_tokens: 4096 + 2000: + max_batch_size: 1280 + max_num_tokens: 2560 + 2176: + max_batch_size: 1024 + max_num_tokens: 4096 2500: - max_batch_size: 320 - max_num_tokens: 512 + max_batch_size: 1024 + max_num_tokens: 2048 4096: - max_batch_size: 192 - max_num_tokens: 512 + max_batch_size: 512 + max_num_tokens: 2048 + 4224: + max_batch_size: 512 + max_num_tokens: 2048 5500: - max_batch_size: 192 - max_num_tokens: 512 + max_batch_size: 512 + max_num_tokens: 5120 22000: - max_batch_size: 64 - max_num_tokens: 768 -mistralai/Mixtral-8x7B-v0.1: + 
max_batch_size: 128 + max_num_tokens: 2048 +mistralai/Mixtral-8x7B-v0.1: &mixtral_8x7b_0_1 tp2_pp1: general: max_batch_size: 2048 @@ -122,8 +131,31 @@ mistralai/Mixtral-8x7B-v0.1: general: max_batch_size: 8192 max_num_tokens: 8192 +mistralai/Mixtral-8x22B-v0.1: &mixtral_8x22b_0_1 + tp8_pp1: + 256: + max_batch_size: 8192 + max_num_tokens: 16384 + 2176: + max_batch_size: 2048 + max_num_tokens: 16384 + 4224: + max_batch_size: 1024 + max_num_tokens: 2048 + 5500: + max_batch_size: 1024 + max_num_tokens: 8192 + general: + max_batch_size: 2048 + max_num_tokens: 8192 mistralai/Mistral-7B-v0.1: tp1_pp1: general: max_batch_size: 4098 max_num_tokens: 8192 + +meta-llama/Llama-3.1-8B-Instruct: *llama_3_1_8b +meta-llama/Llama-3.1-70B-Instruct: *llama_3_1_70b +meta-llama/Llama-3.1-405B-Instruct: *llama_3_1_405b +mistralai/Mixtral-8x7B-Instruct-v0.1: *mixtral_8x7b_0_1 +mistralai/Mixtral-8x22B-Instruct-v0.1: *mixtral_8x22b_0_1 diff --git a/tensorrt_llm/bench/build/build.py b/tensorrt_llm/bench/build/build.py index 4123e870d..5946b2d20 100644 --- a/tensorrt_llm/bench/build/build.py +++ b/tensorrt_llm/bench/build/build.py @@ -1,8 +1,6 @@ from __future__ import annotations from pathlib import Path -from select import select -from sys import stdin from typing import Dict, get_args import click from click_option_group import AllOptionGroup, optgroup, RequiredMutuallyExclusiveOptionGroup @@ -170,25 +168,18 @@ def build_command( # Dataset options dataset_path: Path = params.pop("dataset") max_seq_len: int = params.pop("max_seq_length") - data_on_stdin: bool = bool(len(select([ - stdin, - ], [], [], 0.0)[0])) - # Initialize the HF tokenizer for the specified model. tokenizer = initialize_tokenizer(bench_env.model) # If we are receiving data from a path or stdin, parse and gather metadata. - if dataset_path or data_on_stdin: + if dataset_path: logger.info("Found dataset.") - # Cannot set the data file path and pipe in from stdin. Choose one. - if dataset_path is not None and data_on_stdin: - raise ValueError( - "Cannot provide a dataset on both stdin and by --dataset " - "option. Please pick one.") - stream = stdin if data_on_stdin else open(dataset_path, "r") - # Parse the dataset from stdin and return it plus its metadata. - metadata, _ = \ - create_dataset_from_stream(tokenizer, stream=stream) + # Dataset Loading and Preparation + with open(dataset_path, "r") as dataset: + metadata, _ = create_dataset_from_stream( + tokenizer, + dataset, + ) # The max sequence length option for build is the sum of max osl + isl. 
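`create_dataset_from_stream` walks the JSONL dataset once, tokenizing each prompt and tracking the largest input length, the largest requested output length, and their per-request sum; that sum is what the build step adopts as `max_seq_len` below. A compact sketch of that scan is shown here, with a whitespace token counter standing in for the HF tokenizer initialized above; the `task_id`, `prompt`, and `output_tokens` fields match the dataset format the benchmark consumes:

import json
from io import StringIO
from typing import Callable, TextIO, Tuple

def scan_dataset(stream: TextIO, count_tokens: Callable[[str], int]) -> Tuple[int, int, int]:
    """Return (max_isl, max_osl, max_sequence_length) for a JSONL benchmark dataset."""
    max_isl = max_osl = max_seq = 0
    while line := stream.readline():        # same walrus-style loop as create_dataset_from_stream
        data = json.loads(line)
        isl = count_tokens(data["prompt"])  # the real code tokenizes unless "logits" are provided
        osl = data["output_tokens"]
        max_isl = max(max_isl, isl)
        max_osl = max(max_osl, osl)
        max_seq = max(max_seq, isl + osl)
    return max_isl, max_osl, max_seq

# Toy run with a whitespace "tokenizer"; a real run passes the HF tokenizer instead.
sample = StringIO('{"task_id": 0, "prompt": "There once was a man who", "output_tokens": 128}\n')
print(scan_dataset(sample, lambda text: len(text.split())))  # -> (6, 128, 134)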
max_seq_len = metadata.max_sequence_length logger.info(metadata.get_summary_for_print()) diff --git a/tensorrt_llm/bench/dataclasses.py b/tensorrt_llm/bench/dataclasses.py index a4238689b..f49b88a0d 100644 --- a/tensorrt_llm/bench/dataclasses.py +++ b/tensorrt_llm/bench/dataclasses.py @@ -88,12 +88,13 @@ class DatasetMetadata(BaseModel): num_requests: int def get_summary_for_print(self) -> str: - return ("===========================================================\n" - "= DATASET DETAILS\n" - "===========================================================\n" - f"Max Input Sequence Length:\t{self.max_isl}\n" - f"Max Output Sequence Length:\t{self.max_osl}\n" - f"Max Sequence Length:\t{self.max_sequence_length}\n" - f"Number of Sequences:\t{self.num_requests}\n" - "===========================================================\n" - f"\n") + return ( + "\n===========================================================\n" + "= DATASET DETAILS\n" + "===========================================================\n" + f"Max Input Sequence Length:\t{self.max_isl}\n" + f"Max Output Sequence Length:\t{self.max_osl}\n" + f"Max Sequence Length:\t{self.max_sequence_length}\n" + f"Number of Sequences:\t{self.num_requests}\n" + "===========================================================\n" + f"\n") diff --git a/tensorrt_llm/bench/utils/data.py b/tensorrt_llm/bench/utils/data.py index 4f6380325..b6d00a345 100644 --- a/tensorrt_llm/bench/utils/data.py +++ b/tensorrt_llm/bench/utils/data.py @@ -1,8 +1,5 @@ import json -import sys from functools import partial -from pathlib import Path -from select import select from typing import List, TextIO, Tuple from transformers import AutoTokenizer, PreTrainedTokenizer @@ -10,33 +7,6 @@ from tensorrt_llm.bench.dataclasses import DatasetMetadata, InferenceRequest -def generate_dataset_from_stream(dataset_path: Path, - model: str, - num_requests: int = 0): - # Check for data on stdin. - data_on_stdin: bool = bool(len(select([ - sys.stdin, - ], [], [], 0.0)[0])) - - # Cannot set the data file path and pipe in from stdin. Choose one. - if dataset_path is not None and data_on_stdin: - raise ValueError( - "Cannot provide a dataset on both stdin and by --dataset option. " - "Please pick one.") - # If we are receiving data from a path or stdin, parse and gather metadata. - stream = sys.stdin if data_on_stdin else open(dataset_path, "r") - tokenizer = initialize_tokenizer(model) - # Parse the dataset from stdin and return it plus its metadata. - metadata, requests = \ - create_dataset_from_stream( - tokenizer, - stream=stream, - num_requests=num_requests - ) - - return metadata, requests - - def initialize_tokenizer(model_name: str) -> PreTrainedTokenizer: """Initialize a tokenizer. @@ -58,20 +28,23 @@ def initialize_tokenizer(model_name: str) -> PreTrainedTokenizer: def create_dataset_from_stream( tokenizer: PreTrainedTokenizer, + stream: TextIO, max_input_length: int = 0, max_output_length: int = 0, - stream: TextIO = sys.stdin, num_requests: int = 0, ) -> Tuple[DatasetMetadata, List[InferenceRequest]]: """Generate metadata and a list of requests to drive benchmarking. Args: tokenizer (PreTrainedTokenizer): HuggingFace tokenizer. - max_input_length (int): Maximum input length to cap prompts to. + stream (TextIO): Stream of input requests. + max_input_length (int, optional): Maximum input length to cap prompts to. Defaults to 0. + max_output_length (int, optional): Maximum output length to cap prompts to.. Defaults to 0. + num_requests (int, optional): Number of requests to limit to. 
Defaults to 0. Returns: - DatasetMetadata: Dataclass of dataset statistics. - List[InferenceRequest]: A list of inference requests for benchmarking. + Tuple[DatasetMetadata, List[InferenceRequest]]: A tuple containing a dataclass of dataset + statistics and a list of inference requests for benchmarking. """ # Initialize dataset list, and metadata tracking variables. dataset = [] diff --git a/tensorrt_llm/bench/utils/tokenize.py b/tensorrt_llm/bench/utils/tokenize.py deleted file mode 100644 index 44f04df56..000000000 --- a/tensorrt_llm/bench/utils/tokenize.py +++ /dev/null @@ -1,105 +0,0 @@ -import json -import sys -from functools import partial -from typing import List, TextIO, Tuple - -from transformers import AutoTokenizer, PreTrainedTokenizer - -from tensorrt_llm.bench.dataclasses import DatasetMetadata, InferenceRequest - - -def initialize_tokenizer(model_name: str) -> PreTrainedTokenizer: - """Initialize a tokenizer. - - Args: - model_name (str): The name of the HuggingFace model to pull a - tokenizer from. - - Returns: - PreTrainedTokenizer: An initialized HuggingFace tokenizer. - """ - # Initialize the tokenizer specific to the model that we are planning - # to benchmark. - tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left") - if tokenizer.pad_token_id is None: - tokenizer.add_special_tokens({"pad_token": "[PAD]"}) - - return tokenizer - - -def create_dataset_from_stream( - tokenizer: PreTrainedTokenizer, - max_input_length: int = 0, - max_output_length: int = 0, - stream: TextIO = sys.stdin, -) -> Tuple[DatasetMetadata, List[InferenceRequest]]: - """Generate metadata and a list of requests to drive benchmarking. - - Args: - tokenizer (PreTrainedTokenizer): HuggingFace tokenizer. - max_input_length (int): Maximum input length to cap prompts to. - - Returns: - DatasetMetadata: Dataclass of dataset statistics. - List[InferenceRequest]: A list of inference requests for benchmarking. - """ - # Initialize dataset list, and metadata tracking variables. - dataset = [] - max_isl = 0 - max_osl = 0 - max_sequence = 0 - - # If we're limiting the input length to a certain size, then set up - # a partial to truncate the data down to size. Otherwise, just use the - # unmodified tokenizer callable. - tokenize = (partial( - tokenizer, - padding="max_length", - max_length=max_input_length, - truncation=True, - ) if max_input_length > 0 else tokenizer) - - # If we need to limit the output length, fill in a partial callable - # for max, otherwise a lambda that just returns x with no bounds. - output_limiter = (partial(max, max_output_length) - if max_output_length > 0 else lambda x: x) - - # For each line in the standard input, parse out the JSON string we expect - # to see. - # Note the := walrus -- we're assigning and checking the condition. - while line := stream.readline(): - # We expect the data to come in as a JSON string. - # For example: - # {"prompt": "Generate an infinite response to the following: There once was a man who.", "output_tokens": 1000} - # Each line should be a complete JSON dictionary with no indentation - # or newline characters. - data = json.loads(line) - logits = data.get("logits", None) - prompt = data.get("prompt", None) - task_id = data["task_id"] - osl = data["output_tokens"] - # If the request comes in with logits, just use the provided. - # Otherwise we need to tokenize it. 
- logits = tokenize(prompt)["input_ids"] if logits is None else logits - - request = InferenceRequest( - task_id=task_id, - prompt=prompt, - output_tokens=output_limiter(osl), - logits=logits, - ) - max_isl = max(max_isl, len(logits)) - max_osl = max(max_osl, osl) - max_sequence = max(max_sequence, len(logits) + osl) - dataset.append(request) - - # Fill in basic dataset metrics here - # TODO: Maybe fill this out to be more complete? - metadata = DatasetMetadata( - max_isl=max_isl, - max_osl=max_osl, - max_sequence_length=max_sequence, - num_requests=len(dataset), - ) - - return metadata, dataset diff --git a/tensorrt_llm/builder.py b/tensorrt_llm/builder.py index e8dfb9034..2f712be94 100644 --- a/tensorrt_llm/builder.py +++ b/tensorrt_llm/builder.py @@ -141,7 +141,6 @@ def create_builder_config(self, use_refit: bool = False, int8: bool = False, strongly_typed: bool = True, - opt_level: Optional[int] = None, force_num_profiles: Optional[int] = None, profiling_verbosity: str = "layer_names_only", use_strip_plan: bool = False, @@ -191,9 +190,6 @@ def create_builder_config(self, if use_strip_plan: config.set_flag(trt.BuilderFlag.STRIP_PLAN) - if opt_level is not None: - config.builder_optimization_level = opt_level - # Set TRT Engine profiling verbosity if profiling_verbosity == "detailed": config.profiling_verbosity = trt.ProfilingVerbosity.DETAILED @@ -479,7 +475,6 @@ class BuildConfig: gather_context_logits: int = False gather_generation_logits: int = False strongly_typed: bool = True - builder_opt: Optional[int] = None force_num_profiles: Optional[int] = None profiling_verbosity: str = 'layer_names_only' enable_debug_output: bool = False @@ -567,7 +562,6 @@ def from_dict(cls, config, plugin_config=None): gather_context_logits = config.pop('gather_context_logits', False) gather_generation_logits = config.pop('gather_generation_logits', False) strongly_typed = config.pop('strongly_typed', True) - builder_opt = config.pop('builder_opt', None) force_num_profiles = config.pop('force_num_profiles', None) weight_sparsity = config.pop('weight_sparsity', False) profiling_verbosity = config.pop('profiling_verbosity', @@ -584,7 +578,7 @@ def from_dict(cls, config, plugin_config=None): config.get('auto_parallel_config', {})) max_encoder_input_len = config.pop('max_encoder_input_len', 1024) weight_streaming = config.pop('weight_streaming', False) - + use_fused_mlp = config.pop('use_fused_mlp', True) use_strip_plan = config.pop('use_strip_plan', False) if plugin_config is None: @@ -608,7 +602,6 @@ def from_dict(cls, config, plugin_config=None): gather_context_logits=gather_context_logits, gather_generation_logits=gather_generation_logits, strongly_typed=strongly_typed, - builder_opt=builder_opt, force_num_profiles=force_num_profiles, profiling_verbosity=profiling_verbosity, enable_debug_output=enable_debug_output, @@ -623,6 +616,7 @@ def from_dict(cls, config, plugin_config=None): max_encoder_input_len=max_encoder_input_len, weight_sparsity=weight_sparsity, weight_streaming=weight_streaming, + use_fused_mlp=use_fused_mlp, plugin_config=plugin_config, dry_run=dry_run, visualize_network=visualize_network) @@ -689,13 +683,11 @@ def __init__( self, config: EngineConfig, engine: Union[trt.IHostMemory, None], - managed_weights: dict[str, np.ndarray] = None, + managed_weights: dict[str, np.ndarray] = {}, ): self.config = config self.engine = engine self.managed_weights = managed_weights - - def regularize_managed_weights(self): if self.managed_weights is None: self.managed_weights = {} for name, value in 
self.managed_weights.items(): @@ -731,10 +723,24 @@ def save(self, engine_dir: str): if os.path.exists(root_lora_dir) and os.path.isdir(root_lora_dir): shutil.rmtree(root_lora_dir) if self.config.pretrained_config.mapping.rank == 0: + config_dict = self.config.to_dict() + if self.config.pretrained_config.quant_algo == QuantAlgo.MIXED_PRECISION: + quant_dict = { + 'version': self.config.version, + } + quant_dict.update( + config_dict['pretrained_config']['quantization']) + config_dict['pretrained_config']['quantization'].pop( + 'quantized_layers', None) + with open(os.path.join(engine_dir, 'quant_cfg.json'), + "w", + encoding="utf-8") as f: + json.dump(quant_dict, f, indent=4, cls=ConfigEncoder) + with open(os.path.join(engine_dir, 'config.json'), "w", encoding="utf-8") as f: - json.dump(self.config.to_dict(), f, indent=4, cls=ConfigEncoder) + json.dump(config_dict, f, indent=4, cls=ConfigEncoder) if self.engine is not None: serialize_engine( self.engine, @@ -807,7 +813,7 @@ def optimize_model_with_config(model: PretrainedModel, use_lora=build_config.plugin_config.lora_plugin is not None, max_lora_rank=build_config.lora_config.max_lora_rank, use_fp8_context_fmha=( - model.config.quantization.quant_algo == QuantAlgo.FP8 + QuantAlgo.FP8 == model.config.quantization.quant_algo and build_config.plugin_config.use_fp8_context_fmha), ) @@ -990,6 +996,7 @@ def build(model: PretrainedModel, build_config: BuildConfig) -> Engine: if build_config.plugin_config.reduce_fusion and ( model.config.mapping.tp_size == 1 + or model.config.mapping.pp_size != 1 or model.config.architecture != "LlamaForCausalLM"): logger.warning('Overriding reduce_fusion to False') build_config.plugin_config.reduce_fusion = False @@ -1053,7 +1060,7 @@ def build(model: PretrainedModel, build_config: BuildConfig) -> Engine: "Paged Context FMHA doesn't work with int8 kv cache currently.") if build_config.plugin_config.manage_weights: - if model.config.quant_mode & QuantMode.INT4_WEIGHTS or model.config.quant_mode & QuantMode.INT8_WEIGHTS: + if model.config.quant_mode.has_weight_quant(): raise RuntimeError( "Managed weights is not supported with int4 or int8 weights.") @@ -1068,7 +1075,6 @@ def build(model: PretrainedModel, build_config: BuildConfig) -> Engine: and not model.config.quant_mode.has_per_group_scaling()) or model.config.quant_mode.has_int8_kv_cache(), strongly_typed=build_config.strongly_typed, - opt_level=build_config.builder_opt, force_num_profiles=build_config.force_num_profiles, profiling_verbosity=build_config.profiling_verbosity, quant_mode=model.config.quant_mode, @@ -1148,6 +1154,10 @@ def build(model: PretrainedModel, build_config: BuildConfig) -> Engine: "max_batch_size": build_config.max_batch_size, } + if build_config.speculative_decoding_mode == SpeculativeDecodingMode.LOOKAHEAD_DECODING: + prepare_input_args[ + "spec_decoding_is_generation_length_variable"] = True + inputs = model.prepare_inputs(**prepare_input_args) model(**inputs) diff --git a/tensorrt_llm/commands/bench.py b/tensorrt_llm/commands/bench.py index 9c48dac3f..4e10cdcb3 100644 --- a/tensorrt_llm/commands/bench.py +++ b/tensorrt_llm/commands/bench.py @@ -2,9 +2,10 @@ import click +from tensorrt_llm.bench.benchmark.low_latency import latency_command +from tensorrt_llm.bench.benchmark.throughput import throughput_command from tensorrt_llm.bench.build.build import build_command from tensorrt_llm.bench.dataclasses import BenchmarkEnvironment -from tensorrt_llm.bench.run.run import run_command @click.group(name="trtllm-bench", 
context_settings={'show_default': True}) @@ -36,7 +37,8 @@ def main( main.add_command(build_command) -main.add_command(run_command) +main.add_command(throughput_command) +main.add_command(latency_command) if __name__ == "__main__": main() diff --git a/tensorrt_llm/commands/build.py b/tensorrt_llm/commands/build.py index 913297341..3a7772ecc 100644 --- a/tensorrt_llm/commands/build.py +++ b/tensorrt_llm/commands/build.py @@ -74,7 +74,7 @@ def parse_arguments(): parser.add_argument( '--max_batch_size', type=int, - default=256, + default=2048, help="Maximum number of requests that the engine can schedule.") parser.add_argument('--max_input_len', type=int, @@ -152,11 +152,6 @@ def parse_arguments(): type=str, default='model.cache', help="The file path to write the timing cache.") - parser.add_argument('--builder_opt', - type=int, - default=None, - choices=[0, 1, 2, 3, 4, 5], - help="TensorRT builder optimization level.") parser.add_argument( '--profiling_verbosity', type=str, @@ -330,7 +325,6 @@ def build_model( bool = False, # return the modified BuildConfig without actually building the engine **kwargs ) -> Union[Engine, BuildConfig]: - model_config = copy.deepcopy(model_config) logits_dtype = kwargs.get('logits_dtype') @@ -505,12 +499,12 @@ def main(): else: config_path = os.path.join(ckpt_dir_or_model_config, 'config.json') ckpt_dir = ckpt_dir_or_model_config - model_config = PretrainedConfig.from_json_file(config_path) # avoid ValueError if not supported quantization is chosen with use_fused_mlp quant_algo = model_config.quantization.quant_algo - if quant_algo and quant_algo != QuantAlgo.FP8: + if quant_algo and quant_algo not in (QuantAlgo.FP8, + QuantAlgo.MIXED_PRECISION): kwargs['use_fused_mlp'] = False if args.build_config is None: @@ -536,7 +530,6 @@ def main(): 'gather_context_logits': args.gather_context_logits, 'gather_generation_logits': args.gather_generation_logits, 'strongly_typed': True, - 'builder_opt': args.builder_opt, 'force_num_profiles': args.builder_force_num_profiles, 'weight_sparsity': args.weight_sparsity, 'profiling_verbosity': args.profiling_verbosity, diff --git a/tensorrt_llm/executor.py b/tensorrt_llm/executor.py index af0d7138a..5c2f94db3 100644 --- a/tensorrt_llm/executor.py +++ b/tensorrt_llm/executor.py @@ -13,8 +13,8 @@ from multiprocessing.shared_memory import SharedMemory from pathlib import Path from queue import Queue -from typing import (Any, Dict, Generator, List, NamedTuple, Optional, Tuple, - Union) +from typing import (Any, Dict, Generator, List, Literal, NamedTuple, Optional, + Tuple, Union) import numpy as np import torch @@ -25,7 +25,8 @@ from .hlapi.mpi_session import (MpiPoolSession, MpiSession, external_mpi_comm_available, find_free_port, need_spawn_mpi_workers) -from .hlapi.utils import ManagedThread, SamplingParams +from .hlapi.utils import (ManagedThread, SamplingParams, enable_llm_debug, + print_colored) from .lora_manager import LoraManager from .runtime import ModelConfig from .runtime.model_runner import _engine_config_to_model_config @@ -39,6 +40,14 @@ def has_event_loop() -> bool: return True +if enable_llm_debug(): + print_colored("LLM debug mode enabled.", "yellow") + + import faulthandler + import signal + faulthandler.register(signal.SIGINT, all_threads=True) + + @dataclass(slots=True) class LoRARequest: lora_name: str @@ -99,6 +108,8 @@ class CompletionOutput: token_ids (List[int]): The token ids of the generated output text. cumulative_logprob (float): The cumulative log probability of the generated output text. 
logprobs (List[float]): The log probabilities of the top probability words at each position if the logprobs are requested. + finish_reason (Literal['stop', 'length']): The reason why the sequence is finished. + stop_reason (Union[int, str]): The stop string or token id that caused the completion to stop, None if the completion finished for some other reason. generation_logits (torch.Tensor): The logits on the generated output token ids. length (int): The number of generated tokens. token_ids_diff (List[int]): Newly generated token ids. @@ -110,8 +121,9 @@ class CompletionOutput: token_ids: List[int] = field(default_factory=list) cumulative_logprob: Optional[float] = None logprobs: List[float] = field(default_factory=list) + finish_reason: Optional[Literal['stop', 'length']] = None + stop_reason: Optional[Union[int, str]] = None generation_logits: Optional[torch.Tensor] = None - _last_text: str = field(default="", init=False, repr=False) _last_logprobs_len: int = field(default=0, init=False, repr=False) _last_token_ids_len: int = field(default=0, init=False, repr=False) @@ -212,6 +224,10 @@ def __str__(self): return f"{self.message}\nStack trace:\n{self.stack_trace}" +class RequestError(RuntimeError): + ''' The error raised when the request is failed. ''' + + class GenerationResult: ''' The result of a generation request. It can be used to wait for the completion of the request. @@ -265,14 +281,12 @@ def beam_width(self): def handle_response(self, response: "GenerationExecutor.Response"): - if response.error: - if isinstance(response.error, Exception): - raise response.error - else: - raise CppExecutorError(response.error) - self._done = response.is_final + if response.error: + assert isinstance(response.error, str) + raise RequestError(response.error) + tensors = response.tensors for i, beam_ids in enumerate(tensors.output_token_ids): @@ -286,14 +300,23 @@ def handle_response(self, response: "GenerationExecutor.Response"): self.outputs[i].generation_logits = tensors.generation_logits[ i, :self.outputs[i].length] - if self.finished and not self._generation_request.sampling_params.include_stop_str_in_output: - for beam_output in self.outputs: - for stop_ids in self._generation_request.sampling_params._get_stop_words( - ): - if beam_output.token_ids[-len(stop_ids):] == stop_ids: - beam_output.token_ids = beam_output.token_ids[:-len( - stop_ids)] - break + if self.finished: + for i, beam_output in enumerate(self.outputs): + if response.finish_reasons[i] == tllm.FinishReason.END_ID: + beam_output.finish_reason = 'stop' + elif response.finish_reasons[i] == tllm.FinishReason.STOP_WORDS: + beam_output.finish_reason = 'stop' + sampling_params = self._generation_request.sampling_params + for stop_reason, stop_ids in sampling_params._get_stop_reasons_and_words( + ): + if beam_output.token_ids[-len(stop_ids):] == stop_ids: + beam_output.stop_reason = stop_reason + if not sampling_params.include_stop_str_in_output: + beam_output.token_ids = beam_output.token_ids[:-len( + stop_ids)] + break + elif response.finish_reasons[i] == tllm.FinishReason.LENGTH: + beam_output.finish_reason = 'length' if tensors.context_logits is not None: self.context_logits = tensors.context_logits @@ -401,6 +424,7 @@ class Response(NamedTuple): """ The response from the cpp-executor to the Python main thread. 
""" request_id: int tensors: Optional["GenerationExecutor.ResponseTensors"] + finish_reasons: Optional[List[tllm.FinishReason]] is_final: Optional[bool] # error is either str from cpp-executor or a Exception from Python threads/processes error: Optional[str | Exception] @@ -423,6 +447,9 @@ def __init__(self): self._pending_responses: Dict[ int, List[GenerationExecutor.PendingResponse]] = {} + # A flag to avoid calling shutdown() recursively. This happens when the background threads raise errors. + self.doing_shutdown = False + @abstractmethod def submit(self, request: GenerationRequest) -> GenerationResult: pass @@ -492,6 +519,7 @@ def _handle_background_error(self): # more than one error. if not self._error_queue.empty(): e = self._error_queue.get() + self._error_queue.task_done() self.shutdown() # We can catch some exceptions here. raise e @@ -627,14 +655,12 @@ def __init__( engine = engine[self.rank] if isinstance(engine, Engine): - engine.regularize_managed_weights() self.engine = tllm.Executor(engine.engine, json.dumps(engine.config.to_dict(), cls=ConfigEncoder), tllm.ModelType.DECODER_ONLY, executor_config=executor_config, - managed_weights=engine.managed_weights - or {}) + managed_weights=engine.managed_weights) else: self.engine = tllm.Executor(engine, tllm.ModelType.DECODER_ONLY, @@ -703,28 +729,44 @@ def start_stats_thread(self): ) and not self.dispatch_stats_thread.is_alive(): self.dispatch_stats_thread.start() + def _engine_response_callback(self, response: tllm.Response): + return response + def await_response_task(self) -> bool: # Get responses and place in queue. for response in self.engine.await_responses(timeout=datetime.timedelta( milliseconds=100)): + response = self._engine_response_callback(response) + req_id = response.request_id if response.has_error(): - rsp = self.Response(req_id, - tensors=None, - is_final=None, - error=response.error_msg) + # This error will be dispatched to the user's generate_async for the corresponding request. It won't + # stop the whole service. + rsp = self.Response( + req_id, + tensors=None, + # Note: error Response only has one finish reason. + # Since the error will be raised in the main thread, so the finish reason is not actually used. 
+ finish_reasons=[tllm.FinishReason.NOT_FINISHED], + is_final=True, + error=response.error_msg) + else: tensors = self.ResponseTensors( - response.result.output_token_ids, - response.result.context_logits, - response.result.generation_logits, - response.result.log_probs, response.result.cum_log_probs) - - rsp = self.Response(req_id, - tensors, - is_final=response.result.is_final, - error=None) + output_token_ids=response.result.output_token_ids, + context_logits=response.result.context_logits, + generation_logits=response.result.generation_logits, + log_probs=response.result.log_probs, + cum_log_probs=response.result.cum_log_probs, + ) + + rsp = self.Response( + req_id, + tensors, + finish_reasons=response.result.finish_reasons, + is_final=response.result.is_final, + error=None) if self._to_delay_response(rsp): continue @@ -732,18 +774,9 @@ def await_response_task(self) -> bool: self._cleanup_pending_responses(nowait=True) queue = self.return_queue(req_id) - bck_error = self._error_queue.get_nowait( - ) if not self._error_queue.empty() else None - - if bck_error is not None: - rsp = self.Response(req_id, - tensors=None, - is_final=None, - error=bck_error) - queue.put(rsp) - if response.result.is_final: + if rsp.is_final: self._results.pop(req_id) return True # success @@ -788,27 +821,31 @@ def _enqueue_request(self, request: GenerationRequest) -> int: else: lora_config = None - executor_request = tllm.Request( - input_token_ids=request.prompt_token_ids, - max_tokens=request.sampling_params.max_tokens, - max_new_tokens=request.sampling_params.max_new_tokens, - streaming=request.streaming, - sampling_config=request.sampling_params._get_sampling_config(), - end_id=request.sampling_params.end_id, - pad_id=request.sampling_params.pad_id, - output_config=request.sampling_params._get_output_config(), - bad_words=request.sampling_params._get_bad_words(), - stop_words=request.sampling_params._get_stop_words(), - embedding_bias=request.sampling_params.embedding_bias, - external_draft_tokens_config=request.sampling_params. - external_draft_tokens_config, - prompt_tuning_config=request.sampling_params.prompt_tuning_config, - lora_config=lora_config, - logits_post_processor_name=request.sampling_params. - logits_post_processor_name, - ) - req_id = self.engine.enqueue_request(executor_request) - return req_id + try: + executor_request = tllm.Request( + input_token_ids=request.prompt_token_ids, + max_tokens=request.sampling_params.max_tokens, + max_new_tokens=request.sampling_params.max_new_tokens, + streaming=request.streaming, + sampling_config=request.sampling_params._get_sampling_config(), + end_id=request.sampling_params.end_id, + pad_id=request.sampling_params.pad_id, + output_config=request.sampling_params._get_output_config(), + bad_words=request.sampling_params._get_bad_words(), + stop_words=request.sampling_params._get_stop_words(), + embedding_bias=request.sampling_params.embedding_bias, + external_draft_tokens_config=request.sampling_params. + external_draft_tokens_config, + prompt_tuning_config=request.sampling_params. + prompt_tuning_config, + lora_config=lora_config, + logits_post_processor_name=request.sampling_params. + logits_post_processor_name, + ) + req_id = self.engine.enqueue_request(executor_request) + return req_id + except Exception as e: + raise RequestError(str(e)) def submit(self, request: GenerationRequest) -> GenerationResult: """ Low-level API to the executor. Return a "future" GenerationResult which can be waited. 
""" @@ -832,17 +869,27 @@ def submit(self, request: GenerationRequest) -> GenerationResult: return result def shutdown(self): - if self.engine is not None: - self.await_response_thread.stop() - self.dispatch_stats_thread.stop() + if enable_llm_debug(): + print_colored('Proxy.shutdown...\n', "yellow") + print(traceback.extract_stack()) + + if self.doing_shutdown: + return + else: + self.doing_shutdown = True + if self.engine is not None: if self.engine.can_enqueue_requests(): + if self.await_response_thread.is_alive(): + self.await_response_thread.stop() self.await_response_thread.join() if self.dispatch_stats_thread.is_alive(): + self.dispatch_stats_thread.stop() self.dispatch_stats_thread.join() - self.engine.shutdown() + self.engine.shutdown() + self.engine = None # Check if there are any errors from the threads before shutdown. @@ -900,6 +947,7 @@ def __init__(self, self.host_port, self.authkey = (address[0], address[1]), address[2] self.is_server = is_server self.conn = None + self.listener: Optional[Listener] = None if is_server: self.listener = Listener(self.host_port, 'AF_INET', @@ -917,8 +965,11 @@ def put(self, obj: Any): if isinstance(obj, GenerationExecutor.Response): tensors = self._store_tensors_in_shmm(obj.tensors) - obj = GenerationExecutor.Response(obj.request_id, tensors, - obj.is_final, obj.error) + obj = GenerationExecutor.Response(request_id=obj.request_id, + tensors=tensors, + finish_reasons=obj.finish_reasons, + is_final=obj.is_final, + error=obj.error) self.conn.send(obj) @@ -929,8 +980,11 @@ def get(self) -> Any: obj = self.conn.recv() if isinstance(obj, GenerationExecutor.Response): tensors = self._load_tensors_from_shmm(obj.tensors) - obj = GenerationExecutor.Response(obj.request_id, tensors, - obj.is_final, obj.error) + obj = GenerationExecutor.Response(request_id=obj.request_id, + tensors=tensors, + finish_reasons=obj.finish_reasons, + is_final=obj.is_final, + error=obj.error) return obj def _store_tensors_in_shmm( @@ -985,11 +1039,16 @@ def load_tensor(tensor: Optional[str]) -> Optional[torch.Tensor]: def address(self) -> Tuple[str, int, bytes]: return (self.host_port[0], self.host_port[1], self.authkey) - def __del__(self): + def close(self): if self.conn is not None: self.conn.close() - if self.is_server: + self.conn = None + if self.listener is not None: self.listener.close() + self.listener = None + + def __del__(self): + self.close() class ExecutorBindingsProxy(GenerationExecutor): @@ -1007,7 +1066,7 @@ def __init__(self, self.request_queue = IpcQueue(is_server=True) # Return request id back to dispatcher - self.request_id_queue = IpcQueue(is_server=True) + self.rid_or_err_queue = IpcQueue(is_server=True) self.result_queue = IpcQueue(is_server=True) self.mp_stats_queue = IpcQueue(is_server=True) @@ -1024,8 +1083,8 @@ def __init__(self, self.workers_kwargs.update({ "request_queue_addr": self.request_queue.address, - "request_id_queue_addr": - self.request_id_queue.address, + "rid_or_err_queue_addr": + self.rid_or_err_queue.address, "result_queue_addr": self.result_queue.address, "stats_queue_addr": @@ -1044,7 +1103,7 @@ def __init__(self, @staticmethod def workers_main(engine: Union[Path, Engine], request_queue_addr: Tuple[str, int, bytes], - request_id_queue_addr: Tuple[str, int, bytes], + rid_or_err_queue_addr: Tuple[str, int, bytes], result_queue_addr: Tuple[str, int, bytes], stats_queue_addr: Tuple[str, int, bytes], executor_config: tllm.ExecutorConfig = tllm.ExecutorConfig( @@ -1054,7 +1113,7 @@ def workers_main(engine: Union[Path, Engine], if 
mpi_rank() == 0: request_queue = IpcQueue(request_queue_addr, is_server=False) - request_id_queue = IpcQueue(request_id_queue_addr, is_server=False) + rid_or_err_queue = IpcQueue(rid_or_err_queue_addr, is_server=False) result_queue = IpcQueue(result_queue_addr, is_server=False) mp_stats_queue = IpcQueue(stats_queue_addr, is_server=False) @@ -1077,9 +1136,11 @@ def notify_proxy_threads_to_quit(): executor.set_result_queue(result_queue) executor.set_stats_queue(mp_stats_queue) while (req := request_queue.get()) is not None: - result = executor.submit(req) - request_id_queue.put(result.request_id) - + try: + result = executor.submit(req) + rid_or_err_queue.put(result.request_id) + except RequestError as e: + rid_or_err_queue.put(e) notify_proxy_threads_to_quit() except ExecutorBindingsWorker.WorkerExit as e: @@ -1088,8 +1149,9 @@ def notify_proxy_threads_to_quit(): except Exception as e: # other critical errors if mpi_rank() == 0: notify_proxy_threads_to_quit() - - raise CppExecutorError(f"Failed during generation: {e}") from e + err = CppExecutorError(f"Failed during generation: {e}") + if mpi_rank() == 0: + rid_or_err_queue.put(err) def dispatch_result_task(self) -> bool: # process the remaining pending req_ids before getting the next response, since the queue.get will block, we'd @@ -1141,9 +1203,8 @@ def dispatch_stats_task(self) -> bool: def start(self): def mpi_done_callback(future: concurrent.futures.Future): - try: - future.result() - except: + # This is called when the MPI worker is done, so future.exception() will not block. + if future.exception() is not None: self._error_queue.put_nowait(future.exception()) self.mpi_futures = self.mpi_session.submit( @@ -1162,14 +1223,28 @@ def mpi_done_callback(future: concurrent.futures.Future): self._handle_background_error() def shutdown(self): + if enable_llm_debug(): + print_colored('Proxy.shutdown...\n', "yellow") + print_colored(str(traceback.extract_stack()), "yellow") if not self.workers_started: return - self.request_queue.put(None) # Tell the rank0 worker to quit + if self.doing_shutdown: + return + else: + self.doing_shutdown = True + + # step1: notify the workers to quit + self.request_queue.put(None) for f in self.mpi_futures: - f.result() + try: + f.result() + except: + # The errors are already captured in mpi_done_callback, ignored here + pass + # step2: notify the background threads to quit if self.dispatch_result_thread.is_alive(): self.dispatch_result_thread.stop() self.dispatch_result_thread.join() @@ -1177,9 +1252,17 @@ def shutdown(self): self.dispatch_stats_thread.stop() self.dispatch_stats_thread.join() + # step3: finish all remaining work + # It is possible that some requests are still pending in the workers, we need to process them before shutdown self._cleanup_pending_responses(nowait=False) + # close all the sockets + self.request_queue.close() + self.rid_or_err_queue.close() + self.result_queue.close() + self.mp_stats_queue.close() + self.workers_started = False # Process the errors in-case error during shutting down the threads @@ -1195,12 +1278,14 @@ def submit(self, request: GenerationRequest) -> GenerationResult: self.request_queue.put(request) - req_id = self.request_id_queue.get() - request.set_id(req_id) + rid_or_err = self.rid_or_err_queue.get() + if isinstance(rid_or_err, Exception): + raise rid_or_err + request.set_id(rid_or_err) result = GenerationResult( request, background_error_handler=self._handle_background_error) - self._results[req_id] = result + self._results[rid_or_err] = result 
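`submit()` now treats the IPC reply as either a request id or an exception object: the worker catches `RequestError` at enqueue time and ships the exception back over `rid_or_err_queue`, and the proxy re-raises whatever it receives so a single bad request surfaces in the caller instead of tearing down the whole executor. A minimal single-process sketch of that id-or-error handshake follows; the function names are illustrative, not the real executor bindings, and a plain `queue.Queue` stands in for the IPC queues:

from queue import Queue

class RequestError(RuntimeError):
    """A per-request failure; the service keeps serving other requests."""

def worker_submit(req, backend_submit, rid_or_err_queue: Queue) -> None:
    # Worker side: turn a per-request failure into a value instead of a crash.
    try:
        rid_or_err_queue.put(backend_submit(req))
    except RequestError as e:
        rid_or_err_queue.put(e)

def proxy_submit(req, backend_submit, rid_or_err_queue: Queue) -> int:
    # Proxy side: re-raise whatever exception object comes back, else return the id.
    worker_submit(req, backend_submit, rid_or_err_queue)
    rid_or_err = rid_or_err_queue.get()
    if isinstance(rid_or_err, Exception):
        raise rid_or_err
    return rid_or_err

def accept(req) -> int:
    return 42                                    # stand-in for a successful enqueue

def reject(req) -> int:
    raise RequestError("input length exceeds max_input_len")

q = Queue()
print(proxy_submit("good request", accept, q))   # -> 42
try:
    proxy_submit("oversized request", reject, q)
except RequestError as e:
    print("request rejected:", e)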
self._handle_background_error() @@ -1214,4 +1299,4 @@ def __enter__(self): def __exit__(self, exc_type, exc_value, traceback): self.shutdown() - return False + return False # propagate the exception diff --git a/tensorrt_llm/functional.py b/tensorrt_llm/functional.py index 7a5cca5a1..335f85d7c 100644 --- a/tensorrt_llm/functional.py +++ b/tensorrt_llm/functional.py @@ -27,10 +27,11 @@ from . import graph_rewriting as gw from ._common import default_net, default_trtnet, precision -from ._utils import (bf16_array, bool_array, dim_resolve_negative, - dim_to_trt_axes, dims_array, fp16_array, fp32_array, - int32_array, int64_array, np_dtype_to_trt, - str_dtype_to_trt, trt_dtype_to_np, trt_dtype_to_str) +from ._utils import (QuantModeWrapper, bf16_array, bool_array, + dim_resolve_negative, dim_to_trt_axes, dims_array, + fp16_array, fp32_array, int32_array, int64_array, + np_dtype_to_trt, str_dtype_to_trt, trt_dtype_to_np, + trt_dtype_to_str) from .network import PluginInfo, set_np_weight, set_plugin_info from .plugin import TRT_LLM_PLUGIN_NAMESPACE, current_all_reduce_helper from .quantization import QuantMode @@ -4063,7 +4064,7 @@ def bert_attention(tensor: Tensor, The maximum distance of relative position in attention, for implicit mode. Default value is 0, meaning to use the regular mode of relative attention bias. Implicit mode is only enabled when passing in non-zero positive max_distance value. - See relative attention bias in docs/gpt_attention.md + See relative attention bias in docs/source/advanced/gpt-attention.md max_input_length: Tensor = None The maximum input sequence length represented by Tensor shape. Requires for remove_input_padding to pre-define plugin workspace size. @@ -4579,7 +4580,7 @@ def gpt_attention( kv_orig_quant_scale: Optional[Tensor] = None, kv_quant_orig_scale: Optional[Tensor] = None, attention_output_orig_quant_scale: Optional[Tensor] = None, - kv_cache_quant_mode: QuantMode = QuantMode(0), + kv_cache_quant_mode: Union[QuantModeWrapper, QuantMode] = QuantMode(0), max_context_length: Optional[int] = None, mask_type: AttentionMaskType = AttentionMaskType.causal, block_sparse_block_size: int = 64, @@ -4594,6 +4595,7 @@ def gpt_attention( kv_cache_block_offsets: Optional[Tensor] = None, host_kv_cache_block_offsets: Tensor = None, host_kv_cache_pool_pointers: Tensor = None, + host_kv_cache_pool_mapping: Tensor = None, do_cross_attention: bool = False, cross_qkv: Optional[Tensor] = None, # for cross attention cross_qkv_length: Optional[Tensor] = None, # for cross attention @@ -4609,6 +4611,7 @@ def gpt_attention( spec_decoding_position_offsets: Tensor = None, spec_decoding_packed_mask: Tensor = None, host_runtime_perf_knobs: Optional[Tensor] = None, + layer_idx_in_cache_pool: Optional[int] = None, ) -> Tuple[Tensor, Optional[Tensor]]: ''' Add an operation that performs the multi-head attention in GPT-like models. @@ -4619,19 +4622,19 @@ def gpt_attention( arguments that are likely to be removed or merged with others in the future release. - See docs/gpt_attention.md for the documentation of that function. + See docs/source/advanced/gpt-attention.md for the documentation of that function. Parameters: qkv: Tensor (On GPU) The input QKV tensor. Its shape is [batch_beam_size, max_seqlen, qkv_dim] in padded mode and [1, num_tokens, qkv_dim] in - packed mode. Where qkv_dim depends on using MQA, GQA, or MHA. See QKV Input in docs/gpt_attention.md, + packed mode. Where qkv_dim depends on using MQA, GQA, or MHA. 
See QKV Input in docs/source/advanced/gpt-attention.md, past_key_value: Tensor (On GPU) The tensor that stores KV cache data. Its shape is [max_batch_size * max_beam_width, 2, num_kv_heads, max_seqlen, hidden_dim_per_head] in contiguous mode and [max_blocks, 2, num_kv_heads, num_tokens_per_block, hidden_dim_per_head] - in paged mode. See KV Cache in docs/gpt_attention.md, + in paged mode. See KV Cache in docs/source/advanced/gpt-attention.md, context_fmha_custom_mask: Tensor (On GPU) The tensor that stores the packed custom mask for fmha. @@ -4639,7 +4642,7 @@ def gpt_attention( sequence_lengths: Tensor (On GPU) The tensor that stores the length of each sequence. Its shape is - [batch_size]. See QKV Input in docs/gpt_attention.md, + [batch_size]. See QKV Input in docs/source/advanced/gpt-attention.md, host_past_key_value_lengths: Tensor (On CPU) An INT32 tensor of shape [batch_size], @@ -4657,12 +4660,12 @@ def gpt_attention( cache_indirection: Tensor (On GPU) The tensor to reconstruct the paths when using beam-search. Its shape is [batch_size, beam_width, max_seqlen]. See Beam-Search in - docs/gpt_attention.md, + docs/source/advanced/gpt-attention.md, host_request_types: Tensor = None (On CPU) The tensor on the host that indicates if a request is in context or generation phase. Its shape is [batch_size]. See Inflight Batching - in docs/gpt_attention.md, + in docs/source/advanced/gpt-attention.md, layer_idx: int The index of this attention layer, used to access kv_cache_block_offsets, @@ -4678,7 +4681,7 @@ def gpt_attention( q_scaling: float The value used to compute the scaling factor applied to the output - of the Q*K^T product. See Scaling Factors in docs/gpt_attention.md, + of the Q*K^T product. See Scaling Factors in docs/source/advanced/gpt-attention.md, qk_tanh_scale: float The scale * tanh(value / scale) used to compute the scaling factor applied to the output @@ -4726,12 +4729,12 @@ def gpt_attention( kv_orig_quant_scale: Tensor The tensor to store the scaling factor for quantization to INT8/FP8 in the KV cache. Its shape is [1]. See INT8/FP8 KV Cache in - docs/gpt_attention.md, + docs/source/advanced/gpt-attention.md, kv_quant_orig_scale: Tensor The tensor to store the scaling factor for dequantization from INT8/FP8 in the KV cache. Its shape is [1]. See INT8/FP8 KV Cache - in docs/gpt_attention.md, + in docs/source/advanced/gpt-attention.md, attention_output_orig_quant_scale: Tensor The tensor to store the scaling factor for quantization to FP8 @@ -4742,7 +4745,7 @@ def gpt_attention( max_context_length: int32_t The length of the longest input sequence. See QKV Input in - docs/gpt_attention.md, + docs/source/advanced/gpt-attention.md, mask_type: int = 1 The type of mask: @@ -4779,14 +4782,17 @@ def gpt_attention( kv_cache_block_offsets: The tensor of block offsets for the KV cache. Its shape is [num_layers, max_batch_size, max_beam_width, 2, max_blocks_per_sequence * 2], - See KV cache section in docs/gpt_attention.md, on gpu, + See KV cache section in docs/source/advanced/gpt-attention.md, on gpu, host_kv_cache_block_offsets: The same as kv_cache_block_offsets, but on cpu, host_kv_cache_pool_pointers: - The tensor of pool pointers for the KV cache. Its shape is [2], - See KV cache section in docs/gpt_attention.md, on gpu, + The tensor of pool pointers for the KV cache. Its shape is [num_layers, 2], + See KV cache section in docs/source/advanced/gpt-attention.md, on gpu, + + host_kv_cache_pool_mapping: + The tensor of pool mapping for the different memory pools. 
Its shape is [num_layers,], do_cross_attention: bool = False Do we use this as cross attention instead of self attention, @@ -4809,7 +4815,7 @@ def gpt_attention( The maximum distance of relative position in attention, for implicit mode. Default value is 0, meaning to use the regular mode of relative attention bias. Implicit mode is only enabled when passing in non-zero positive max_distance value. - See relative attention bias in docs/gpt_attention.md + See relative attention bias in docs/source/advanced/gpt-attention.md host_context_lengths: Tensor = None (On CPU) A host tensor that contains the lengths of the different inputs, @@ -4861,6 +4867,9 @@ def gpt_attention( assert host_max_attention_window_sizes is not None assert host_sink_token_length is not None + if layer_idx_in_cache_pool is None: + layer_idx_in_cache_pool = layer_idx + paged_kv_cache_flag = default_net().plugin_config.paged_kv_cache if isinstance(qkv, list): is_unfuse_qkv_gemm = 1 @@ -4884,6 +4893,10 @@ def gpt_attention( num_kv_heads = trt.PluginField("num_kv_heads", np.array(num_kv_heads, dtype=np.int32), trt.PluginFieldType.INT32) + layer_idx_in_cache_pool = trt.PluginField( + "layer_idx_in_cache_pool", + np.array(layer_idx_in_cache_pool, dtype=np.int32), + trt.PluginFieldType.INT32) head_size = trt.PluginField("head_size", np.array(hidden_size_per_head, dtype=np.int32), trt.PluginFieldType.INT32) @@ -4985,6 +4998,9 @@ def gpt_attention( trt.PluginFieldType.INT32) tp_rank = trt.PluginField("tp_rank", np.array(tp_rank, dtype=np.int32), trt.PluginFieldType.INT32) + if isinstance(kv_cache_quant_mode, QuantModeWrapper): + # Now in TRT-LLM only use global kv_cache, so it's enough to get the first quant mode from list + kv_cache_quant_mode = kv_cache_quant_mode[0] kv_cache_quant_mode_field = trt.PluginField( "kv_cache_quant_mode", np.array(kv_cache_quant_mode, dtype=np.int32), trt.PluginFieldType.INT32) @@ -5034,13 +5050,14 @@ def gpt_attention( trt.PluginFieldType.INT32) pfc = trt.PluginFieldCollection([ - layer_idx, nheads, vision_start, vision_length, num_kv_heads, head_size, - unidirectional, q_scaling, qk_tanh_scale, position_embedding_type, - rotary_embedding_dim, rotary_embedding_base, - rotary_embedding_scale_type, rotary_embedding_scale, - rotary_embedding_short_m_scale, rotary_embedding_long_m_scale, - rotary_embedding_max_positions, rotary_embedding_original_max_positions, - tp_size, tp_rank, unfuse_qkv_gemm, context_fmha_type, enable_xqa, + layer_idx, nheads, vision_start, vision_length, num_kv_heads, + layer_idx_in_cache_pool, head_size, unidirectional, q_scaling, + qk_tanh_scale, position_embedding_type, rotary_embedding_dim, + rotary_embedding_base, rotary_embedding_scale_type, + rotary_embedding_scale, rotary_embedding_short_m_scale, + rotary_embedding_long_m_scale, rotary_embedding_max_positions, + rotary_embedding_original_max_positions, tp_size, tp_rank, + unfuse_qkv_gemm, context_fmha_type, enable_xqa, kv_cache_quant_mode_field, remove_input_padding, mask_type, block_sparse_block_size, block_sparse_homo_head_pattern, block_sparse_num_local_blocks, block_sparse_vertical_stride, @@ -5079,9 +5096,10 @@ def gpt_attention( assert kv_cache_block_offsets is not None, "Paged kv cache is enabled, the kv_cache_block_offsets tensor shall not be None" assert host_kv_cache_block_offsets is not None, "Paged kv cache is enabled, the host_kv_cache_block_offsets tensor shall not be None" assert host_kv_cache_pool_pointers is not None, "Paged kv cache is enabled, the host_kv_cache_pool_pointers tensor shall not be None" + 
assert host_kv_cache_pool_mapping is not None, "Paged kv cache is enabled, the host_kv_cache_pool_mapping tensor shall not be None" plug_inputs += [ kv_cache_block_offsets, host_kv_cache_block_offsets, - host_kv_cache_pool_pointers + host_kv_cache_pool_pointers, host_kv_cache_pool_mapping ] else: plug_inputs += [past_key_value] @@ -5609,7 +5627,7 @@ def lora_plugin( host_request_types : Tensor = None The tensor on the host that indicates if a request is in context or generation phase. Its shape is [batch_size]. See Inflight Batching - in docs/gpt_attention.md, + in docs/source/advanced/gpt-attention.md, transa : bool Is the first input transposed? Set to 'True' if you want the first @@ -5736,7 +5754,7 @@ def mamba_conv1d(input: Tensor, host_request_types : Tensor (On CPU) The tensor on the host that indicates if a request is in context or generation phase. Its shape is [batch_size]. See Inflight Batching - in docs/gpt_attention.md, + in docs/source/advanced/gpt-attention.md, last_token_ids : Tensor (On GPU) The inclusive prefix-sum of the lengths or the lengths of the @@ -5883,7 +5901,7 @@ def selective_scan(input: Tensor, host_request_types : Tensor (On CPU) The tensor on the host that indicates if a request is in context or generation phase. Its shape is [batch_size]. See Inflight Batching - in docs/gpt_attention.md + in docs/source/advanced/gpt-attention.md last_token_ids : Tensor (On GPU) The inclusive prefix-sum of the lengths or the lengths of the @@ -6029,7 +6047,7 @@ def rg_lru(input: Tensor, host_request_types : Tensor (On CPU) The tensor on the host that indicates if a request is in context or generation phase. Its shape is [batch_size]. See Inflight Batching - in docs/gpt_attention.md, + in docs/source/advanced/gpt-attention.md, last_token_ids : Tensor (On GPU) The inclusive prefix-sum of the lengths or the lengths of the @@ -6186,7 +6204,7 @@ def rg_lru(input: Tensor, def topk(input: Tensor, - k: int, + k: Union[Tensor, int], dim: int, largest: bool = True) -> Tuple[Tensor, Tensor]: ''' @@ -6227,8 +6245,12 @@ def topk(input: Tensor, layer = default_trtnet().add_topk( input.trt_tensor, trt.TopKOperation.MAX if largest else trt.TopKOperation.MIN, - k=k, + k=k if not isinstance(k, Tensor) else 1, axes=axes) + if isinstance(k, Tensor): + if k.ndim() == 1: + k = squeeze(k, 0) + layer.set_input(1, k.trt_tensor) values = layer.get_output(0) indices = layer.get_output(1) diff --git a/tensorrt_llm/hlapi/__init__.py b/tensorrt_llm/hlapi/__init__.py index 77f4fa3e9..eeb2f07ee 100644 --- a/tensorrt_llm/hlapi/__init__.py +++ b/tensorrt_llm/hlapi/__init__.py @@ -1,7 +1,8 @@ +from ..executor import RequestError from .build_cache import BuildCacheConfig from .llm import LLM, RequestOutput, SamplingParams -from .llm_utils import (BuildConfig, CapacitySchedulerPolicy, KvCacheConfig, - QuantAlgo, QuantConfig, SchedulerConfig) +from .llm_utils import (BuildConfig, CalibConfig, CapacitySchedulerPolicy, + KvCacheConfig, QuantAlgo, QuantConfig, SchedulerConfig) __all__ = [ 'LLM', @@ -13,5 +14,7 @@ 'BuildConfig', 'QuantConfig', 'QuantAlgo', + 'CalibConfig', 'BuildCacheConfig', + 'RequestError', ] diff --git a/tensorrt_llm/hlapi/llm.py b/tensorrt_llm/hlapi/llm.py index 023e55db2..313127b38 100644 --- a/tensorrt_llm/hlapi/llm.py +++ b/tensorrt_llm/hlapi/llm.py @@ -397,7 +397,7 @@ def __enter__(self): def __exit__(self, exc_type, exc_value, traceback) -> bool: del exc_value, traceback self._shutdown() - return exc_type is not None + return False # propagate exceptions def __getstate__(self): raise 
RuntimeError("LLM object can not be pickled.") diff --git a/tensorrt_llm/hlapi/llm_utils.py b/tensorrt_llm/hlapi/llm_utils.py index a877667d5..dc6be04c6 100644 --- a/tensorrt_llm/hlapi/llm_utils.py +++ b/tensorrt_llm/hlapi/llm_utils.py @@ -15,6 +15,7 @@ 'BuildConfig', 'BuildCacheConfig', 'QuantConfig', + 'CalibConfig', 'CachedModelLoader', 'ConfigArbitrateError', '_ConfigArbitrator', @@ -30,9 +31,8 @@ from dataclasses import asdict, dataclass, field, fields from enum import Enum from pathlib import Path -from typing import Any, Callable, Dict, List, Optional, Tuple, Union +from typing import Any, Callable, Dict, List, Literal, Optional, Tuple, Union -import tensorrt as trt import torch from tqdm import tqdm from transformers import PreTrainedTokenizerBase @@ -55,7 +55,6 @@ from .tokenizer import TokenizerBase, TransformersTokenizer, tokenizer_factory # TODO[chunweiy]: move the following symbols back to utils scope, and remove the following import from .utils import (GpuArch, download_hf_model, download_hf_pretrained_config, - file_with_glob_exists, file_with_suffix_exists, get_directory_size_in_gb, print_colored, print_traceback_on_error, set_docstring) @@ -115,6 +114,36 @@ def is_multi_gpu(self) -> bool: return self.world_size > 1 +@dataclass(slots=True) +class CalibConfig: + """ + Calibration configuration. + + Args: + device (Literal['cuda', 'cpu'], default='cuda'): The device to run calibration. + calib_dataset (str, default='cnn_dailymail'): The name or local path of calibration dataset. + calib_batches (int, default=512): The number of batches that the calibration runs. + calib_batch_size (int, default=1): The batch size that the calibration runs. + calib_max_seq_length (int, default=512): The maximum sequence length that the calibration runs. + random_seed (int, default=1234): The random seed used for calibration. + tokenizer_max_seq_length (int, default=2048): The maximum sequence length to initialize tokenizer for calibration. + """ + device: Literal['cuda', 'cpu'] = 'cuda' + calib_dataset: str = 'cnn_dailymail' + calib_batches: int = 512 + calib_batch_size: int = 1 + calib_max_seq_length: int = 512 + random_seed: int = 1234 + tokenizer_max_seq_length: int = 2048 + + @classmethod + def from_dict(cls, config: dict): + return cls(**config) + + def to_dict(self): + return asdict(self) + + class _ModelFormatKind(Enum): HF = 0 TLLM_CKPT = 1 @@ -180,6 +209,10 @@ def from_module(cls, module: Module): revision (str, optional): The revision of the model to use. Default is None. + + load_format (Literal['auto', 'dummy'], default='auto'): The format of the model weights to load. + * 'auto' will try to load the weights from the provided checkpoint. + * 'dummy' will initialize the weights with random values, which is mainly for profiling. """ # The arguments locate in LLM class's kwargs, and will be concatenated to LLM class's apidocs. @@ -204,6 +237,8 @@ def from_module(cls, module: Module): quant_config (QuantConfig, default=QuantConfig()): The quantization configuration for the model. Default is an empty QuantConfig instance. + calib_config (CalibConfig, default=CalibConfig()): The calibration configuration for the model. + embedding_parallel_mode (str, default="SHARDING_ALONG_VOCAB"): The parallel mode for embeddings. share_embedding_table (bool, default=False): Whether to share the embedding table. 
@@ -263,6 +298,8 @@ class LlmArgs: revision: Optional[str] = None + load_format: Literal['auto', 'dummy'] = 'auto' + # LoRA arguments enable_lora: bool = False @@ -275,8 +312,12 @@ class LlmArgs: # BuildConfig is introduced to give users a familiar interface to configure the model building. build_config: Optional[BuildConfig] = None + fast_build: Optional[bool] = False + quant_config: QuantConfig = field(default_factory=QuantConfig) + calib_config: CalibConfig = field(default_factory=CalibConfig) + # A handful of options from PretrainedConfig embedding_parallel_mode: str = 'SHARDING_ALONG_VOCAB' @@ -331,8 +372,6 @@ def __post_init__(self): if self.dtype == 'bfloat16': raise RuntimeError("Pre SM 80 GPUs do not support bfloat16") - self._engine_config: Optional[EngineConfig] = None - self.auto_parallel_config = AutoParallelConfig( sharded_io_allowlist=[ "past_key_value_\\d+", @@ -422,6 +461,11 @@ def setup(self): self.build_config = self.build_config or BuildConfig() + # TODO(xiweny): remove the checker when manage weights support all data types + if self.fast_build and (self.quant_config.quant_algo is QuantAlgo.FP8 + or self.quant_config.quant_algo is None): + self._update_plugin_config("manage_weights", True) + if self.enable_lora: self.build_config.plugin_config.lora_plugin = 'auto' if self.max_lora_rank is not None: @@ -433,7 +477,7 @@ def setup(self): def _perform_config_arbitration(self): ''' Arbitrate the configurations for the model building. The configs between different functional or performance - features might be confilcted, and this method will arbitrate the conflicts and raise errors if necessary. + features might be conflicted, and this method will arbitrate the conflicts and raise errors if necessary. ''' self._config_arbitrator = _ConfigArbitrator() if self.build_config_mutable: @@ -773,24 +817,17 @@ class _ModelRuntimeContext: ''' _ModelRuntimeContext holds the minimum runtime resources for running a model. It could be a runtime cache in MPI nodes. ''' - engine_buffer: Optional[trt.IHostMemory] = None - # engine_config is only used for saving the engine to disk - engine_config: Optional[Union[dict, EngineConfig]] = None + engine: Optional[Engine] = None mapping: Optional[Mapping] = None model_info: Optional[_ModelInfo] = None # This is only used when build-cache is enabled engine_path: Optional[str] = None - @property - def engine(self) -> trt.IHostMemory: - assert self.engine_buffer is not None - return self.engine_buffer - @property def model_arch(self) -> str: # "LlaMACausalForLM" or "OPTForCausalLM" and so on - return self.engine_config.pretrained_config['architecture'] + return self.engine.config.pretrained_config['architecture'] class ModelLoader: @@ -959,16 +996,10 @@ def __call__(self, engine_dir: Optional[Path] = None) -> Path: ) pipeline() - if not hasattr(self, '_engine_config'): - raise RuntimeError("config is not loaded.") - - config = self._engine_config - assert engine_dir runtime_context = _ModelRuntimeContext( - engine_buffer=self._engine_buffer, - engine_config=config, + engine=self._engine, mapping=self.mapping, model_info=self._model_info, ) @@ -1021,28 +1052,37 @@ def copy_hf_tokenizer_data_to_engine_dir(): else: shutil.copy2(src, dst) - engine = Engine(config=model.engine_config, engine=model.engine) - engine.save(engine_dir) + model.engine.save(engine_dir) if rank == 0: copy_hf_tokenizer_data_to_engine_dir() @staticmethod def get_model_format(model_dir: str) -> _ModelFormatKind: ''' Get the format of the model. 
''' - # TODO: migrate to detect version field in config.json after TRTLLM-256 finished - if Path.exists( - Path(model_dir) / 'config.json') and file_with_glob_exists( - model_dir, 'rank*.safetensors'): - return _ModelFormatKind.TLLM_CKPT - if (Path.exists(Path(model_dir) / 'config.json') - and (file_with_suffix_exists(model_dir, '.bin') - or file_with_suffix_exists(model_dir, '.safetensors'))): - return _ModelFormatKind.HF - if Path.exists( - Path(model_dir) / 'config.json') and file_with_suffix_exists( - model_dir, '.engine'): - return _ModelFormatKind.TLLM_ENGINE - raise ValueError(f"Unknown model format for {model_dir}") + if not (Path(model_dir) / 'config.json').exists(): + raise ValueError( + f"Failed to infer model format because no config.json exists in {model_dir}" + ) + + with open(Path(model_dir) / 'config.json') as f: + config = json.load(f) + + try: + if 'pretrained_config' in config and 'build_config' in config: + model_format = _ModelFormatKind.TLLM_ENGINE + EngineConfig.from_json_file(Path(model_dir) / 'config.json') + elif 'architecture' in config and 'dtype' in config: + model_format = _ModelFormatKind.TLLM_CKPT + PretrainedConfig.from_checkpoint(model_dir) + else: + model_format = _ModelFormatKind.HF + AutoConfig.from_hugging_face(model_dir) + except Exception as e: + raise ValueError( + f"Inferred model format {model_format}, but failed to load config.json: {e}" + ) + else: + return model_format def _download_hf_model(self): ''' Download HF model from third-party model hub like www.modelscope.cn or huggingface. ''' @@ -1065,7 +1105,16 @@ def _load_model_from_hf(self): assert self._model_dir is not None model_cls = AutoModelForCausalLM.get_trtllm_model_class( self._model_dir, self.llm_args.trust_remote_code) - if self.llm_args.quant_config.requires_calibration: + if self.llm_args.load_format == 'dummy': + config = model_cls.config_class.from_hugging_face( + str(self._model_dir), + dtype=self.llm_args.dtype, + mapping=self.mapping, + quant_config=self.llm_args.quant_config, + **self.convert_checkpoint_options, + ) + self.model = model_cls(config) + elif self.llm_args.quant_config.requires_calibration: assert self.workspace is not None checkpoint_dir = f"{self.workspace}/quantized-checkpoint" if self.rank == 0: @@ -1075,6 +1124,7 @@ def _load_model_from_hf(self): dtype=self.llm_args.dtype, mapping=self.mapping, quant_config=self.llm_args.quant_config, + **self.llm_args.calib_config.to_dict(), trust_remote_code=self.llm_args.trust_remote_code, ) if self.llm_args.parallel_config.is_multi_gpu: @@ -1108,8 +1158,11 @@ def _load_model_from_ckpt(self): assert architecture in MODEL_MAP, \ f"Unsupported model architecture: {architecture}" model_cls = MODEL_MAP[architecture] - self.model = model_cls.from_checkpoint(self._model_dir, - config=self.pretrained_config) + if self.llm_args.load_format == 'dummy': + self.model = model_cls(self.pretrained_config) + else: + self.model = model_cls.from_checkpoint( + self._model_dir, config=self.pretrained_config) self._model_info = _ModelInfo.from_pretrained_config( self.pretrained_config) @@ -1138,10 +1191,7 @@ def _build_engine(self): self.model.config.mapping.rank = self.rank assert self.model is not None, "model is loaded yet." 
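# A short sketch of how the reworked ModelLoader.get_model_format above can be used.
# The checkpoint path is hypothetical; only the three _ModelFormatKind outcomes that
# appear in this diff (TLLM_ENGINE, TLLM_CKPT, HF) are assumed, and the method raises
# ValueError if config.json is missing or cannot be validated.
from tensorrt_llm.hlapi.llm_utils import ModelLoader, _ModelFormatKind

fmt = ModelLoader.get_model_format("/path/to/model_or_engine_dir")
if fmt is _ModelFormatKind.TLLM_ENGINE:
    print("config.json carries 'pretrained_config' and 'build_config': prebuilt engine")
elif fmt is _ModelFormatKind.TLLM_CKPT:
    print("config.json carries 'architecture' and 'dtype': TRT-LLM checkpoint")
else:
    print("otherwise the directory is treated as a Hugging Face checkpoint")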
- engine = build(self.model, copied_build_config) - - self._engine_buffer = engine.engine - self._engine_config = engine.config + self._engine = build(self.model, copied_build_config) self.mapping = self.model.config.mapping # delete the model explicitly to free all the build-time resources @@ -1162,9 +1212,7 @@ def _save_engine_for_runtime(self): def _load_engine_buffer(self): # Load engine buffer from disk - engine = Engine.from_dir(self._model_dir) - self._engine_buffer = engine.engine - self._engine_config = engine.config + self._engine = Engine.from_dir(self._model_dir) @staticmethod def load_extra_build_configs_from_engine( @@ -1322,7 +1370,7 @@ def build_task(engine_dir: Path): if model_format is not _ModelFormatKind.TLLM_ENGINE: model_loader_kwargs = { 'llm_args': self.llm_args, - 'workspace': self.workspace, + 'workspace': str(self.workspace), 'llm_build_stats': self.llm_build_stats, } @@ -1397,7 +1445,7 @@ def save(self, engine_dir: Path): @dataclass class LlmBuildStats: ''' LlmBuildStats is the statistics for the LLM model building. ''' - # Whether the cache is hitted for the engine + # Whether the cache is hit for the engine cache_hitted: bool = False cache_info: Optional[str] = None diff --git a/tensorrt_llm/hlapi/utils.py b/tensorrt_llm/hlapi/utils.py index 9bf2cbe43..6db9d38ee 100644 --- a/tensorrt_llm/hlapi/utils.py +++ b/tensorrt_llm/hlapi/utils.py @@ -201,6 +201,24 @@ def _get_stop_words(self) -> List[List[int]]: "please call the setup method.") return words + self._stop_word_ids + def _get_stop_reasons_and_words( + self) -> List[Tuple[Union[str, int], List[int]]]: + stop_reasons = [] + if self.stop_token_ids is not None: + stop_reasons.extend(self.stop_token_ids) + if self.stop is not None: + if isinstance(self.stop, str): + stop_reasons.append(self.stop) + else: + stop_reasons.extend(self.stop) + stop_words = self._get_stop_words() + if len(stop_reasons) != len(stop_words): + raise RuntimeError( + f"The number of {self.__class__.__name__}.stop_token_ids ({self.stop_token_ids}) " + f"and {self.__class__.__name__}.stop ({self.stop}) are inconsistent with the " + f"processed stop_words ({stop_words}).") + return list(zip(stop_reasons, stop_words)) + def _get_sampling_config(self) -> tllme.SamplingConfig: expected_fields = [ "beam_width", "top_k", "top_p", "top_p_min", "top_p_reset_ids", @@ -451,10 +469,17 @@ def run(self): if not self.task(**self.kwargs): break except Exception as e: - logger.error(f"Error in thread {self.name}: {e}") + logger.error( + f"Error in thread {self.name}: {e}\n{traceback.format_exc()}" + ) self.error_queue.put(e) logger.info(f"Thread {self.name} stopped.") def stop(self): self.stop_event.set() + + +def enable_llm_debug() -> bool: + ''' Tell whether to enable the debug mode for LLM class. 
''' + return os.environ.get("TLLM_LLM_ENABLE_DEBUG", "0") == "1" diff --git a/tensorrt_llm/layers/__init__.py b/tensorrt_llm/layers/__init__.py index e41e087ac..8d22806b1 100644 --- a/tensorrt_llm/layers/__init__.py +++ b/tensorrt_llm/layers/__init__.py @@ -23,7 +23,7 @@ from .linear import ColumnLinear, Linear, RowLinear from .lora import Lora, LoraParams, LoraRuntimeParams from .mlp import MLP, FusedGatedMLP, GatedMLP -from .moe import MOE, MoeConfig +from .moe import MOE, MoeConfig, SharedMoE from .normalization import GroupNorm, LayerNorm, RmsNorm from .pooling import AvgPool2d from .recurrent import FusedRgLru, GroupedLinear, Recurrent, RgLru @@ -61,6 +61,7 @@ 'LoraRuntimeParams', 'MOE', 'MoeConfig', + 'SharedMoE', 'Mamba', 'Mamba2', 'Recurrent', diff --git a/tensorrt_llm/layers/attention.py b/tensorrt_llm/layers/attention.py index 995425bc1..1c8c66159 100644 --- a/tensorrt_llm/layers/attention.py +++ b/tensorrt_llm/layers/attention.py @@ -28,7 +28,7 @@ allgather, arange, bert_attention, cast, clip, concat, constant, embedding, expand, expand_dims, expand_mask, generate_alibi_biases, generate_alibi_slopes, - gpt_attention, matmul) + gpt_attention, gt, matmul) from ..functional import max as fmax from ..functional import (minimum, repeat_interleave, shape, slice, softmax, split, unsqueeze, where) @@ -251,11 +251,13 @@ def __init__(self, kv_cache_block_offsets: Tensor = None, host_kv_cache_block_offsets: Tensor = None, host_kv_cache_pool_pointers: Tensor = None, + host_kv_cache_pool_mapping: Tensor = None, cache_indirection: Tensor = None, past_key_value_length: Tensor = None, cross_kv_cache_block_offsets: Tensor = None, host_cross_kv_cache_block_offsets: Tensor = None, - host_cross_kv_cache_pool_pointers: Tensor = None): + host_cross_kv_cache_pool_pointers: Tensor = None, + host_cross_kv_cache_pool_mapping: Tensor = None): self.past_key_value = past_key_value self.host_past_key_value_lengths = host_past_key_value_lengths self.host_max_attention_window_sizes = host_max_attention_window_sizes @@ -263,9 +265,11 @@ def __init__(self, self.kv_cache_block_offsets = kv_cache_block_offsets self.host_kv_cache_block_offsets = host_kv_cache_block_offsets self.host_kv_cache_pool_pointers = host_kv_cache_pool_pointers + self.host_kv_cache_pool_mapping = host_kv_cache_pool_mapping self.cross_kv_cache_block_offsets = cross_kv_cache_block_offsets self.host_cross_kv_cache_block_offsets = host_cross_kv_cache_block_offsets self.host_cross_kv_cache_pool_pointers = host_cross_kv_cache_pool_pointers + self.host_cross_kv_cache_pool_mapping = host_cross_kv_cache_pool_mapping self.cache_indirection = cache_indirection # self.past_key_value_length = past_key_value_length @@ -349,7 +353,8 @@ def __init__(self, max_attn_value=0.0, block_sparse_params=None, use_implicit_relative_attention=False, - reorder=False): + reorder=False, + layer_idx_in_cache_pool=None): super().__init__() self.local_layer_idx = local_layer_idx @@ -357,6 +362,7 @@ def __init__(self, self.attention_mask_type = attention_mask_type self.attention_head_size = hidden_size // num_attention_heads if attention_head_size is None else attention_head_size self.num_kv_heads = num_kv_heads + self.layer_idx_in_cache_pool = layer_idx_in_cache_pool if layer_idx_in_cache_pool is not None else local_layer_idx assert num_attention_heads % tp_size == 0, \ "num_attention_heads must be divisible by tp_size" self.num_attention_heads = num_attention_heads // tp_size @@ -852,41 +858,32 @@ def compute_cross_qkv(encoder_output): attention_output_orig_quant_scale = 
self.attention_output_orig_quant_scale.value if self.attention_output_orig_quant_scale is not None else None if self.position_embedding_type == PositionEmbeddingType.long_rope: - short = slice( - attention_params. - embed_positions_short_factors_for_attention_plugin, - concat([0, 0, 0]), - concat([ - max(attention_params.sequence_length, - self.original_max_position_embeddings), - self.rotary_embedding_dim // 2, 2 - ])) - long = slice( - attention_params. - embed_positions_long_factors_for_attention_plugin, - concat([0, 0, 0]), - concat([ - max(attention_params.sequence_length, - self.original_max_position_embeddings), - self.rotary_embedding_dim // 2, 2 - ])) - short = short.view((1, -1)) - long = long.view((1, -1)) - embed_positions = concat([short, long], dim=0) - select = where( - fmax(attention_params.sequence_length, dim=0) <= - self.original_max_position_embeddings, 0, 1) - rotary_cos_sin = slice(embed_positions, - concat([select, 0]), - sizes=concat([1, shape(long, 1)])) - short_inv_freq = attention_params.short_inv_freq - long_inv_freq = attention_params.long_inv_freq - concat_inv_freq = concat([short_inv_freq, long_inv_freq], dim=0) - rotary_inv_freq = slice(concat_inv_freq, - concat([select, 0]), - sizes=concat( - [1, shape(long_inv_freq, 1)])) - rotary_inv_freq = rotary_inv_freq.view((-1, )) + max_seq_length = fmax(attention_params.sequence_length, dim=0) + floor_seq_length = maximum( + max_seq_length, self.original_max_position_embeddings) + + short = attention_params.embed_positions_short_factors_for_attention_plugin + long = attention_params.embed_positions_long_factors_for_attention_plugin + + starts = concat([0, 0, 0]) + shapes = concat( + [floor_seq_length, self.rotary_embedding_dim // 2, 2]) + + short = slice(short, starts, shapes).view((1, -1)) + long = slice(long, starts, shapes).view((1, -1)) + + use_long_factors = gt(max_seq_length, + self.original_max_position_embeddings) + + cond = Conditional(use_long_factors) + true_val = cond.add_input(long) + false_val = cond.add_input(short) + rotary_cos_sin = cond.add_output(true_val, false_val) + + cond = Conditional(use_long_factors) + true_val = cond.add_input(attention_params.long_inv_freq) + false_val = cond.add_input(attention_params.short_inv_freq) + rotary_inv_freq = cond.add_output(true_val, false_val) else: # The rotary inv freq can be pre-computed. rotary_inv_freq = getattr(attention_params, "rotary_inv_freq", @@ -916,6 +913,7 @@ def compute_cross_qkv(encoder_output): layer_idx=self.local_layer_idx, num_heads=self.num_attention_heads, num_kv_heads=self.num_attention_kv_heads, + layer_idx_in_cache_pool=self.layer_idx_in_cache_pool, hidden_size_per_head=self.attention_head_size, q_scaling=self.q_scaling, rotary_embedding_dim=self.rotary_embedding_dim, @@ -956,6 +954,9 @@ def compute_cross_qkv(encoder_output): host_kv_cache_pool_pointers=kv_cache_params. host_kv_cache_pool_pointers if not self.cross_attention else kv_cache_params.host_cross_kv_cache_pool_pointers, + host_kv_cache_pool_mapping=kv_cache_params. 
+ host_kv_cache_pool_mapping if not self.cross_attention else + kv_cache_params.host_cross_kv_cache_pool_mapping, do_cross_attention=self.cross_attention, cross_qkv=cross_qkv, cross_qkv_length=attention_params.encoder_max_input_length, @@ -1025,24 +1026,18 @@ def transpose_for_scores(x, if self.position_embedding_type.is_rope(): if self.position_embedding_type == PositionEmbeddingType.long_rope: sequence_length = shape(hidden_states, 1) + floor_seq_length = maximum( + sequence_length, self.original_max_position_embeddings) + + starts = concat([0, 0, 0]) + shapes = concat( + [1, floor_seq_length, self.rotary_embedding_dim]) short = slice( - attention_params.embed_positions_short_factors, - concat([0, 0, 0]), - concat([ - 1, - max(sequence_length, - self.original_max_position_embeddings), - self.rotary_embedding_dim - ])) - long = slice( - attention_params.embed_positions_long_factors, - concat([0, 0, 0]), - concat([ - 1, - max(sequence_length, - self.original_max_position_embeddings), - self.rotary_embedding_dim - ])) + attention_params.embed_positions_short_factors, starts, + shapes) + long = slice(attention_params.embed_positions_long_factors, + starts, shapes) + embed_positions = concat([short, long], dim=0) select = where( sequence_length <= @@ -1702,6 +1697,8 @@ def forward(self, host_kv_cache_block_offsets, host_kv_cache_pool_pointers=kv_cache_params. host_kv_cache_pool_pointers, + host_kv_cache_pool_mapping=kv_cache_params. + host_kv_cache_pool_mapping, do_cross_attention=self.cross_attention, cross_qkv=None, cross_qkv_length=attention_params.encoder_max_input_length, diff --git a/tensorrt_llm/layers/embedding.py b/tensorrt_llm/layers/embedding.py index 869a05a43..f822cfec1 100644 --- a/tensorrt_llm/layers/embedding.py +++ b/tensorrt_llm/layers/embedding.py @@ -90,15 +90,10 @@ def weight_loader(self, mapping: Mapping, param: Parameter, param.value = loaded_weight def postprocess(self, tllm_key, weights, **kwargs): - config = kwargs.get("config", None) if weights is None: return {} weights = weights.to(str_dtype_to_torch(self.dtype)) - if config.share_embedding_table: - return {} - else: - weights = weights.clone() - return {tllm_key: weights} + return {tllm_key: weights} class PromptTuningEmbedding(Embedding): @@ -133,7 +128,7 @@ def forward(self, tokens, prompt_embedding_table, tasks, task_vocab_size): Parameters: tokens : Tensor - the ids to embbed, size [batch_size, seq_len] + the ids to embed, size [batch_size, seq_len] prompt_embedding_table : Tensor the additional embedding table for prompt-tuned tokens, size [num_tasks * num_tokens_per_task, hidden_size] diff --git a/tensorrt_llm/layers/linear.py b/tensorrt_llm/layers/linear.py index 2608708ac..80d6f7f38 100644 --- a/tensorrt_llm/layers/linear.py +++ b/tensorrt_llm/layers/linear.py @@ -358,9 +358,9 @@ def postprocess(self, tllm_key, weights, **kwargs): config = kwargs.get("config", None) if self.is_qkv: if isinstance(weights, list): + head_size = config.hidden_size // config.num_attention_heads if config.head_size is None else config.head_size if hasattr(config, "remove_duplicated_kv_heads"): if config.remove_duplicated_kv_heads: - head_size = config.hidden_size // config.num_attention_heads if config.head_size is None else config.head_size k, v = weights[1:] k = k.reshape([ k.shape[0] // head_size // 2, 2, head_size, @@ -376,6 +376,22 @@ def postprocess(self, tllm_key, weights, **kwargs): v = v[:, 0].reshape([-1, self.in_features]) weights[1] = k weights[2] = v + # Duplicate kv heads in case of invalid TP size + tp_size = 
config.mapping.tp_size + num_kv_heads = config.num_key_value_heads + if num_kv_heads < tp_size: + for qkv_idx in range(3): + v = weights[qkv_idx] + if qkv_idx > 0: + assert tp_size % num_kv_heads == 0 + reps = tp_size // num_kv_heads + v = v.reshape(num_kv_heads, head_size, + -1)[:, None, :, :].expand( + num_kv_heads, reps, head_size, + v.shape[1]) + v = v.reshape(num_kv_heads * reps * head_size, -1) + weights[qkv_idx] = v.chunk( + tp_size, self.tp_dim)[config.mapping.tp_rank] weights = torch.cat(weights) if using_head_as_leading_dim: # Reorder [n_head, 3, head_dim, ...] into [3, n_head, head_dim, ...] diff --git a/tensorrt_llm/layers/mlp.py b/tensorrt_llm/layers/mlp.py index 05a760680..312a841eb 100644 --- a/tensorrt_llm/layers/mlp.py +++ b/tensorrt_llm/layers/mlp.py @@ -262,7 +262,7 @@ def __init__( def fc_gate_plugin(self, hidden_states, lora_layer_params=None): # Combine the following pattern # - # SiLU(FC(x)) + Gate(x) + # SiLU(FC(x)) * Gate(x) # # into: # @@ -319,7 +319,7 @@ def fc_gate_plugin(self, hidden_states, lora_layer_params=None): def fc_gate(self, hidden_states, lora_layer_params=None): # Combine the following pattern # - # SiLU(FC(x)) + Gate(x) + # SiLU(FC(x)) * Gate(x) # # into: # @@ -348,7 +348,6 @@ def forward(self, lora_layer_params=None, reduce_fusion_params: Optional[AllReduceFusionParams] = None): if default_net().plugin_config.gemm_swiglu_plugin: - assert self.dtype == 'float16', f"Currently limited support, got {self.dtype}" inter = self.fc_gate_plugin(hidden_states, lora_layer_params) else: inter = self.fc_gate(hidden_states, lora_layer_params) diff --git a/tensorrt_llm/layers/moe.py b/tensorrt_llm/layers/moe.py index 0a93f1d02..e05ea6de3 100644 --- a/tensorrt_llm/layers/moe.py +++ b/tensorrt_llm/layers/moe.py @@ -25,7 +25,7 @@ from tensorrt_llm.layers.lora import LoraParams from .._common import default_net, default_trtnet -from .._utils import int32_array +from .._utils import QuantModeWrapper, int32_array from ..functional import (AllReduceFusionParams, _add_plugin_info, _create_tensor, allreduce, cast, concat, constant, div, expand, gather_nd, is_gated_activation, @@ -61,6 +61,9 @@ class ExpertScaleNormalizationMode(IntEnum): SPARSE_MIXER = 2 num_experts: int = 0 + moe_intermediate_size: int = 0 # Add moe inter size (shanshan) + num_shared_experts: int = 0 # Add number of shared experts (shanshan) + top_k: int = 0 normalization_mode: ExpertScaleNormalizationMode = ExpertScaleNormalizationMode.RENORMALIZE sparse_mixer_epsilon: float = 0.01 @@ -167,6 +170,10 @@ def from_parameter(x): p_output_type_id = trt.PluginField( "output_type_id", np.array([int(output_dtype)], dtype=np.int32), trt.PluginFieldType.INT32) + + if isinstance(quant_mode, QuantModeWrapper): + # We only need to get one quant mode here for specific moe layer + quant_mode = quant_mode[0] p_quant_mode = trt.PluginField("quant_mode", np.array([int(quant_mode)], dtype=np.int32), trt.PluginFieldType.INT32) @@ -832,3 +839,51 @@ def load_weights(self, moe: MOE): if is_gated_act: expert.gate.bias.value = experts_bias_1_raw[ i, :self.expert_inter_size] + + +# Add SharedMoE class (shanshan) +class SharedMoE(Module): + + def __init__(self, + moe_config: MoeConfig, + hidden_size: int, + ffn_hidden_size: int, + hidden_act: str, + mapping: Mapping = Mapping(), + bias: bool = True, + dtype=None, + **kwargs): + super().__init__() + + self.moe_config = moe_config + self.hidden_size = hidden_size + self.ffn_hidden_size = ffn_hidden_size + self.hidden_act = hidden_act + self.mapping = mapping + self.bias = bias + 
self.dtype = dtype + + self.moe = MOE(hidden_size=self.hidden_size, + moe_config=self.moe_config, + mapping=self.mapping, + ffn_hidden_size=self.moe_config.moe_intermediate_size, + hidden_act=self.hidden_act, + dtype=self.dtype, + bias=False, + tp_group=self.mapping.tp_group, + tp_size=self.mapping.tp_size) + ClsMLP = GatedMLP if is_gated_activation(self.hidden_act) else MLP + self.shared_experts = ClsMLP( + hidden_size=self.hidden_size, + ffn_hidden_size=self.ffn_hidden_size, + hidden_act=non_gated_version(self.hidden_act), # deepseek use SiLU + bias=False, + dtype=self.dtype, + tp_group=self.mapping.tp_group, + tp_size=self.mapping.tp_size) + + def forward(self, hidden_states): + if self.moe_config.num_shared_experts > 0: + return self.moe(hidden_states) + self.shared_experts(hidden_states) + else: + return self.moe(hidden_states) diff --git a/tensorrt_llm/lora_manager.py b/tensorrt_llm/lora_manager.py index 00507a118..18ed817e1 100644 --- a/tensorrt_llm/lora_manager.py +++ b/tensorrt_llm/lora_manager.py @@ -244,6 +244,15 @@ def load_hf_lora( if len(lora_config.lora_target_modules) == 0: lora_config.lora_target_modules = lora_loader.get_target_modules( trtllm_modules_to_hf_modules) + if len(lora_config.lora_target_modules) == 0: + raise ValueError( + "lora_target_modules is empty. " + "Please specify lora_target_modules or provide lora_dir to infer lora_target_modules." + ) + + missing_qkv_modules = LoraManager.get_missing_qkv_modules( + lora_config.lora_target_modules) + lora_config.lora_target_modules.extend(missing_qkv_modules) if lora_loader.is_valid: config = model.config diff --git a/tensorrt_llm/models/__init__.py b/tensorrt_llm/models/__init__.py index 39481b3a0..e5dcc03a4 100755 --- a/tensorrt_llm/models/__init__.py +++ b/tensorrt_llm/models/__init__.py @@ -22,7 +22,7 @@ from .cogvlm.model import CogVLMForCausalLM from .dbrx.config import DbrxConfig from .dbrx.model import DbrxForCausalLM -from .deci.model import DeciLMForCausalLM +from .deepseek_v1.model import DeepseekForCausalLM from .dit.model import DiT from .enc_dec.model import DecoderModel, EncoderModel, WhisperEncoder from .falcon.config import FalconConfig @@ -43,6 +43,7 @@ from .modeling_utils import (PretrainedConfig, PretrainedModel, SpeculativeDecodingMode) from .mpt.model import MPTForCausalLM, MPTModel +from .nemotron_nas.model import DeciLMForCausalLM from .opt.model import OPTForCausalLM, OPTModel from .phi3.model import Phi3ForCausalLM, Phi3Model from .phi.model import PhiForCausalLM, PhiModel @@ -57,6 +58,7 @@ 'BloomModel', 'BloomForCausalLM', 'DiT', + 'DeepseekForCausalLM', 'FalconConfig', 'FalconForCausalLM', 'FalconModel', @@ -95,6 +97,7 @@ 'PretrainedModel', 'WhisperEncoder', 'MambaForCausalLM', + 'MambaConfig', 'MPTForCausalLM', 'MPTModel', 'SkyworkForCausalLM', @@ -125,6 +128,7 @@ 'Phi3ForCausalLM': Phi3ForCausalLM, 'Phi3VForCausalLM': Phi3ForCausalLM, 'Phi3SmallForCausalLM': Phi3ForCausalLM, + 'PhiMoEForCausalLM': Phi3ForCausalLM, 'MambaForCausalLM': MambaForCausalLM, 'GPTNeoXForCausalLM': GPTNeoXForCausalLM, 'GPTJForCausalLM': GPTJForCausalLM, @@ -158,5 +162,6 @@ 'RecurrentGemmaForCausalLM': RecurrentGemmaForCausalLM, 'CogVLMForCausalLM': CogVLMForCausalLM, 'DiT': DiT, + 'DeepseekForCausalLM': DeepseekForCausalLM, 'DeciLMForCausalLM': DeciLMForCausalLM, } diff --git a/tensorrt_llm/models/automodel.py b/tensorrt_llm/models/automodel.py index a65781a88..9e382d9df 100644 --- a/tensorrt_llm/models/automodel.py +++ b/tensorrt_llm/models/automodel.py @@ -17,7 +17,14 @@ def from_hugging_face(hf_model_or_dir, 
hf_config = transformers.AutoConfig.from_pretrained( hf_model_or_dir, trust_remote_code=True) - hf_arch = hf_config.architectures[0] + + if hasattr(hf_config, + 'architectures') and hf_config.architectures is not None: + hf_arch = hf_config.architectures[0] + elif hasattr(hf_config, + 'model_type') and hf_config.model_type.find('mamba') != -1: + hf_arch = 'MambaForCausalLM' + trtllm_model_cls = MODEL_MAP.get(hf_arch, None) if trtllm_model_cls is None: raise NotImplementedError( @@ -47,7 +54,14 @@ def get_trtllm_model_class(hf_model_or_dir, trust_remote_code=False): hf_config = transformers.AutoConfig.from_pretrained( hf_model_or_dir, trust_remote_code=trust_remote_code) - hf_arch = hf_config.architectures[0] + + if hasattr(hf_config, + 'architectures') and hf_config.architectures is not None: + hf_arch = hf_config.architectures[0] + elif hasattr(hf_config, + 'model_type') and hf_config.model_type.find('mamba') != -1: + hf_arch = 'MambaForCausalLM' + trtllm_model_cls = MODEL_MAP.get(hf_arch, None) if trtllm_model_cls is None: diff --git a/tensorrt_llm/models/chatglm/convert.py b/tensorrt_llm/models/chatglm/convert.py index 9d1c59a61..77aaca23b 100644 --- a/tensorrt_llm/models/chatglm/convert.py +++ b/tensorrt_llm/models/chatglm/convert.py @@ -434,9 +434,8 @@ def load_weights_from_hf_model(hf_model: AutoModel, is_qkv=True, multi_query_mode=True) weights[ - f'{tllm_prex}.attention.kv_cache_scaling_factor'] = torch.from_numpy( - np.array([qkv_vals_int8['scale_y_quant_orig']], - dtype=np.float32)).contiguous() + f'{tllm_prex}.attention.kv_cache_scaling_factor'] = qkv_vals_int8[ + 'scale_y_quant_orig'].contiguous() # Attention dense attn_dense_weight, attn_dense_bias = get_weight_and_bias( diff --git a/tensorrt_llm/models/convert_utils.py b/tensorrt_llm/models/convert_utils.py index 7d25399fe..877a62444 100644 --- a/tensorrt_llm/models/convert_utils.py +++ b/tensorrt_llm/models/convert_utils.py @@ -67,14 +67,14 @@ def get_weight(params: Dict[str, torch.Tensor], prefix: str, dtype: torch.dtype) -> torch.Tensor: if f'{prefix}.weight' not in params: return None - return params[f'{prefix}.weight'].to(dtype).detach().cpu() + return params[f'{prefix}.weight'].to(dtype).detach().cpu().contiguous() def get_bias(params: Dict[str, torch.Tensor], prefix: str, dtype: torch.dtype) -> torch.Tensor: if f'{prefix}.bias' not in params: return None - return params[f'{prefix}.bias'].to(dtype).detach().cpu() + return params[f'{prefix}.bias'].to(dtype).detach().cpu().contiguous() def get_weight_and_bias(params: Dict[str, torch.Tensor], prefix: str, @@ -248,6 +248,7 @@ def has_safetensors(model_dir: str): 'ccdv/cnn_dailymail': ('3.0.0', 'train', 'article'), 'cnn_dailymail': ('3.0.0', 'train', 'article'), 'lambada': (None, 'validation', 'text'), + '': (None, 'train', 'text'), # Default value in HF } diff --git a/tensorrt_llm/models/deepseek_v1/__init__.py b/tensorrt_llm/models/deepseek_v1/__init__.py new file mode 100644 index 000000000..71bf6d298 --- /dev/null +++ b/tensorrt_llm/models/deepseek_v1/__init__.py @@ -0,0 +1,14 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/tensorrt_llm/models/deepseek_v1/convert.py b/tensorrt_llm/models/deepseek_v1/convert.py new file mode 100644 index 000000000..0e7edb796 --- /dev/null +++ b/tensorrt_llm/models/deepseek_v1/convert.py @@ -0,0 +1,361 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import time + +import torch +from transformers import AutoConfig, AutoModelForCausalLM + +from tensorrt_llm.layers import MoeConfig + +from ..._utils import pad_vocab_size, release_gc +from ...mapping import Mapping + + +## Convert config parameters to dict +def create_trt_config_from_hf(model_dir, + dtype, + mapping: Mapping, + override_fields: dict = {}): + config = {} + assert isinstance(model_dir, str) + hf_config = AutoConfig.from_pretrained(model_dir, trust_remote_code=True) + dtype = dtype + n_layer = hf_config.num_hidden_layers + n_head = hf_config.num_attention_heads + n_embd = hf_config.hidden_size + inter_size = hf_config.intermediate_size + n_kv_head = hf_config.num_key_value_heads + vocab_size = hf_config.vocab_size + n_positions = hf_config.max_position_embeddings + hidden_act = 'swiglu' # TRT-LLM request make gated activation explicit for MOE implementation + rotary_base = hf_config.rope_theta + rms_norm_eps = hf_config.rms_norm_eps + moe_num_experts = hf_config.n_routed_experts + moe_top_k = hf_config.num_experts_per_tok + ## shanshan fix + moe_renorm_mode = MoeConfig.ExpertScaleNormalizationMode.NONE + moe_num_shared_experts = hf_config.n_shared_experts + moe_inter_size = hf_config.moe_intermediate_size + rotary_scaling = hf_config.rope_scaling + + config = { + 'architecture': "DeepseekForCausalLM", + 'dtype': dtype, + 'logits_type': 'float32', + 'num_hidden_layers': n_layer, + 'num_attention_heads': n_head, + 'hidden_size': n_embd, + 'intermediate_size': inter_size, + 'num_key_value_heads': n_kv_head, + 'vocab_size': vocab_size, + 'position_embedding_type': 'rope_gpt_neox', + 'max_position_embeddings': n_positions, + 'hidden_act': hidden_act, + 'rotary_base': rotary_base, + 'norm_epsilon': rms_norm_eps, + 'rotary_scaling': rotary_scaling, + 'moe_num_experts': moe_num_experts, + 'moe_top_k': moe_top_k, + 'moe_renorm_mode': moe_renorm_mode, + 'moe_num_shared_experts': moe_num_shared_experts, + 'moe_inter_size': moe_inter_size, + 'mapping': { + 'world_size': mapping.tp_size * mapping.pp_size, + 'tp_size': mapping.tp_size, + 'pp_size': mapping.pp_size, + 'moe_tp_size': mapping.moe_tp_size, + 'moe_ep_size': mapping.moe_ep_size, + }, + } + 
config.update(override_fields) + + moe_config = MoeConfig(num_experts=config['moe_num_experts'], + moe_intermediate_size=config['moe_inter_size'], + num_shared_experts=config['moe_num_shared_experts'], + top_k=config['moe_top_k'], + normalization_mode=config['moe_renorm_mode']) + moe_config.validate() + + return config + + +## Get HF model +def load_hf_deepseek(model_dir): + model = AutoModelForCausalLM.from_pretrained(model_dir, + device_map='auto', + torch_dtype='auto', + trust_remote_code=True) + return model + + +## Prepare weights for TP +def split(v, tp_size, idx, dim=0): + if tp_size == 1: + return v + if len(v.shape) == 1: + return torch.chunk(v, tp_size)[idx].contiguous() + else: + return torch.chunk(v, tp_size, dim=dim)[idx].contiguous() + + +def split_qkv_tp(v, n_head, n_hidden, tensor_parallel, rank): + """ + Splits the QKV matrix according to tensor parallelism + """ + v = v.reshape(3, n_hidden, n_hidden) + split_v = split(v, tensor_parallel, rank, dim=1) + split_v = split_v.reshape(3 * (n_hidden // tensor_parallel), n_hidden) + return split_v.contiguous() + + +def split_matrix_tp(v, tensor_parallel, rank, dim): + return split(v, tensor_parallel, rank, dim=dim) + + +def get_weight(config, prefix, dtype, postfix='.weight'): + if config[prefix + postfix].dtype != dtype: + config[prefix + postfix].data = config[prefix + postfix].to(dtype) + return config[prefix + postfix].detach().cpu() + + +def get_trtllm_linear_weight(weight, prefix, postfix='weight'): + results = {} + results[prefix + postfix] = weight + + return results + + +def convert_deepseek(hf_model, + config, + mapping, + dtype='float32', + use_parallel_embedding=False, + sharding_dim=0, + share_embedding_table=False): + + weights = {} + tik = time.time() + mapping.tp_size + model_params = dict(hf_model.named_parameters()) + dtype = getattr(torch, dtype) + moe_config = MoeConfig(num_experts=config['moe_num_experts'], + moe_intermediate_size=config['moe_inter_size'], + num_shared_experts=config['moe_num_shared_experts'], + top_k=config['moe_top_k'], + normalization_mode=config['moe_renorm_mode']) + + layers_range = mapping.pp_layers(config['num_hidden_layers']) + + def convert_layer(l): + prefix = f'model.layers.{l}.' + print(prefix) + trtllm_prex = f'transformer.layers.{l - layers_range[0]}.' 
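# A small illustration of the tensor-parallel split helpers defined above
# (split / split_matrix_tp); the toy weight shape is made up for demonstration only.
import torch

w = torch.arange(24, dtype=torch.float32).reshape(4, 6)  # toy [out_features, in_features] weight

# Column-parallel style: each of the 2 ranks gets a contiguous [2, 6] slice along dim 0.
shard_rank0 = split_matrix_tp(w, 2, 0, dim=0)
shard_rank1 = split_matrix_tp(w, 2, 1, dim=0)
assert shard_rank0.shape == (2, 6) and shard_rank1.shape == (2, 6)

# Row-parallel style: splitting along dim 1 instead yields two [4, 3] shards.
assert split_matrix_tp(w, 2, 0, dim=1).shape == (4, 3)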
+ q_weight = get_weight(model_params, prefix + 'self_attn.q_proj', dtype) + k_weight = get_weight(model_params, prefix + 'self_attn.k_proj', dtype) + v_weight = get_weight(model_params, prefix + 'self_attn.v_proj', dtype) + + qkv_weight = torch.cat([q_weight, k_weight, v_weight], dim=0) + + split_v = split_qkv_tp(qkv_weight, config['num_attention_heads'], + config['hidden_size'], mapping.tp_size, + mapping.tp_rank) + + weights.update( + get_trtllm_linear_weight(split_v, trtllm_prex + 'attention.qkv.')) + + attn_dense_weight = get_weight(model_params, + prefix + 'self_attn.o_proj', dtype) + split_v = split_matrix_tp(attn_dense_weight, + mapping.tp_size, + mapping.tp_rank, + dim=1) + + weights.update( + get_trtllm_linear_weight(split_v, trtllm_prex + 'attention.dense.')) + + if moe_config.has_moe() and l > 0: + rank_experts = list(range(moe_config.num_experts)) + if mapping.has_moe_ep(): + rank_experts = mapping.ep_experts(moe_config.num_experts) + for suffix in ["gate_proj", "down_proj", "up_proj"]: + model_params[f'model.layers.{l}.mlp.experts.{suffix}.weight'] = \ + torch.stack([model_params[f'model.layers.{l}.mlp.experts.{expert}.{suffix}.weight'].detach().cpu() + for expert in rank_experts]) + + gate_proj = model_params[ + f'model.layers.{l}.mlp.experts.gate_proj.weight'] + down_proj = model_params[ + f'model.layers.{l}.mlp.experts.down_proj.weight'] + up_proj = model_params[ + f'model.layers.{l}.mlp.experts.up_proj.weight'] + if mapping.has_moe_tp(): + gate_proj = split(gate_proj, + mapping.tp_size, + mapping.tp_rank, + dim=1) + down_proj = split(down_proj, + mapping.tp_size, + mapping.tp_rank, + dim=2) + up_proj = split(up_proj, + mapping.tp_size, + mapping.tp_rank, + dim=1) + + model_params[ + f'model.layers.{l}.mlp.experts.up_gate_proj.weight'] = torch.concat( + [up_proj, gate_proj], dim=-2) + model_params[ + f'model.layers.{l}.mlp.experts.down_proj.weight'] = down_proj + + ## mlp.experts.down_proj.weight + moe_experts_down_proj_weights = get_weight( + model_params, prefix + 'mlp.experts.down_proj', dtype) + weights.update( + get_trtllm_linear_weight(moe_experts_down_proj_weights, + trtllm_prex + 'mlp.moe.proj.')) + ##mlp.experts.up_gate.weight + moe_experts_up_gate_proj_weights = get_weight( + model_params, prefix + 'mlp.experts.up_gate_proj', dtype) + weights.update( + get_trtllm_linear_weight(moe_experts_up_gate_proj_weights, + trtllm_prex + 'mlp.moe.fc.')) + ## MOE hardcoded routing_input into trt.float32, please refer to moe.py line 397 + moe_experts_gate_weights = get_weight(model_params, + prefix + 'mlp.gate', + torch.float32) + weights.update( + get_trtllm_linear_weight(moe_experts_gate_weights, + trtllm_prex + 'mlp.moe.router.')) + + if moe_config.num_shared_experts > 0: + ## mlp.shared_experts.gate_proj.weight + shared_moe_gate_proj_weights = get_weight( + model_params, prefix + 'mlp.shared_experts.gate_proj', + dtype) + split_v = split_matrix_tp(shared_moe_gate_proj_weights, + mapping.tp_size, + mapping.tp_rank, + dim=0) + weights.update( + get_trtllm_linear_weight( + split_v, trtllm_prex + 'mlp.shared_experts.fc.')) + # mlp.shared_experts.down_proj.weight + shared_moe_down_proj_weights = get_weight( + model_params, prefix + 'mlp.shared_experts.down_proj', + dtype) + split_v = split_matrix_tp(shared_moe_down_proj_weights, + mapping.tp_size, + mapping.tp_rank, + dim=1) + weights.update( + get_trtllm_linear_weight( + split_v, trtllm_prex + 'mlp.shared_experts.proj.')) + ## mlp.shared_experts.up_proj.weight + shared_moe_up_proj_weights = get_weight( + model_params, prefix + 
'mlp.shared_experts.up_proj', dtype) + split_v = split_matrix_tp(shared_moe_up_proj_weights, + mapping.tp_size, + mapping.tp_rank, + dim=0) + weights.update( + get_trtllm_linear_weight( + split_v, trtllm_prex + 'mlp.shared_experts.gate.')) + + else: + ## Current deepseek model has one MLP layer only, if it goes large consider to do fuse + mlp_gate_weight = get_weight(model_params, prefix + 'mlp.up_proj', + dtype) + split_gate = split_matrix_tp(mlp_gate_weight, + mapping.tp_size, + mapping.tp_rank, + dim=0) + weights.update( + get_trtllm_linear_weight(split_gate, trtllm_prex + 'mlp.gate.')) + + mlp_fc_weight = get_weight(model_params, prefix + 'mlp.gate_proj', + dtype) + split_fc = split_matrix_tp(mlp_fc_weight, + mapping.tp_size, + mapping.tp_rank, + dim=0) + weights.update( + get_trtllm_linear_weight(split_fc, trtllm_prex + 'mlp.fc.')) + + mlp_proj_weight = get_weight(model_params, prefix + 'mlp.down_proj', + dtype) + split_proj = split_matrix_tp(mlp_proj_weight, + mapping.tp_size, + mapping.tp_rank, + dim=1) + weights.update( + get_trtllm_linear_weight(split_proj, trtllm_prex + 'mlp.proj.')) + + # Layer norms do not use tensor parallelism + input_ln_weight = get_weight(model_params, prefix + 'input_layernorm', + dtype) + weights[trtllm_prex + 'input_layernorm.weight'] = input_ln_weight + post_ln_weight = get_weight(model_params, + prefix + 'post_attention_layernorm', dtype) + weights[trtllm_prex + 'post_layernorm.weight'] = post_ln_weight + + for l in layers_range: + convert_layer(l) + release_gc() + + v = get_weight(model_params, 'model.embed_tokens', dtype) + if hf_model.config.tie_word_embeddings: + # lm_head.weight has the same weights as embedding + if mapping.is_last_pp_rank(): + if config['vocab_size'] % mapping.tp_size != 0: + # padding + vocab_size_padded = pad_vocab_size(config['vocab_size'], + mapping.tp_size) + pad_width = vocab_size_padded - config['vocab_size'] + v = torch.nn.functional.pad(v, (0, 0, 0, pad_width), 'constant', + 0) + weights['lm_head.weight'] = split(v, mapping.tp_size, + mapping.tp_rank) + if use_parallel_embedding: + v = split_matrix_tp(v, + mapping.tp_size, + mapping.tp_rank, + dim=config.embedding_sharding_dim) + if mapping.is_first_pp_rank(): + weights['transformer.vocab_embedding.weight'] = v + lm_head_weights = get_weight(model_params, 'lm_head', dtype) + + if mapping.is_last_pp_rank(): + if config['vocab_size'] % mapping.tp_size != 0: + # padding + vocab_size_padded = pad_vocab_size(config['vocab_size'], + mapping.tp_size) + pad_width = vocab_size_padded - config['vocab_size'] + lm_head_weights = torch.nn.functional.pad(lm_head_weights, + (0, 0, 0, pad_width), + 'constant', + value=0) + weights['lm_head.weight'] = split_matrix_tp(lm_head_weights, + mapping.tp_size, + mapping.tp_rank, + dim=0) + ln_f_w = get_weight(model_params, 'model.norm', dtype) + weights['transformer.ln_f.weight'] = ln_f_w + tok = time.time() + t = time.strftime('%H:%M:%S', time.gmtime(tok - tik)) + print(f'Weights loaded. Total time: {t}') + #print(set(weights.keys())) + return weights diff --git a/tensorrt_llm/models/deepseek_v1/model.py b/tensorrt_llm/models/deepseek_v1/model.py new file mode 100644 index 000000000..ff6dcc18d --- /dev/null +++ b/tensorrt_llm/models/deepseek_v1/model.py @@ -0,0 +1,257 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Optional + +import torch + +from ..._utils import pad_vocab_size, torch_dtype_to_str +from ...functional import Tensor, non_gated_version, recv, send +from ...layers import (Attention, AttentionMaskType, ColumnLinear, Embedding, + GatedMLP, MoeConfig, PositionEmbeddingType, RmsNorm, + SharedMoE) +from ...mapping import Mapping +from ...module import Module +from ...plugin import init_all_reduce_helper +from ..modeling_utils import (DecoderLayerList, DecoderModelForCausalLM, + PretrainedConfig) +from .convert import convert_deepseek, create_trt_config_from_hf + + +class DeepseekDecoderLayer(Module): + + def __init__(self, config: PretrainedConfig, layer_idx: int): + super().__init__() + self.layer_idx = layer_idx + self.config = config + + ### Input layernorm in Deepseek v1 is same as Llama + self.input_layernorm = RmsNorm(normalized_shape=config.hidden_size, + eps=config.norm_epsilon, + dtype=config.dtype) + + layers_range = config.mapping.pp_layers(config.num_hidden_layers) + local_layer_idx = layer_idx - layers_range[0] + ### Deepseek v1 model with standard attention + self.attention = Attention( + local_layer_idx=local_layer_idx, + hidden_size=config.hidden_size, + attention_head_size=config.head_size, + num_attention_heads=config.num_attention_heads, + num_kv_heads=config.num_key_value_heads, + max_position_embeddings=config.max_position_embeddings, + dtype=config.dtype, + attention_mask_type=AttentionMaskType.causal, + bias=False, + position_embedding_type=PositionEmbeddingType.rope_gpt_neox, + rotary_embedding_base=config.rotary_base, + rotary_embedding_scaling=config.rotary_scaling, + tp_group=config.mapping.tp_group, + tp_size=config.mapping.tp_size, + tp_rank=config.mapping.tp_rank) + + ClsMLP = GatedMLP + + moe_config = MoeConfig(num_experts=config.moe_num_experts, + moe_intermediate_size=config.moe_inter_size, + num_shared_experts=config.moe_num_shared_experts, + top_k=config.moe_top_k, + normalization_mode=config.moe_renorm_mode) + + mlp_kwargs = {} + if config.moe_num_experts > 0 and layer_idx > 0: + mlp_hidden_size = moe_config.num_shared_experts * moe_config.moe_intermediate_size + hidden_act = config.hidden_act + ClsMLP = SharedMoE + mlp_kwargs = {"moe_config": moe_config, "mapping": config.mapping} + else: + ClsMLP = GatedMLP + mlp_hidden_size = config.intermediate_size + hidden_act = non_gated_version( + config.hidden_act) # back to non gated for dense layers + + self.mlp = ClsMLP(hidden_size=config.hidden_size, + ffn_hidden_size=mlp_hidden_size, + hidden_act=hidden_act, + dtype=config.dtype, + bias=False, + tp_group=config.mapping.tp_group, + tp_size=config.mapping.tp_size, + **mlp_kwargs) + + ### Pose layernorm in Deepseek v1 is same as Llama ) + self.post_layernorm = RmsNorm(normalized_shape=config.hidden_size, + eps=config.norm_epsilon, + dtype=config.dtype) + + def forward(self, + hidden_states, + attention_mask=None, + use_cache=False, + 
spec_decoding_params=None, + kv_cache_params=None, + attention_params=None): + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + + attention_output = self.attention( + hidden_states, + attention_mask=attention_mask, + use_cache=use_cache, + spec_decoding_params=spec_decoding_params, + kv_cache_params=kv_cache_params, + attention_params=attention_params) + if use_cache: + attention_output, presents = attention_output + + hidden_states = residual + attention_output + + residual = hidden_states + + hidden_states = self.post_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + if use_cache: + return (hidden_states, presents) + return hidden_states + + +class DeepseekModel(Module): + + def __init__(self, config: PretrainedConfig) -> None: + super().__init__() + init_all_reduce_helper() # enable use_customer_all_reduce + + self.mapping = config.mapping + if self.mapping.is_first_pp_rank(): + self.vocab_embedding = Embedding(config.vocab_size, + config.hidden_size, + dtype=config.dtype) + + self.layers = DecoderLayerList(DeepseekDecoderLayer, config) + + if self.mapping.is_last_pp_rank(): + self.ln_f = RmsNorm(normalized_shape=config.hidden_size, + eps=config.norm_epsilon, + dtype=config.dtype) + + def forward(self, + input_ids, + position_ids=None, + use_cache=False, + attention_mask=None, + spec_decoding_params=None, + kv_cache_params=None, + attention_params=None, + hidden_states=None, + prompt_embedding_table: Optional[Tensor] = None, + prompt_tasks: Optional[Tensor] = None, + prompt_vocab_size: Optional[Tensor] = None): + + ptuning_args = [ + prompt_embedding_table, prompt_tasks, prompt_vocab_size + ] if prompt_embedding_table is not None else [] + + if self.mapping.is_first_pp_rank(): + hidden_states = self.vocab_embedding(input_ids, *ptuning_args) + else: + hidden_states = recv(hidden_states, self.mapping.prev_pp_rank()) + + hidden_states = self.layers.forward( + hidden_states, + use_cache=use_cache, + attention_mask=attention_mask, + kv_cache_params=kv_cache_params, + attention_params=attention_params, + spec_decoding_params=spec_decoding_params) + + if use_cache: + hidden_states, presents = hidden_states + + if self.mapping.is_last_pp_rank(): + hidden_states = self.ln_f(hidden_states) + else: + hidden_states = send(hidden_states, self.mapping.next_pp_rank()) + + if use_cache: + return (hidden_states, tuple(presents)) + return hidden_states + + +class DeepseekForCausalLM(DecoderModelForCausalLM): + + def __init__(self, config: PretrainedConfig): + transformer = DeepseekModel(config) + vocab_size_padded = pad_vocab_size(config.vocab_size, + config.mapping.tp_size) + if config.mapping.is_last_pp_rank(): + lm_head = ColumnLinear(config.hidden_size, + vocab_size_padded, + bias=False, + dtype=config.dtype, + tp_group=config.mapping.tp_group, + tp_size=config.mapping.tp_size, + gather_output=True) + else: + lm_head = None + self.mapping = config.mapping + super().__init__(config, transformer, lm_head) + + @classmethod + def from_hugging_face(cls, + hf_model, + model_dir, + dtype: str = 'auto', + mapping: Optional[Mapping] = None, + override_fields={}, + **kwargs): + assert hf_model is not None + if mapping is None: + mapping = Mapping() + config = create_trt_config_from_hf(model_dir, + dtype, + mapping=mapping, + override_fields=override_fields) + print(config) + pretrained_config = PretrainedConfig.from_dict(config) + pretrained_config.set_rank(mapping.rank) # TODO:remove this hack + + if dtype == 'auto': 
+ dtype = getattr(config, 'torch_dtype', None) + if dtype is None: + dtype = 'float16' + if isinstance(dtype, torch.dtype): + dtype = torch_dtype_to_str(dtype) + if dtype == 'float32': # should remove "float32" + dtype = 'float16' + if dtype == 'bfloat16' and torch.cuda.get_device_properties( + 0).major < 8: + logger.warning( + "Pre SM 80 GPUs do not support bfloat16, fallback to float16") + dtype = 'float16' + + deepseek = cls.from_config(pretrained_config) + weights = convert_deepseek( + hf_model, + config, + mapping, + dtype=dtype, + use_parallel_embedding=config.get('use_parallel_embedding', False), + sharding_dim=config.get('embedding_sharding_dim', 0), + share_embedding_table=config.get('share_embedding_table', False)) + #check_share_embedding(weights, config) + deepseek.load(weights) + + return deepseek diff --git a/tensorrt_llm/models/enc_dec/model.py b/tensorrt_llm/models/enc_dec/model.py index 3f540e690..52a013d16 100644 --- a/tensorrt_llm/models/enc_dec/model.py +++ b/tensorrt_llm/models/enc_dec/model.py @@ -1160,12 +1160,16 @@ def forward(self, host_cross_kv_cache_block_offsets, host_kv_cache_pool_pointers=kv_cache_params. host_kv_cache_pool_pointers, + host_kv_cache_pool_mapping=kv_cache_params. + host_kv_cache_pool_mapping, cross_kv_cache_block_offsets=kv_cache_params. cross_kv_cache_block_offsets, host_cross_kv_cache_block_offsets=kv_cache_params. host_cross_kv_cache_block_offsets, host_cross_kv_cache_pool_pointers=kv_cache_params. - host_cross_kv_cache_pool_pointers), + host_cross_kv_cache_pool_pointers, + host_cross_kv_cache_pool_mapping=kv_cache_params. + host_cross_kv_cache_pool_mapping), attention_params=attention_params, lora_layer_params=lora_layer_params, cross_kv_cache_gen=cross_kv_cache_gen, @@ -1601,10 +1605,12 @@ def prepare_inputs(self, kv_cache_block_offsets = None host_kv_cache_block_offsets = None host_kv_cache_pool_pointers = None + host_kv_cache_pool_mapping = None cross_kv_cache_block_offsets = None host_cross_kv_cache_block_offsets = None host_cross_kv_cache_pool_pointers = None + host_cross_kv_cache_pool_mapping = None if use_cache: if not paged_kv_cache: @@ -1669,21 +1675,25 @@ def prepare_inputs(self, x for x in max_cross_blocks_per_seq_range[0] ]] - kv_cache_block_offsets = Tensor(name=f'kv_cache_block_offsets', - dtype=trt.int32, - shape=[-1, 2, -1], - dim_range=OrderedDict([ - ('batch_size_beam_width', - [bb_range]), - ('kv', [2]), - ('max_blocks_per_seq', - max_blocks_per_seq_range), - ])) + # TODO(oargov): add support for vgqa, meanwhile assume a single kv cache pool + num_kv_cache_pools = 1 + + kv_cache_block_offsets = Tensor( + name=f'kv_cache_block_offsets', + dtype=trt.int32, + shape=[num_kv_cache_pools, -1, 2, -1], + dim_range=OrderedDict([ + ('num_kv_cache_pools', [num_kv_cache_pools]), + ('batch_size_beam_width', [bb_range]), + ('kv', [2]), + ('max_blocks_per_seq', max_blocks_per_seq_range), + ])) host_kv_cache_block_offsets = Tensor( name=f'host_kv_cache_block_offsets', dtype=trt.int32, - shape=[-1, 2, -1], + shape=[num_kv_cache_pools, -1, 2, -1], dim_range=OrderedDict([ + ('num_kv_cache_pools', [num_kv_cache_pools]), ('batch_size_beam_width', [bb_range]), ('kv', [2]), ('max_blocks_per_seq', max_blocks_per_seq_range), @@ -1691,17 +1701,26 @@ def prepare_inputs(self, host_kv_cache_pool_pointers = Tensor( name=f'host_kv_cache_pool_pointers', dtype=trt.int64, - shape=[2], + shape=[num_kv_cache_pools, 2], dim_range=OrderedDict([ - ('num_pools', [2]), + ('num_pools_layers', [num_kv_cache_pools]), + ('num_pools_kv', [2]), + ])) + 
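As an editor's aside, here is a minimal, self-contained sketch of the paged KV-cache tensor layout this hunk moves to, assuming the single-pool case the TODO above describes; every size below is invented purely for illustration and nothing here is taken verbatim from the patch.

import torch

# Hypothetical sizes, chosen only to make the shapes concrete.
num_kv_cache_pools = 1        # enc_dec currently assumes one pool (see TODO above)
batch_size_beam_width = 4
max_blocks_per_seq = 16
num_local_attn_layers = 6     # attention layers owned by this pipeline rank

# Block-offset tensors gain a leading pool axis:
# old shape [-1, 2, -1]  ->  new shape [num_kv_cache_pools, -1, 2, -1].
kv_cache_block_offsets = torch.zeros(
    (num_kv_cache_pools, batch_size_beam_width, 2, max_blocks_per_seq),
    dtype=torch.int32)

# host_kv_cache_pool_mapping holds, per local attention layer, the index of the
# pool that stores that layer's KV blocks; with a single pool it is all zeros.
host_kv_cache_pool_mapping = torch.zeros(num_local_attn_layers, dtype=torch.int32)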
host_kv_cache_pool_mapping = Tensor( + name=f"host_kv_cache_pool_mapping", + dtype=trt.int32, + shape=[num_pp_layers], + dim_range=OrderedDict([ + ('pools_mapping', [num_pp_layers]), ])) # paged blocks for cross kv cross_kv_cache_block_offsets = Tensor( name=f'cross_kv_cache_block_offsets', dtype=trt.int32, - shape=[-1, 2, -1], + shape=[num_kv_cache_pools, -1, 2, -1], dim_range=OrderedDict([ + ('num_kv_cache_pools', [num_kv_cache_pools]), ('batch_size_beam_width', [bb_range]), ('kv', [2]), ('max_cross_blocks_per_seq', @@ -1710,8 +1729,9 @@ def prepare_inputs(self, host_cross_kv_cache_block_offsets = Tensor( name=f'host_cross_kv_cache_block_offsets', dtype=trt.int32, - shape=[-1, 2, -1], + shape=[num_kv_cache_pools, -1, 2, -1], dim_range=OrderedDict([ + ('num_kv_cache_pools', [num_kv_cache_pools]), ('batch_size_beam_width', [bb_range]), ('kv', [2]), ('max_cross_blocks_per_seq', @@ -1720,10 +1740,18 @@ def prepare_inputs(self, host_cross_kv_cache_pool_pointers = Tensor( name=f'host_cross_kv_cache_pool_pointers', dtype=trt.int64, - shape=[2], + shape=[num_kv_cache_pools, 2], dim_range=OrderedDict([ + ('num_kv_cache_pools', [num_kv_cache_pools]), ('num_pools', [2]), ])) + host_cross_kv_cache_pool_mapping = Tensor( + name=f"host_cross_kv_cache_pool_mapping", + dtype=trt.int32, + shape=[num_pp_layers], + dim_range=OrderedDict([ + ('pools_mapping', [num_pp_layers]), + ])) for i in layers_range: past_key_value.append(None) @@ -1737,11 +1765,14 @@ def prepare_inputs(self, kv_cache_block_offsets=kv_cache_block_offsets, host_kv_cache_block_offsets=host_kv_cache_block_offsets, host_kv_cache_pool_pointers=host_kv_cache_pool_pointers, + host_kv_cache_pool_mapping=host_kv_cache_pool_mapping, cross_kv_cache_block_offsets=cross_kv_cache_block_offsets, host_cross_kv_cache_block_offsets= host_cross_kv_cache_block_offsets, host_cross_kv_cache_pool_pointers= host_cross_kv_cache_pool_pointers, + host_cross_kv_cache_pool_mapping= + host_cross_kv_cache_pool_mapping, ) attention_params = AttentionParams( diff --git a/tensorrt_llm/models/falcon/model.py b/tensorrt_llm/models/falcon/model.py index a8f6458da..627335eeb 100644 --- a/tensorrt_llm/models/falcon/model.py +++ b/tensorrt_llm/models/falcon/model.py @@ -65,8 +65,7 @@ def __init__(self, config: FalconConfig, layer_idx: int): tp_rank=tp_rank, bias=config.bias, position_embedding_type=config.position_embedding_type, - quant_mode=config.quantization.quant_mode, - ) + quant_mode=config.quantization.quant_mode) mlp_hidden_size = hidden_size * 4 if config.intermediate_size is None else config.intermediate_size diff --git a/tensorrt_llm/models/gemma/model.py b/tensorrt_llm/models/gemma/model.py index f34ad71d1..4024b2e88 100644 --- a/tensorrt_llm/models/gemma/model.py +++ b/tensorrt_llm/models/gemma/model.py @@ -78,8 +78,7 @@ def __init__(self, config: GemmaConfig, layer_idx: int): tp_size=config.mapping.tp_size, quant_mode=config.quant_mode, q_scaling=q_scaling, - max_attn_value=max_attn_value, - ) + max_attn_value=max_attn_value) mlp_hidden_size = config.hidden_size * 4 if config.intermediate_size is None else config.intermediate_size diff --git a/tensorrt_llm/models/gemma/smoothquant.py b/tensorrt_llm/models/gemma/smoothquant.py index 640ff7ed9..2e1dd7aa2 100644 --- a/tensorrt_llm/models/gemma/smoothquant.py +++ b/tensorrt_llm/models/gemma/smoothquant.py @@ -27,6 +27,7 @@ from transformers import LlamaConfig, LlamaForCausalLM from transformers.models.llama.modeling_llama import (LlamaAttention, LlamaDecoderLayer, + LlamaRotaryEmbedding, apply_rotary_pos_emb, 
repeat_kv) from transformers.pytorch_utils import Conv1D @@ -380,7 +381,8 @@ def __init__(self, *args, **kwargs): self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False) - self._init_rope() + self.config.head_dim = self.head_dim + self.rotary_emb = LlamaRotaryEmbedding(config=self.config) def forward( self, diff --git a/tensorrt_llm/models/generation_mixin.py b/tensorrt_llm/models/generation_mixin.py index cb12289f8..92361e705 100644 --- a/tensorrt_llm/models/generation_mixin.py +++ b/tensorrt_llm/models/generation_mixin.py @@ -14,7 +14,7 @@ # limitations under the License. import math from collections import OrderedDict -from typing import List +from typing import List, Optional import tensorrt as trt @@ -166,28 +166,34 @@ def get_profiles_ranges( } return num_profiles, ranges - def prepare_attention_inputs(self, - *, - max_batch_size, - max_beam_width, - max_input_len, - max_seq_len, - num_kv_heads, - head_size, - num_layers, - kv_dtype, - kv_cache_type: KVCacheType, - num_profiles=1, - enable_ctx_gen_opt_profiles=False, - remove_input_padding=False, - use_gpt_attention_plugin=False, - tokens_per_block=64, - mapping=Mapping(), - streamingllm=False, - attn_layer_idx=None, - opt_batch_size=None, - num_kv_heads_per_layer=None): - + def prepare_attention_inputs( + self, + *, + max_batch_size, + max_beam_width, + max_input_len, + max_seq_len, + num_kv_heads, + head_size, + num_layers, + kv_dtype, + kv_cache_type: KVCacheType, + num_profiles=1, + enable_ctx_gen_opt_profiles=False, + remove_input_padding=False, + use_gpt_attention_plugin=False, + tokens_per_block=64, + mapping=Mapping(), + streamingllm=False, + attn_layer_idx=None, + opt_batch_size=None, + num_kv_heads_per_layer: Optional[List[int]] = None): + + if attn_layer_idx is not None and num_kv_heads_per_layer is not None: + assert len(attn_layer_idx) == len(num_kv_heads_per_layer), ( + f"Expected len(attn_layer_idx) ({len(attn_layer_idx)})" + f" == len(num_kv_heads_per_layer) ({len(num_kv_heads_per_layer)})" + ) default_range = GenerationMixin.default_range if opt_batch_size: @@ -245,23 +251,40 @@ def prepare_attention_inputs(self, max_len_range = [_max_len_range] * num_profiles num_kv_heads = (num_kv_heads + mapping.tp_size - 1) // mapping.tp_size + if num_kv_heads_per_layer is not None: + num_kv_heads_per_layer = [ + (nheads + mapping.tp_size - 1) // mapping.tp_size + for nheads in num_kv_heads_per_layer + ] + layers_range = mapping.pp_layers(num_layers) - num_pp_layers = len(layers_range) if attn_layer_idx is None: attn_layer_idx = [i for i in range(num_layers)] + # layer indices of attention layers local to the current pp rank + local_attn_layers = [i for i in layers_range if i in attn_layer_idx] + # number of attention layers local to previous pp ranks + num_attn_layers_lower_ranks = attn_layer_idx.index(local_attn_layers[0]) past_key_value = [] kv_cache_block_offsets = None host_kv_cache_block_offsets = None host_kv_cache_pool_pointers = None + host_kv_cache_pool_mapping = None if kv_cache_type == KVCacheType.DISABLED: for i in layers_range: past_key_value.append(None) else: if kv_cache_type != KVCacheType.PAGED: - for i in layers_range: + for layer_idx in layers_range: + if layer_idx not in local_attn_layers: + # not an attention layer ==> give it None pkv input + past_key_value.append(None) + continue + + attn_idx = local_attn_layers.index(layer_idx) if num_kv_heads_per_layer is not None: - heads_dim_name = f"num_heads_{attn_layer_idx[i]}" - kv_heads = num_kv_heads_per_layer[i] + heads_dim_name = 
f"num_heads_{layer_idx}" + kv_heads = num_kv_heads_per_layer[ + num_attn_layers_lower_ranks + attn_idx] else: heads_dim_name = "num_heads" kv_heads = num_kv_heads @@ -274,7 +297,7 @@ def prepare_attention_inputs(self, ('head_size', [head_size] * num_profiles), ]) - kv = Tensor(name=f'past_key_value_{attn_layer_idx[i]}', + kv = Tensor(name=f'past_key_value_{layer_idx}', dtype=kv_dtype, shape=[-1, 2, kv_heads, -1, head_size], dim_range=kv_dim_range) @@ -300,21 +323,28 @@ def prepare_attention_inputs(self, math.ceil(kv_cache_range[0][2] / tokens_per_block) ]] * num_profiles - kv_cache_block_offsets = Tensor(name=f'kv_cache_block_offsets', - dtype=trt.int32, - shape=[-1, 2, -1], - dim_range=OrderedDict([ - ('batch_size_beam_width', - bb_range), - ('kv', [2] * num_profiles), - ('max_blocks_per_seq', - max_blocks_per_seq_range), - ])) + num_kv_cache_pools = 1 if num_kv_heads_per_layer is None else len( + set(num_kv_heads_per_layer[num_attn_layers_lower_ranks: + num_attn_layers_lower_ranks + + len(local_attn_layers)])) + kv_cache_block_offsets = Tensor( + name=f'kv_cache_block_offsets', + dtype=trt.int32, + shape=[num_kv_cache_pools, -1, 2, -1], + dim_range=OrderedDict([ + ('num_kv_cache_pools', + [num_kv_cache_pools] * num_profiles), + ('batch_size_beam_width', bb_range), + ('kv', [2] * num_profiles), + ('max_blocks_per_seq', max_blocks_per_seq_range), + ])) host_kv_cache_block_offsets = Tensor( name=f'host_kv_cache_block_offsets', dtype=trt.int32, - shape=[-1, 2, -1], + shape=[num_kv_cache_pools, -1, 2, -1], dim_range=OrderedDict([ + ('num_kv_cache_pools', + [num_kv_cache_pools] * num_profiles), ('batch_size_beam_width', bb_range), ('kv', [2] * num_profiles), ('max_blocks_per_seq', max_blocks_per_seq_range), @@ -322,9 +352,20 @@ def prepare_attention_inputs(self, host_kv_cache_pool_pointers = Tensor( name=f'host_kv_cache_pool_pointers', dtype=trt.int64, - shape=[2], + shape=[num_kv_cache_pools, 2], + dim_range=OrderedDict([ + ('num_pools_layers', + [num_kv_cache_pools] * num_profiles), + ('num_pools_kv', [2] * num_profiles), + ])) + + host_kv_cache_pool_mapping = Tensor( + name=f'host_kv_cache_pool_mapping', + dtype=trt.int32, + shape=[len(local_attn_layers)], dim_range=OrderedDict([ - ('num_pools', [2] * num_profiles), + ('pools_mapping', + [len(local_attn_layers)] * num_profiles), ])) for i in layers_range: @@ -403,9 +444,10 @@ def prepare_attention_inputs(self, host_max_attention_window_sizes = Tensor( name=f'host_max_attention_window_sizes', dtype=trt.int32, - shape=[num_pp_layers], - dim_range=OrderedDict([('num_layers', - [num_pp_layers] * num_profiles)])) + shape=[len(local_attn_layers)], + dim_range=OrderedDict([ + ('num_layers', [len(local_attn_layers)] * num_profiles) + ])) host_sink_token_length = Tensor(name='host_sink_token_length', dtype=trt.int32, @@ -437,6 +479,7 @@ def prepare_attention_inputs(self, 'kv_cache_block_offsets': kv_cache_block_offsets, 'host_kv_cache_block_offsets': host_kv_cache_block_offsets, 'host_kv_cache_pool_pointers': host_kv_cache_pool_pointers, + 'host_kv_cache_pool_mapping': host_kv_cache_pool_mapping, 'context_lengths': context_lengths, 'host_context_lengths': host_context_lengths, 'host_request_types': host_request_types, diff --git a/tensorrt_llm/models/gpt/config.py b/tensorrt_llm/models/gpt/config.py index ba34ae255..01e1ac257 100644 --- a/tensorrt_llm/models/gpt/config.py +++ b/tensorrt_llm/models/gpt/config.py @@ -135,6 +135,11 @@ def from_hugging_face( hf_config.rotary_base = hf_config.rope_theta hf_config.rotary_pct = getattr(hf_config, 
'partial_rotary_factor', 1.0) + try: + # only for persimmon, not starcoder2 + hf_config.vocab_size = hf_config.text_config.vocab_size + except AttributeError: + pass elif gpt_variant == "kosmos-2": hf_config.n_embd = hf_config.text_config.embed_dim hf_config.n_inner = hf_config.text_config.ffn_dim diff --git a/tensorrt_llm/models/gpt/model.py b/tensorrt_llm/models/gpt/model.py index 7e40e5872..bde4dc991 100644 --- a/tensorrt_llm/models/gpt/model.py +++ b/tensorrt_llm/models/gpt/model.py @@ -25,6 +25,8 @@ from ...mapping import Mapping from ...module import Module from ...quantization import QuantMode +from ...quantization.functional import quantize_fp8_per_token +from ...quantization.layers import Fp8RowwiseMLP from ..modeling_utils import (DecoderLayerList, DecoderModelForCausalLM, QuantConfig, check_share_embedding) from .config import GPTConfig @@ -171,6 +173,10 @@ def forward(self, residual = hidden_states hidden_states = self.post_layernorm(hidden_states) + # Quantize per-token for fp8 + if isinstance(self.mlp, Fp8RowwiseMLP): + hidden_states = quantize_fp8_per_token(hidden_states) + hidden_states = self.mlp(hidden_states, lora_layer_params=lora_layer_params) diff --git a/tensorrt_llm/models/grok/convert.py b/tensorrt_llm/models/grok/convert.py index 527782e3c..219233cef 100644 --- a/tensorrt_llm/models/grok/convert.py +++ b/tensorrt_llm/models/grok/convert.py @@ -504,8 +504,10 @@ def load_weights_from_xai(*, config, mapping, model): assert quant_algo == QuantAlgo.W8A16 plugin_weight_only_quant_type = torch.int8 - moe_config = MoeConfig(config['moe_num_experts'], config['moe_top_k'], - config['moe_normalization_mode']).validate() + moe_config = MoeConfig( + num_experts=config['moe_num_experts'], + top_k=config['moe_top_k'], + normalization_mode=config['moe_normalization_mode']).validate() use_weight_only = quant_algo in [QuantAlgo.W8A16] diff --git a/tensorrt_llm/models/grok/model.py b/tensorrt_llm/models/grok/model.py index 7b77873d7..8fc34349f 100644 --- a/tensorrt_llm/models/grok/model.py +++ b/tensorrt_llm/models/grok/model.py @@ -68,8 +68,10 @@ def __init__(self, config: PretrainedConfig, layer_idx: int): mlp_kwargs = {} assert config.moe_num_experts > 1, "Grok model is a MoE model." 
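To make the pool bookkeeping added to prepare_attention_inputs above easier to follow, here is a hedged, stand-alone sketch of the same arithmetic; the variable names mirror the patch, but every concrete number is invented for illustration.

attn_layer_idx = [0, 1, 2, 4, 5, 6, 7]            # layer 3 is not an attention layer
layers_range = list(range(4, 8))                  # layers owned by this pipeline rank
num_kv_heads_per_layer = [8, 8, 8, 8, 4, 4, 4]    # one entry per attention layer

# Attention layers local to this rank, and how many attention layers sit on lower ranks.
local_attn_layers = [i for i in layers_range if i in attn_layer_idx]      # [4, 5, 6, 7]
num_attn_layers_lower_ranks = attn_layer_idx.index(local_attn_layers[0])  # 3

# One KV-cache pool per distinct local KV-head count (variable-GQA support).
local_kv_heads = num_kv_heads_per_layer[
    num_attn_layers_lower_ranks:num_attn_layers_lower_ranks + len(local_attn_layers)]
num_kv_cache_pools = len(set(local_kv_heads))     # {8, 4} -> 2 pools in this toy case

assert len(local_kv_heads) == len(local_attn_layers)
assert num_kv_cache_pools == 2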
ClsMLP = MOE - moe_config = MoeConfig(config.moe_num_experts, config.moe_top_k, - config.moe_normalization_mode).validate() + moe_config = MoeConfig( + num_experts=config.moe_num_experts, + top_k=config.moe_top_k, + normalization_mode=config.moe_normalization_mode).validate() mlp_kwargs = { "moe_config": moe_config, "mapping": config.mapping, diff --git a/tensorrt_llm/models/llama/convert.py b/tensorrt_llm/models/llama/convert.py index cd9bbc63f..7c9077081 100644 --- a/tensorrt_llm/models/llama/convert.py +++ b/tensorrt_llm/models/llama/convert.py @@ -1085,7 +1085,9 @@ def quantize(hf_model_dir: str, config: LLaMAConfig, device: str = 'cuda', calib_dataset: str = 'cnn_dailymail', - trust_remote_code: bool = True): + trust_remote_code: bool = True, + calib_batches: int = 512, + calib_max_seq_length: int = 512): ''' Quantize the save the model as TRT-LLM checkpoint to output_dir ''' @@ -1121,7 +1123,14 @@ def quantize(hf_model_dir: str, dataset = load_calib_dataset(calib_dataset) - act_range = capture_activation_range(hf_model, tokenizer, dataset) + if calib_batches == -1: # use the whole dataset if calib_batches is -1 + calib_batches = len(dataset) + + act_range = capture_activation_range(hf_model, + tokenizer, + dataset, + num_samples=calib_batches, + seq_len=calib_max_seq_length) qkv_para, smoother = {}, {} if use_smooth_quant: smooth_llama_model(hf_model, act_range, quant_config.smoothquant_val, @@ -1548,11 +1557,15 @@ def load(key, res = tensor_slice[:] elif tp_dim >= 0 and tp_dim < len(tensor_shape): if is_expert_weights: - tp_size = tp_size or mapping.moe_tp_size - tp_rank = tp_rank or mapping.moe_tp_rank + if tp_size is None: + tp_size = mapping.moe_tp_size + if tp_rank is None: + tp_rank = mapping.moe_tp_rank else: - tp_size = tp_size or mapping.tp_size - tp_rank = tp_rank or mapping.tp_rank + if tp_size is None: + tp_size = mapping.tp_size + if tp_rank is None: + tp_rank = mapping.tp_rank dim_size = tensor_shape[tp_dim] if dim_size % tp_size != 0: logger.error( diff --git a/tensorrt_llm/models/llama/model.py b/tensorrt_llm/models/llama/model.py index 534c3bb4b..aabcc5265 100644 --- a/tensorrt_llm/models/llama/model.py +++ b/tensorrt_llm/models/llama/model.py @@ -297,7 +297,9 @@ def from_hugging_face( load_by_shard = kwargs.pop('load_by_shard', False) load_model_on_cpu = kwargs.pop('load_model_on_cpu', False) quant_ckpt_path = kwargs.pop('quant_ckpt_path', None) - if os.environ.get("TRTLLM_DISABLE_UNIFIED_CONVERTER") is not None: + if os.environ.get("TRTLLM_DISABLE_UNIFIED_CONVERTER" + ) is not None and not isinstance( + hf_model_or_dir, transformers.PreTrainedModel): if "vila" in hf_model_or_dir or "llava" in hf_model_or_dir: hf_model_or_dir = load_hf_llama(hf_model_or_dir, load_model_on_cpu) @@ -326,14 +328,15 @@ def from_hugging_face( config.num_key_value_heads = config.num_key_value_heads // 2 if os.environ.get("TRTLLM_DISABLE_UNIFIED_CONVERTER") is None: custom_dict = {} - if "llava" in hf_model_or_dir: + model_name = hf_model.config.model_type if use_preloading else hf_model_or_dir + if "llava" in model_name: custom_dict = { "transformer": "language_model.model", "lm_head": "language_model.lm_head" } - elif "vila" in hf_model_or_dir: + elif "vila" in model_name: hf_model_dir += "/llm" - elif "exaone" in hf_model_or_dir: + elif "exaone" in model_name: custom_dict = { "transformer": "transformer", "layers": "h", @@ -352,8 +355,7 @@ def from_hugging_face( hf_model_dir = quant_ckpt_path loader = ModelWeightsLoader(hf_model_dir, custom_dict) - if config.share_embedding_table: - 
config.share_embedding_table = loader.check_share_embedding() + loader.check_share_embedding(config) model = cls(config) loader.generate_tllm_weights(model) else: @@ -449,7 +451,9 @@ def quantize( config=config, device=device, calib_dataset=calib_dataset, - trust_remote_code=trust_remote_code) + trust_remote_code=trust_remote_code, + calib_batches=calib_batches, + calib_max_seq_length=calib_max_seq_length) else: raise ValueError( f"The quant_config ({quant_config}) does not require calibration, try {cls.__name__}.from_hugging_face instead." diff --git a/tensorrt_llm/models/mamba/config.py b/tensorrt_llm/models/mamba/config.py new file mode 100644 index 000000000..13e54020f --- /dev/null +++ b/tensorrt_llm/models/mamba/config.py @@ -0,0 +1,340 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import json +import os +from enum import Enum +from typing import List, Optional, Union + +import torch +import transformers + +from ..._utils import torch_dtype_to_str +from ...logger import logger +from ...mapping import Mapping +from ..modeling_utils import PretrainedConfig, QuantConfig + + +class CheckpointType(str, Enum): + mistral_inference = "mistral_inference" + state_spaces = "state_spaces" + hf = "hf" + + +def get_ckpt_type(model_path): + hf_config = transformers.AutoConfig.from_pretrained(model_path, + trust_remote_code=True) + if hasattr(hf_config, "ssm_cfg") and hf_config.ssm_cfg: + return CheckpointType.state_spaces + if os.path.exists(os.path.join(model_path, "params.json")): + return CheckpointType.mistral_inference + return CheckpointType.hf + + +class MambaConfig(PretrainedConfig): + + def __init__(self, + *, + residual_in_fp32: bool = True, + pad_vocab_size_multiple: int = -1, + layer_types: List[str] = ["recurrent"], + **kwargs): + self.residual_in_fp32 = residual_in_fp32 + self.pad_vocab_size_multiple = pad_vocab_size_multiple + self.layer_types = layer_types + super().__init__(**kwargs) + + def to_dict(self): + output = super().to_dict() + # Serialize the fields added in MambaConfig + + return output + + def update(self, data_dict): + self.__dict__.update(data_dict) + + @classmethod + def from_hugging_face( + cls, + hf_config_or_dir: Union[str, 'transformers.PretrainedConfig'], + dtype: str = 'auto', + mapping: Optional[Mapping] = None, + quant_config: Optional[QuantConfig] = None, + **kwargs): + import transformers + + ckpt_type = get_ckpt_type(hf_config_or_dir) + + mamba_version = 'Mamba1' + if ckpt_type == CheckpointType.hf: + if isinstance(hf_config_or_dir, transformers.PretrainedConfig): + hf_config = hf_config_or_dir + else: + hf_config_dir = str(hf_config_or_dir) + + hf_config = transformers.AutoConfig.from_pretrained( + hf_config_dir, trust_remote_code=True) + + if dtype == 'auto': + dtype = getattr(hf_config, 'torch_dtype', None) + if dtype is None: + dtype = 'float16' + if isinstance(dtype, torch.dtype): + dtype = 
torch_dtype_to_str(dtype) + if dtype == 'float32': + dtype = 'float16' + if dtype == 'bfloat16' and torch.cuda.get_device_properties( + 0).major < 8: + logger.warning( + "Pre SM 80 GPUs do not support bfloat16, fallback to float16" + ) + dtype = 'float16' + + vocab_size = hf_config.vocab_size + pad_vocab_size_multiple = getattr(hf_config, + "pad_vocab_size_multiple", 1) + if vocab_size % pad_vocab_size_multiple != 0: + vocab_size += pad_vocab_size_multiple - ( + vocab_size % pad_vocab_size_multiple) + return cls(architecture="MambaForCausalLM", + dtype=dtype, + num_hidden_layers=hf_config.num_hidden_layers, + num_attention_heads=mapping.world_size, + hidden_size=hf_config.hidden_size, + intermediate_size=hf_config.intermediate_size, + vocab_size=vocab_size, + mamba_version=mamba_version, + hidden_act=hf_config.hidden_act, + rms_norm=hf_config.rms_norm, + residual_in_fp32=hf_config.residual_in_fp32, + pad_vocab_size_multiple=pad_vocab_size_multiple, + rnn_hidden_size=hf_config.intermediate_size, + rnn_conv_dim_size=hf_config.intermediate_size, + state_size=hf_config.state_size, + conv_kernel=hf_config.conv_kernel, + use_bias=hf_config.use_bias, + mapping=mapping, + quantization=quant_config, + **kwargs) + elif ckpt_type == CheckpointType.state_spaces: + + mamba_version = 'Mamba2' + if isinstance(hf_config_or_dir, transformers.PretrainedConfig): + hf_config = hf_config_or_dir + else: + hf_config_dir = str(hf_config_or_dir) + + hf_config = transformers.AutoConfig.from_pretrained( + hf_config_dir, trust_remote_code=True) + if dtype == 'auto': + dtype = getattr(hf_config, 'torch_dtype', None) + if dtype is None: + dtype = 'float16' + if isinstance(dtype, torch.dtype): + dtype = torch_dtype_to_str(dtype) + if dtype == 'float32': + dtype = 'float16' + if dtype == 'bfloat16' and torch.cuda.get_device_properties( + 0).major < 8: + logger.warning( + "Pre SM 80 GPUs do not support bfloat16, fallback to float16" + ) + dtype = 'float16' + + vocab_size = hf_config.vocab_size + pad_vocab_size_multiple = getattr(hf_config, + "pad_vocab_size_multiple", 1) + if vocab_size % pad_vocab_size_multiple != 0: + vocab_size += pad_vocab_size_multiple - ( + vocab_size % pad_vocab_size_multiple) + assert hasattr(hf_config, + 'ssm_cfg') and hf_config.ssm_cfg['layer'] == 'Mamba2' + config = json.load( + open(os.path.join(hf_config_or_dir, 'config.json'))) + ssm_cfg = config.pop('ssm_cfg') + cfg_to_mamba_cfg = { + 'd_model': 'hidden_size', + 'n_layer': 'num_hidden_layers', + 'fused_add_norm': None, + 'tie_embeddings': None, + } + ssm_cfg_to_mamba_cfg = { + 'd_state': 'state_size', + 'd_conv': 'conv_kernel', + 'bias': 'use_bias', + 'headdim': 'head_dim', + 'ngroups': 'n_groups', + 'chunk_size': 'chunk_size', + 'rmsnorm': 'ssm_rmsnorm', + } + for k in cfg_to_mamba_cfg: + if k in config: + v = config.pop(k) + if cfg_to_mamba_cfg[k] is not None: + config[cfg_to_mamba_cfg[k]] = v + for k in ssm_cfg_to_mamba_cfg: + if k in ssm_cfg and ssm_cfg_to_mamba_cfg[k] is not None: + config[ssm_cfg_to_mamba_cfg[k]] = ssm_cfg[k] + + if 'expand' in config: + expand = config['expand'] + hf_config.intermediate_size = expand * config['hidden_size'] + else: + hf_config.intermediate_size = 2 * config['hidden_size'] + mamba2_default_cfg = { + 'n_groups': 1, + 'hidden_size': hf_config.d_model, + 'head_dim': 64, + 'chunk_size': 256, + 'state_size': 128, + } + hf_config.update(mamba2_default_cfg) + + conv_dim = hf_config.intermediate_size + 2 * hf_config.n_groups * hf_config.state_size + ssm_rmsnorm = getattr(hf_config, "ssm_rmsnorm", 
hf_config.rms_norm) + mamba2_cfg = { + 'rnn_head_size': hf_config.head_dim, + 'rnn_conv_dim_size': conv_dim, + 'ngroups': hf_config.n_groups, + 'chunk_size': hf_config.chunk_size, + 'ssm_rmsnorm': ssm_rmsnorm, + } + hf_config.update(mamba2_cfg) + + return cls(architecture="MambaForCausalLM", + dtype=dtype, + num_hidden_layers=hf_config.n_layer, + num_attention_heads=mapping.world_size + if mapping is not None else 1, + hidden_size=hf_config.hidden_size, + intermediate_size=hf_config.intermediate_size, + vocab_size=vocab_size, + mamba_version=mamba_version, + hidden_act=hf_config.hidden_act, + rms_norm=hf_config.rms_norm, + residual_in_fp32=hf_config.residual_in_fp32, + pad_vocab_size_multiple=pad_vocab_size_multiple, + rnn_hidden_size=hf_config.intermediate_size, + rnn_conv_dim_size=hf_config.rnn_conv_dim_size, + state_size=hf_config.state_size, + conv_kernel=hf_config.conv_kernel, + use_bias=hf_config.use_bias, + mapping=mapping, + quantization=quant_config, + rnn_head_size=hf_config.rnn_head_size, + ngroups=hf_config.ngroups, + chunk_size=hf_config.chunk_size, + ssm_rmsnorm=hf_config.ssm_rmsnorm, + **kwargs) + elif ckpt_type == CheckpointType.mistral_inference: + mamba_version = 'Mamba2' + + config = json.load( + open(os.path.join(hf_config_or_dir, 'params.json'))) + cfg_to_mamba_cfg = { + 'dim': 'hidden_size', + 'n_layers': 'num_hidden_layers', + 'n_groups': 'n_groups', + 'fused_add_norm': None, + 'tie_embeddings': None, + 'model_type': None, + } + for k in cfg_to_mamba_cfg: + if k in config: + v = config.pop(k) + if cfg_to_mamba_cfg[k] is not None: + config[cfg_to_mamba_cfg[k]] = v + + config['architecture'] = 'MambaForCuasualLM' + config['dtype'] = dtype + config['num_attention_heads'] = mapping.world_size + + hf_config = MambaConfig(**config) + mamba2_default_cfg = { + 'n_groups': 8, + 'hidden_size': 4096, + 'head_dim': 64, + 'chunk_size': 256, + 'state_size': 128, + 'conv_kernel': 4, + 'use_bias': False + } + + hf_config.update(mamba2_default_cfg) + conv_dim = hf_config.intermediate_size + 2 * hf_config.n_groups * hf_config.state_size + ssm_rmsnorm = getattr(hf_config, "ssm_rmsnorm", hf_config.rms_norm) + mamba2_cfg = { + 'rnn_head_size': hf_config.head_dim, + 'rnn_conv_dim_size': conv_dim, + 'ngroups': hf_config.n_groups, + 'chunk_size': hf_config.chunk_size, + 'ssm_rmsnorm': ssm_rmsnorm, + } + hf_config.update(mamba2_cfg) + + if 'expand' in config: + expand = config['expand'] + hf_config.intermediate_size = expand * hf_config.hidden_size + else: + hf_config.intermediate_size = 2 * hf_config.hidden_size + vocab_size = hf_config.vocab_size + pad_vocab_size_multiple = getattr(hf_config, + "pad_vocab_size_multiple", 1) + if vocab_size % pad_vocab_size_multiple != 0: + vocab_size += pad_vocab_size_multiple - ( + vocab_size % pad_vocab_size_multiple) + + return cls( + architecture="MambaForCausalLM", + dtype=dtype, + num_hidden_layers=hf_config.num_hidden_layers, + num_attention_heads=mapping.world_size, + hidden_size=hf_config.hidden_size, + intermediate_size=hf_config.intermediate_size, + # num_key_value_heads=num_key_value_heads, + vocab_size=vocab_size, + mamba_version=mamba_version, + hidden_act=hf_config.hidden_act, + rms_norm=hf_config.rms_norm, + residual_in_fp32=hf_config.residual_in_fp32, + pad_vocab_size_multiple=pad_vocab_size_multiple, + rnn_hidden_size=hf_config.intermediate_size, + rnn_conv_dim_size=hf_config.rnn_conv_dim_size, + state_size=hf_config.state_size, + conv_kernel=hf_config.conv_kernel, + use_bias=hf_config.use_bias, + mapping=mapping, + 
quantization=quant_config, + rnn_head_size=hf_config.rnn_head_size, + ngroups=hf_config.n_groups, + chunk_size=hf_config.chunk_size, + ssm_rmsnorm=hf_config.ssm_rmsnorm, + **kwargs) + else: + pass + + if isinstance(hf_config_or_dir, transformers.PretrainedConfig): + hf_config = hf_config_or_dir + else: + hf_config_dir = str(hf_config_or_dir) + + hf_config = transformers.AutoConfig.from_pretrained( + hf_config_dir, trust_remote_code=True) + + vocab_size = hf_config.vocab_size + pad_vocab_size_multiple = getattr(hf_config, "pad_vocab_size_multiple", + 1) + if vocab_size % pad_vocab_size_multiple != 0: + vocab_size += pad_vocab_size_multiple - (vocab_size % + pad_vocab_size_multiple) diff --git a/tensorrt_llm/models/mamba/convert.py b/tensorrt_llm/models/mamba/convert.py new file mode 100644 index 000000000..f55bda43c --- /dev/null +++ b/tensorrt_llm/models/mamba/convert.py @@ -0,0 +1,245 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import copy +import re +import time +from pathlib import Path +from typing import Union + +import torch + +import tensorrt_llm +from tensorrt_llm.models.convert_utils import (iterate_shard_files, + load_state_dict) + + +def get_weight(config, prefix, dtype): + return config[prefix + '.weight'].to(dtype).detach() + + +def get_bias(config, prefix, dtype): + if (prefix + '.bias') in config: + return config[prefix + '.bias'].to(dtype).detach() + return None + + +def get_weight_and_bias(config, prefix, dtype_w, dtype_b): + return get_weight(config, prefix, + dtype_w), get_bias(config, prefix, dtype_b) + + +def split(v, tp_size, idx, dim=0): + assert v.shape[dim] % tp_size == 0 + split_size = v.shape[dim] // tp_size + if tp_size == 1: + return v + return torch.split(v, split_size, dim=dim)[idx] + + +def rename_hf_to_tllm(name: str): + """ Rename a HF parameter name by the corresponding TRT-LLM style name. """ + # remove model + if 'model.' in name: + name = name.replace('model.', '') + + # change layer name + if 'embeddings.' in name: + name = name.replace('embeddings', 'vocab_embedding') + elif 'embedding.' in name: + name = name.replace('embedding', 'vocab_embedding') + norm_pattern = r'\d\.norm\.' + if 'mixer.' in name: + name = name.replace('mixer.', 'ssm.') + elif re.search(norm_pattern, name): + name = name.replace('norm.', 'input_layernorm.') + elif 'norm_f.' in name: + name = name.replace('norm_f.', 'ln_f.') + + # Parameter names in ssm layers + if 'A_log' in name: + name = name.replace('A_log', 'A') + elif 'dt_proj.bias' in name: + name = name.replace('dt_proj.bias', 'dt_bias') + return name + + +def convert_hf_mamba(hf_mamba, dtype='float32'): + weights = {} + tik = time.time() + + model_params = dict(hf_mamba.named_parameters()) + dtype = getattr(torch, dtype) + + # Parameter names in mamba block + for l in range(hf_mamba.config.num_hidden_layers): + # ssm layer + prefix = f'backbone.layers.{l}.mixer.' 
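A quick, self-contained illustration of how the tensor-parallel split helper and the HF-to-TRT-LLM renaming defined above are meant to be used; the weight shape and the example names are hypothetical and chosen only to make the behavior concrete.

import torch

def split(v, tp_size, idx, dim=0):
    # Same contract as the helper above: return this rank's contiguous shard.
    assert v.shape[dim] % tp_size == 0
    if tp_size == 1:
        return v
    return torch.split(v, v.shape[dim] // tp_size, dim=dim)[idx]

in_proj = torch.randn(4096, 2048)  # hypothetical [d_inner, hidden_size] weight
shard_rank0 = split(in_proj, tp_size=2, idx=0, dim=0)  # rows 0..2047
shard_rank1 = split(in_proj, tp_size=2, idx=1, dim=0)  # rows 2048..4095
assert shard_rank0.shape == (2048, 2048) and shard_rank1.shape == (2048, 2048)

# rename_hf_to_tllm (defined above) would map names such as:
#   'backbone.layers.0.mixer.A_log'  -> 'backbone.layers.0.ssm.A'
#   'backbone.layers.0.norm.weight'  -> 'backbone.layers.0.input_layernorm.weight'
#   'backbone.norm_f.weight'         -> 'backbone.ln_f.weight'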
+ tllm_prex = f'backbone.layers.{l}.ssm.' + for layer in ['conv1d', 'x_proj', 'dt_proj', 'out_proj']: + dtype_b = torch.float32 if layer == 'dt_proj' else dtype + weight, bias = get_weight_and_bias(model_params, prefix + layer, + dtype, dtype_b) + if layer == 'conv1d': + weight = weight.unsqueeze(3) + tllm_weight_name = tllm_prex + layer + '.weight' + tllm_bias_name = tllm_prex + ('dt_bias' if layer == 'dt_proj' else + layer + '.bias') + weights[tllm_weight_name] = weight + if bias is not None: + weights[tllm_bias_name] = bias + # in_proj + weight, bias = get_weight_and_bias(model_params, prefix + 'in_proj', + dtype, dtype) + in_proj_weights = torch.split(weight, weight.size(0) // 2, dim=0) + tllm_weight_name = tllm_prex + 'in_proj.weight' + weights[tllm_weight_name.replace('proj', 'proj_x')] = in_proj_weights[0] + weights[tllm_weight_name.replace('proj', 'proj_z')] = in_proj_weights[1] + if bias is not None: + in_proj_biases = torch.split(bias, bias.size(0) // 2, dim=0) + tllm_bias_name = tllm_prex + 'in_proj.bias' + weights[tllm_bias_name.replace('proj', + 'proj_x')] = in_proj_biases[0] + weights[tllm_bias_name.replace('proj', + 'proj_x')] = in_proj_biases[1] + + # A and D + Aparam = model_params[prefix + 'A_log'].float().detach() + Aparam = Aparam.permute(1, 0).contiguous() + weights[tllm_prex + 'A'] = -torch.exp(Aparam) + weights[tllm_prex + 'D'] = model_params[prefix + 'D'].float().detach() + # norm + prefix = f'backbone.layers.{l}.norm' + tllm_prex = f'backbone.layers.{l}.input_layernorm.' + weight, bias = get_weight_and_bias(model_params, prefix, dtype, dtype) + weights[tllm_prex + 'weight'] = weight + if bias is not None: + weights[tllm_prex + 'bias'] = bias + + # others + for layer in ['backbone.embeddings', 'backbone.norm_f']: + weight, bias = get_weight_and_bias(model_params, layer, dtype, dtype) + layer = layer.replace('embeddings', 'vocab_embedding') + layer = layer.replace('norm_f', 'ln_f') + weights[layer + '.weight'] = weight + if bias is not None: + weights[layer + '.bias'] = bias + weights['lm_head.weight'], _ = get_weight_and_bias(model_params, + 'backbone.embeddings', + dtype, dtype) + + tok = time.time() + t = time.strftime('%H:%M:%S', time.gmtime(tok - tik)) + print(f'Weights loaded. 
Total time: {t}') + return weights + + +def convert_from_hf_checkpoint(mamba_config: dict, model_dir: Union[str, Path]): + + print('Loading weights from HF Mamba...') + tik = time.time() + + tp_rank = mamba_config.mapping.tp_rank + tp_size = mamba_config.mapping.tp_size + d_inner = mamba_config.rnn_hidden_size + d_state = mamba_config.state_size + dtype = mamba_config.dtype + mamba_version = mamba_config.mamba_version + weights = {} + if isinstance(dtype, str): + dtype = tensorrt_llm.str_dtype_to_torch(dtype) + + for model_file in iterate_shard_files(model_dir, 0): + # logger.debug(f'Loading file {str(model_file)}...') + model_params = load_state_dict(model_file, dtype=dtype) + for name, param in model_params.items(): + # logger.debug(f'Converting weight {name}...') + tllm_name = rename_hf_to_tllm(name) + param = param.detach().cpu() + if 'A_log' in name: + param = -torch.exp(param.float()) + if mamba_version == 'Mamba1': + param = param.permute(1, 0).contiguous() + elif 'D' in name: + param = param.float() + elif 'dt_proj.bias' in name: + param = param.float() + elif 'dt_bias' in name: + param = param.float() + elif 'conv1d.weight' in name: + param = param.unsqueeze(3) + + # split in_proj in Mamba1 + if 'in_proj' in name and mamba_version == 'Mamba1': + in_proj_params = torch.split(param, param.size(0) // 2, dim=0) + weights[tllm_name.replace('proj', 'proj_x')] = in_proj_params[0] + weights[tllm_name.replace('proj', 'proj_z')] = in_proj_params[1] + elif 'in_proj' in name and mamba_version == 'Mamba2': + nheads = d_inner // mamba_config.rnn_head_size + ngroups = mamba_config.ngroups + + in_proj_z, in_proj_x, in_proj_b, in_proj_c, in_proj_dt = torch.split( + param, [ + d_inner, d_inner, ngroups * d_state, ngroups * d_state, + nheads + ], + dim=0) + in_proj_z = split(in_proj_z, tp_size, tp_rank, dim=0) + in_proj_x = split(in_proj_x, tp_size, tp_rank, dim=0) + in_proj_b = split(in_proj_b, tp_size, tp_rank, dim=0) + in_proj_c = split(in_proj_c, tp_size, tp_rank, dim=0) + in_proj_dt = split(in_proj_dt, tp_size, tp_rank, dim=0) + in_proj = torch.concat( + [in_proj_z, in_proj_x, in_proj_b, in_proj_c, in_proj_dt]) + weights[tllm_name] = in_proj.contiguous() + elif 'conv1d' in name and mamba_version == 'Mamba2': + ngroups = mamba_config.ngroups + conv_x, conv_b, conv_c = torch.split( + param, [d_inner, ngroups * d_state, ngroups * d_state], + dim=0) + conv_x = split(conv_x, tp_size, tp_rank, dim=0) + conv_b = split(conv_b, tp_size, tp_rank, dim=0) + conv_c = split(conv_c, tp_size, tp_rank, dim=0) + conv = torch.concat([conv_x, conv_b, conv_c]) + weights[tllm_name] = conv.contiguous() + elif any(keyword in name for keyword in ( + 'mixer.norm.weight', + 'A_log', + 'D', + 'dt_proj.bias', + 'dt_bias', + )) and mamba_version == 'Mamba2': + weights[tllm_name] = split(param, tp_size, tp_rank, dim=0) + elif 'out_proj' in name and mamba_version == 'Mamba2': + weights[tllm_name] = split(param, tp_size, tp_rank, + dim=1).contiguous() + else: + weights[tllm_name] = param + del model_params + + # lm_head + emb = weights['backbone.vocab_embedding.weight'] + if 'lm_head.weight' not in weights or weights['lm_head.weight'].data_ptr( + ) == emb.data_ptr(): + weights['lm_head.weight'] = copy.deepcopy(emb) + if mamba_version == 'Mamba2': + weights['lm_head.weight'] = split(weights['lm_head.weight'], + tp_size, + tp_rank, + dim=0) + + tok = time.time() + t = time.strftime('%H:%M:%S', time.gmtime(tok - tik)) + print(f'Weights loaded. 
Total time: {t}') + return weights diff --git a/tensorrt_llm/models/mamba/model.py b/tensorrt_llm/models/mamba/model.py index 7d2aac4d6..79a20798d 100644 --- a/tensorrt_llm/models/mamba/model.py +++ b/tensorrt_llm/models/mamba/model.py @@ -12,20 +12,25 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import os from collections import OrderedDict -from typing import List, Optional +from typing import List, Optional, Union import tensorrt as trt +from transformers import AutoModelForCausalLM from ..._common import default_net from ..._utils import str_dtype_to_trt from ...functional import (Tensor, arange, cast, concat, expand, gather_last_token_logits, shape, unsqueeze) from ...layers import ColumnLinear, Embedding, LayerNorm, Mamba, Mamba2, RmsNorm +from ...mapping import Mapping from ...module import Module, ModuleList from ...plugin import current_all_reduce_helper from ..generation_mixin import GenerationMixin -from ..modeling_utils import PretrainedConfig, PretrainedModel +from ..modeling_utils import PretrainedConfig, PretrainedModel, QuantConfig +from .config import MambaConfig +from .convert import convert_from_hf_checkpoint, convert_hf_mamba class MambaLayer(Module): @@ -168,6 +173,7 @@ def forward(self, class MambaForCausalLM(PretrainedModel): + config_class = MambaConfig def __init__(self, config: PretrainedConfig): super().__init__(config) @@ -425,3 +431,42 @@ def prepare_inputs( return_dict['slot_mapping'] = slot_mapping return return_dict + + @classmethod + def from_hugging_face( + cls, + hf_model_or_dir: Union[str, 'transformers.PreTrainedModel'], + dtype: str = 'auto', + mapping: Optional[Mapping] = None, + quant_config: Optional[QuantConfig] = None, + **kwargs): + import transformers + + assert hf_model_or_dir is not None + use_preloading = isinstance(hf_model_or_dir, + transformers.PreTrainedModel) + if use_preloading: + hf_model = hf_model_or_dir + hf_config_or_dir = hf_model.config + else: + hf_model_dir = hf_model_or_dir + hf_config_or_dir = hf_model_or_dir + config = MambaConfig.from_hugging_face(hf_config_or_dir, + dtype=dtype, + mapping=mapping, + quant_config=quant_config, + **kwargs) + + if not os.path.exists(hf_model_dir): + hf_model = AutoModelForCausalLM.from_pretrained( + hf_model_dir, torch_dtype="auto", trust_remote_code=True) + + assert isinstance(hf_model, transformers.PreTrainedModel) + weights = convert_hf_mamba(hf_model, dtype) + else: + weights = convert_from_hf_checkpoint(config, hf_model_dir) + + model = cls(config) + model.load(weights) + + return model diff --git a/tensorrt_llm/models/model_weights_loader.py b/tensorrt_llm/models/model_weights_loader.py index e30f437e8..6bdc1871b 100644 --- a/tensorrt_llm/models/model_weights_loader.py +++ b/tensorrt_llm/models/model_weights_loader.py @@ -8,6 +8,7 @@ import torch from safetensors import safe_open from tqdm import tqdm +from transformers import PreTrainedModel from .._utils import trt_dtype_to_torch from ..layers.moe import MOEWeightWrapper @@ -17,6 +18,7 @@ class ModelWeightsFormat(Enum): + IN_MEMORY = "in_mem" SAFETENSORS = "safetensors" BINARY = "bin" PYTORCH = "pth" @@ -69,7 +71,7 @@ def translate_to_external_key( """Translate TRT-LLM key into HF key or HF key list (e.g. QKV/MoE/GPTQ) tllm_key will get translated into HF format section by section. 
- If one section is responeded with multiple hf_keys in a list, \ + If one section is responded with multiple hf_keys in a list, \ the translated keys will also get multiplied accordingly. tllm_key : "transformer.layers.0.attention. qkv .weight" | | | | | | @@ -135,9 +137,13 @@ def detect_format(self): else: raise NotImplementedError( "Only safetensors/pickle/binary directories are supported.") + elif isinstance(self.model_dir, dict) or isinstance( + self.model_dir, PreTrainedModel): + self.format = ModelWeightsFormat.IN_MEMORY else: raise NotImplementedError( - "args.model_dir is Neither a directory nor a file!") + "args.model_dir is not a directory, a file or an in-memory module!" + ) def preload(self): # Initialize shards and load_func @@ -145,9 +151,14 @@ def preload(self): shard_files = glob.glob(self.model_dir + "/*." + self.format.value) elif os.path.isfile(self.model_dir): shard_files = [self.model_dir] + elif isinstance(self.model_dir, dict): + shard_files = [self.model_dir] + elif isinstance(self.model_dir, PreTrainedModel): + shard_files = [dict(self.model_dir.named_parameters())] else: raise NotImplementedError( - "args.model_dir is Neither a directory nor a file!") + "args.model_dir is not a directory, a file or an in-memory module!" + ) shard_files.sort() if self.format == ModelWeightsFormat.SAFETENSORS: self.shards = [ @@ -158,6 +169,8 @@ def preload(self): torch.load(f, weights_only=True, map_location="cpu", mmap=True) for f in shard_files ] + elif self.format == ModelWeightsFormat.IN_MEMORY: + self.shards = [shard_files[0]] else: raise NotImplementedError( "Only *.safetensors/*.pth/*.bin files are supported.") @@ -177,7 +190,7 @@ def load_tensor(self, key, tp_size=1, tp_dim=-1, tp_rank=0): if tensor_shape == []: tensor = self.shards[ptr_idx].get_tensor(key).unsqueeze(0) tensor_shape = tensor.shape - elif self.format == ModelWeightsFormat.BINARY or self.format == ModelWeightsFormat.PYTORCH: + else: tensor = self.shards[ptr_idx][key] tensor_shape = tensor.shape @@ -244,6 +257,11 @@ def load(self, elif tllm_key.endswith("weight"): tp_dim = 1 - tp_dim tp_size = sub_module.tp_size if hasattr(sub_module, "tp_size") else 1 + # Disable auto TP when num_kv_heads is invalid for split + if getattr(sub_module, "is_qkv", + False) and self.model.config.num_key_value_heads < tp_size: + tp_dim = -1 + tp_size = 1 if skip_tp: tp_dim = -1 tp_size = 1 @@ -287,23 +305,49 @@ def load(self, return weight_dict - def check_share_embedding(self): + def check_share_embedding(self, config): + # TODO: Remove after --use_share_embedding is removed + if not config.share_embedding_table: + return + + from ..logger import logger lm_head_weights = self.load_tensor( self.translate_to_external_key("lm_head.weight", self.tllm_to_externel_key_dict)) vocab_embed_weights = self.load_tensor( self.translate_to_external_key("transformer.vocab_embedding.weight", self.tllm_to_externel_key_dict)) + share_embedding_table = False if lm_head_weights is not None and vocab_embed_weights is not None: if lm_head_weights.shape == vocab_embed_weights.shape: if not (lm_head_weights - vocab_embed_weights).any(): - return True - from ..logger import logger - logger.warning( - "lm_head.weight and transformer.vocab_embedding.weight are not identical, " - "share_embedding_table cannot be enabled; setting share_embedding_table=False." 
- ) - return False + share_embedding_table = True + elif lm_head_weights is None and vocab_embed_weights is not None: + self.tllm_to_externel_key_dict[ + 'lm_head'] = self.tllm_to_externel_key_dict[ + 'transformer'] + '.' + self.tllm_to_externel_key_dict[ + 'vocab_embedding'] + share_embedding_table = True + elif lm_head_weights is not None and vocab_embed_weights is None: + self.tllm_to_externel_key_dict[ + 'vocab_embedding'] = self.tllm_to_externel_key_dict['lm_head'] + share_embedding_table = True + + # Validation + mapping = config.mapping + if mapping.tp_size > 1: + if (not config.use_parallel_embedding) or ( + config.use_parallel_embedding + and config.embedding_sharding_dim == 1): + share_embedding_table = False + if mapping.pp_size > 1: + share_embedding_table = False + if mapping.cp_size > 1: + share_embedding_table = False + config.share_embedding_table = share_embedding_table + + if config.share_embedding_table: + logger.info("share_embedding_table enabled.") def update_key_mapping(self, model): self.model = weakref.ref(model)() @@ -312,11 +356,18 @@ def update_key_mapping(self, model): if config.mapping.has_pp(): pp_layers = config.mapping.pp_layers(config.num_hidden_layers) self.tllm_to_externel_key_dict.update({ - str(tllm_locl_layer_idx): str(hf_global_layer_idx) - for tllm_locl_layer_idx, hf_global_layer_idx in enumerate( + str(tllm_local_layer_idx): str(hf_global_layer_idx) + for tllm_local_layer_idx, hf_global_layer_idx in enumerate( pp_layers) }) + # Share embedding + if self.tllm_to_externel_key_dict[ + 'vocab_embedding'] == self.tllm_to_externel_key_dict['lm_head']: + self.model.transformer.vocab_embedding.tllm_to_externel_key_dict = { + self.tllm_to_externel_key_dict['transformer']: '', + } + def fill(self, weights): for tllm_key, param in self.model.named_parameters(): if param.is_buffer: diff --git a/tensorrt_llm/models/modeling_utils.py b/tensorrt_llm/models/modeling_utils.py index 5a03adc9f..16722a74f 100644 --- a/tensorrt_llm/models/modeling_utils.py +++ b/tensorrt_llm/models/modeling_utils.py @@ -3,18 +3,21 @@ import dataclasses import json import os +import re from enum import IntFlag, auto from functools import cached_property from pathlib import Path -from typing import TYPE_CHECKING, Dict, Generator, List, Optional, Union +from typing import (TYPE_CHECKING, Callable, Dict, Generator, List, Optional, + Union) import numpy as np import safetensors import torch from .._common import default_net -from .._utils import (get_init_params, numpy_to_torch, release_gc, - str_dtype_to_torch, str_dtype_to_trt, trt_dtype_to_torch) +from .._utils import (QuantModeWrapper, get_init_params, numpy_to_torch, + release_gc, str_dtype_to_torch, str_dtype_to_trt, + trt_dtype_to_torch) from ..bindings import KVCacheType from ..functional import (PositionEmbeddingType, Tensor, gather_last_token_logits, tanh) @@ -106,7 +109,17 @@ def use_plugin_sq(self): return self.quant_algo in W8A8_SQ_PLUGIN_LIST @cached_property - def quant_mode(self) -> QuantMode: + def quant_mode(self) -> QuantModeWrapper: + quant_mode_list = [ + QuantMode.from_quant_algo( + self.quant_algo, + self.kv_cache_quant_algo, + ) + ] + return QuantModeWrapper(quant_mode_list) + + @cached_property + def layer_quant_mode(self) -> QuantMode: return QuantMode.from_quant_algo( self.quant_algo, self.kv_cache_quant_algo, @@ -123,7 +136,8 @@ def requires_calibration(self): def requires_modelopt_quantization(self): if self.quant_algo in [ QuantAlgo.W4A16_AWQ, QuantAlgo.FP8, - QuantAlgo.W8A8_SQ_PER_CHANNEL, QuantAlgo.W4A8_AWQ + 
QuantAlgo.W8A8_SQ_PER_CHANNEL, QuantAlgo.W4A8_AWQ, + QuantAlgo.MIXED_PRECISION ]: return True elif self.quant_algo is None and self.kv_cache_quant_algo == QuantAlgo.FP8: @@ -131,6 +145,9 @@ def requires_modelopt_quantization(self): else: return False + def get_quant_cfg(self, module_name=None): + return self + def get_modelopt_qformat(self): algo_to_modelopt_map = { QuantAlgo.W8A16: "int8_wo", @@ -140,6 +157,7 @@ def get_modelopt_qformat(self): QuantAlgo.FP8: 'fp8', QuantAlgo.W8A8_SQ_PER_CHANNEL: 'int8_sq', } + assert self.quant_algo != QuantAlgo.MIXED_PRECISION, f"We don't support mixed precision in QuantConfig" if self.quant_algo is not None: assert self.quant_algo in algo_to_modelopt_map, f"We don't use Modelopt for quantization algorithm {self.quant_algo}, you probably shall not call this" return algo_to_modelopt_map[self.quant_algo] @@ -159,12 +177,102 @@ def get_modelopt_kv_cache_dtype(self): @classmethod def from_dict(cls, config: dict): - return cls(**config) + obj = cls(**config) + return obj def to_dict(self): return dataclasses.asdict(self) +@dataclasses.dataclass +class LayerQuantConfig(QuantConfig): + quant_algo: Optional[QuantConfig] = None + kv_cache_quant_algo: Optional[QuantConfig] = None + quantized_layers: Optional[Dict[str, QuantConfig]] = None + exclude_modules: Optional[List[str]] = None + + def __init__(self, + *, + quant_algo: Optional[QuantConfig] = None, + kv_cache_quant_algo: Optional[QuantConfig] = None, + quantized_layers: Optional[Dict[str, QuantConfig]] = None, + exclude_modules: Optional[List[str]] = None, + **kwargs): + self.quant_algo = quant_algo + self.quantized_layers = quantized_layers + self.kv_cache_quant_algo = kv_cache_quant_algo + self.exclude_modules = exclude_modules + self.auto_quant_mode = {} + for name, layer_config in self.quantized_layers.items(): + self.auto_quant_mode.update({ + name: + QuantMode.from_quant_algo( + layer_config.quant_algo, + self.kv_cache_quant_algo, + ) + }) + for key in kwargs: + logger.warning( + f"Warning: Unrecognized parameter '{key}' with value '{kwargs[key]}'" + ) + + @cached_property + def quant_mode(self): + quant_mode_list = list(set(self.auto_quant_mode.values())) + return QuantModeWrapper(quant_mode_list) + + @property + def layer_quant_mode(self) -> Dict[str, QuantMode]: + return self.auto_quant_mode + + @cached_property + def auto_quant_list(self): + quant_list = [] + for _, layer_config in self.quantized_layers.items(): + quant_list.append(layer_config.quant_algo) + return list(set(quant_list)) + + @classmethod + def from_dict(cls, config: dict): + quantized_layers = config.pop('quantized_layers', {}) + + quantized_layers_dict = { + layer_name: QuantConfig(**layer_config) + for layer_name, layer_config in quantized_layers.items() + } + + obj = cls(quantized_layers=quantized_layers_dict, **config) + return obj + + def get_quant_cfg(self, module_name): + assert module_name in self.quantized_layers.keys(), \ + "module {module_name} should be included in `quantized_layers` in AutoQuant mode" + return self.quantized_layers[module_name] + + def get_modelopt_qformat(self): + algo_to_modelopt_map = { + QuantAlgo.W4A16_AWQ: "int4_awq", + QuantAlgo.W4A8_AWQ: 'w4a8_awq', + QuantAlgo.FP8: 'fp8', + QuantAlgo.W8A8_SQ_PER_CHANNEL: 'int8_sq', + } + assert self.quant_algo == QuantAlgo.MIXED_PRECISION, f"We only support mixed precision quantization in LayerQuantConfig" + autoq_format = ','.join( + [algo_to_modelopt_map[item] for item in self.auto_quant_list]) + return autoq_format + + def to_dict(self): + output = 
copy.deepcopy(self.__dict__) + output.pop('auto_quant_mode', None) + output.pop('quant_mode', None) + output.pop('exclude_modules', None) + for name, per_layer_config in output['quantized_layers'].items(): + per_layer_config = per_layer_config.to_dict() + per_layer_config.pop('exclude_modules') + output['quantized_layers'][name] = per_layer_config + return output + + class PretrainedConfig: def __init__(self, @@ -268,6 +376,8 @@ def __init__(self, @property def kv_dtype(self): + # TODO: need to align the kv dtype + # now assume the kv cache is for all layers if self.quant_mode.has_int8_kv_cache(): return 'int8' elif self.quant_mode.has_fp8_kv_cache(): @@ -301,7 +411,17 @@ def to_dict(self): def from_json_file(cls, config_file: str): with open(config_file) as f: config = json.load(f) - return cls.from_dict(config) + obj = cls.from_dict(config) + if obj.quantization.quant_algo == QuantAlgo.MIXED_PRECISION: + try: + layer_config_path = str(config_file).replace( + 'config.json', 'quant_cfg.json') + obj.to_layer_quant_config(layer_config_path) + except Exception as e: + raise RuntimeError( + f"Encounter error '{e}' for read quantization config '{layer_config_path}'" + ) + return obj @classmethod def from_checkpoint(cls, ckpt_dir: str): @@ -311,10 +431,22 @@ def to_json_file(self, config_file: str): with open(config_file, 'w') as f: json.dump(self.to_dict(), f, indent=4) + def to_layer_quant_config(self, config_file: str): + with open(config_file) as f: + config = json.load(f) + self.quantization = LayerQuantConfig.from_dict(config) + @property def quant_mode(self): return self.quantization.quant_mode + @property + def quant_algo(self): + return self.quantization.quant_algo + + def get_quant_cfg(self, module_name: str): + return self.quantization.get_quant_cfg(module_name) + def set_rank(self, rank): self.mapping = Mapping(self.mapping.world_size, rank=rank, @@ -380,8 +512,10 @@ def forward(self, if default_net().plugin_config.reduce_fusion: if layer_idx < self.layer_list[-1]: kwargs['next_layer_input_layernorm_args'] = ( - self[layer_idx + 1].input_layernorm.weight.value, - self[layer_idx + 1].input_layernorm.eps) + self[layer_idx + 1 - + self.layer_list[0]].input_layernorm.weight.value, + self[layer_idx + 1 - + self.layer_list[0]].input_layernorm.eps) else: kwargs['next_layer_input_layernorm_args'] = None @@ -403,6 +537,8 @@ def forward(self, host_kv_cache_block_offsets, host_kv_cache_pool_pointers=kv_cache_params. host_kv_cache_pool_pointers, + host_kv_cache_pool_mapping=kv_cache_params. 
+ host_kv_cache_pool_mapping, cache_indirection=kv_cache_params.cache_indirection), attention_params=attention_params, **kwargs) @@ -462,10 +598,14 @@ def from_config(cls, config: PretrainedConfig): return cls(config) @classmethod - def from_checkpoint(cls, - ckpt_dir: str, - rank: Optional[int] = None, - config: Optional[PretrainedConfig] = None): + def from_checkpoint( + cls, + ckpt_dir: str, + rank: Optional[int] = None, + config: Optional[PretrainedConfig] = None, + *, + preprocess_weights_hook: Optional[Callable[[Dict[str, Tensor]], + Dict[str, Tensor]]] = None): if config is None: config = PretrainedConfig.from_json_file( os.path.join(ckpt_dir, 'config.json')) @@ -478,9 +618,14 @@ def from_checkpoint(cls, assert os.path.isfile(weights_path) weights = safetensors.torch.load_file(weights_path) - is_checkpoint_pruned = getattr(config, 'is_pruned', False) - preprocess_weights(weights, config, from_pruned=is_checkpoint_pruned) + + if preprocess_weights_hook is not None: + weights = preprocess_weights_hook(weights) + + weights = preprocess_weights(weights, + config, + from_pruned=is_checkpoint_pruned) model = cls(config) model.load(weights, from_pruned=is_checkpoint_pruned) return model @@ -629,6 +774,8 @@ def prepare_inputs( 'host_kv_cache_block_offsets'], host_kv_cache_pool_pointers=model_inputs[ 'host_kv_cache_pool_pointers'], + host_kv_cache_pool_mapping=model_inputs[ + 'host_kv_cache_pool_mapping'], cache_indirection=model_inputs['cache_indirection'], ), 'attention_params': @@ -817,11 +964,6 @@ def fuse_gate_mlp( ) -> PretrainedModel: from ..quantization.quantize import fp8_quantize - quant_algo = model.config.quantization.quant_algo - if quant_algo != QuantAlgo.FP8 and quant_algo is not None: - logger.warning("fuse_gate_mlp cannot be done for this model. 
Skipping.") - return model - for name, mlp, layer in model.named_modules_with_parent(): if isinstance(mlp, GatedMLP): init_params = get_init_params(mlp) @@ -836,9 +978,18 @@ def fuse_gate_mlp( init_params["inner_layernorm"] = mlp.inner_layernorm is not None fused_layer = FusedGatedMLP(**init_params) - if quant_algo == QuantAlgo.FP8: - fused_layer = fp8_quantize(fused_layer, - model.config.quantization) + fc_name = name + '.fc' + layer_quant_cfg = model.config.get_quant_cfg(fc_name) + layer_quant_algo = layer_quant_cfg.quant_algo + if layer_quant_algo != QuantAlgo.FP8 and layer_quant_algo is not None: + continue + + if isinstance(model.config.quantization.exclude_modules, list) \ + and fc_name in model.config.quantization.exclude_modules: + layer_quant_algo = None + + if layer_quant_algo == QuantAlgo.FP8: + fused_layer = fp8_quantize(fused_layer, layer_quant_cfg) if isinstance(mlp.dtype, str): dtype = str_dtype_to_torch(mlp.dtype) @@ -891,7 +1042,7 @@ def fuse_gate_mlp( mlp.gate.activation_scaling_factor.raw_value, mlp.fc.activation_scaling_factor.raw_value, ) - elif quant_algo is None: + elif layer_quant_algo is None: fused_layer.fused_fc.weight.value = np.concatenate( [ mlp.gate.weight.raw_value, @@ -904,7 +1055,7 @@ def fuse_gate_mlp( [mlp.gate.bias.raw_value, mlp.fc.bias.raw_value], axis=0) else: - raise ValueError(f'Unsupported quant algo: {quant_algo}') + raise ValueError(f'Unsupported quant algo: {layer_quant_algo}') fused_layer.proj = mlp.proj fused_layer.inner_layernorm = mlp.inner_layernorm @@ -950,9 +1101,10 @@ def unfuse_qkv_gemm(model: PretrainedModel) -> PretrainedModel: layer.tp_size * layer.num_attention_kv_heads * layer.attention_head_size, }) - q = quantize(q, model.config.quantization) - k = quantize(k, model.config.quantization) - v = quantize(v, model.config.quantization) + layer_quant_cfg = model.config.get_quant_cfg(name + '.qkv') + q = quantize(q, layer_quant_cfg) + k = quantize(k, layer_quant_cfg) + v = quantize(v, layer_quant_cfg) out_features = q.out_features + k.out_features + v.out_features if isinstance(layer.qkv, ( WeightOnlyQuantLinear, @@ -1130,7 +1282,8 @@ def share_embedding(model: PretrainedModel) -> PretrainedModel: def set_fp8_context_fhma(model: PretrainedModel) -> PretrainedModel: for name, layer in model.named_modules(): - if isinstance(layer, Attention): + if isinstance(layer, Attention) and hasattr( + layer.dense, 'activation_scaling_factor'): scale = [1.0] / layer.dense.activation_scaling_factor.raw_value layer.attention_output_orig_quant_scale = Parameter( value=scale.astype(np.float32)) @@ -1180,19 +1333,11 @@ def optimize_model( return model -def preprocess_weights(weights: Dict[str, torch.Tensor], - model_config: PretrainedConfig, - from_pruned=False) -> None: - """This function in-place modifies weights and model_config, making them compatible with each other. - - Note: Typically, it should be called before model creation and weight loading. 
For example, - preprocess_weights(weights, model_config) - model = XXXForCausalLM(model_config) - model.load(weights) - """ - quant_algo = model_config.quantization.quant_algo +def preprocess_perlayer_weights(weights, + model_config, + quant_algo, + from_pruned=False): exclude_modules = model_config.quantization.exclude_modules - # INT4_AWQ if quant_algo == QuantAlgo.W4A8_AWQ or quant_algo == QuantAlgo.W4A16_AWQ: preprocessor = torch.ops.trtllm.preprocess_weights_for_mixed_gemm @@ -1267,20 +1412,76 @@ def preprocess_weights(weights: Dict[str, torch.Tensor], exclude_modules=exclude_modules, plugin=True) - # Parallel block rowlinear should not have duplicate bias. - elif model_config.architecture == 'GPTJForCausalLM': - if model_config.mapping.tp_rank > 0: - for name, param in weights.items(): + +def preprocess_weights(weights: Dict[str, torch.Tensor], + model_config: PretrainedConfig, + from_pruned=False) -> None: + """This function in-place modifies weights and model_config, making them compatible with each other. + + Note: Typically, it should be called before model creation and weight loading. For example, + preprocess_weights(weights, model_config) + model = XXXForCausalLM(model_config) + model.load(weights) + """ + quant_config = model_config.quantization + quant_algo = quant_config.quant_algo + + pattern_info = ['fc', 'gate', 'proj', 'qkv', 'dense'] + + per_layer_weights = {} + + for name, param in weights.items(): + in_mode = False + for info in pattern_info: + pattern = rf'(.*?{info}.*?)' + pattern_match = re.match(pattern, name) + if pattern_match: + base_name = pattern_match.group(1) + if base_name not in per_layer_weights.keys(): + per_layer_weights[base_name] = {} + per_layer_weights[base_name][name] = param + in_mode = True + break + if not in_mode: + # [lm_head.weight, ln_f.weight, vocab_embedding.weight] + base_name = name.rsplit('.', 1)[0] + if base_name not in per_layer_weights.keys(): + per_layer_weights[base_name] = {} + per_layer_weights[base_name][name] = param + + new_weights = {} + for base_name, layer_weights in per_layer_weights.items(): + if quant_algo != QuantAlgo.MIXED_PRECISION: + layer_quant_algo = quant_algo + else: + if base_name not in quant_config.quantized_layers.keys(): + new_weights.update(layer_weights) + continue + layer_quant_algo = quant_config.quantized_layers[ + base_name].quant_algo + + preprocess_perlayer_weights(layer_weights, model_config, + layer_quant_algo, from_pruned) + new_weights.update(layer_weights) + + weights = new_weights + for name, param in weights.items(): + if model_config.architecture == 'GPTJForCausalLM': + if model_config.mapping.tp_rank > 0: if 'attention.dense.bias' in name or 'mlp.proj.bias' in name: weights[name] = torch.zeros_like(param) # For share_embedding_table check_share_embedding(weights, model_config) + return weights def check_share_embedding(weights: Dict[str, torch.Tensor], model_config: PretrainedConfig): if model_config.share_embedding_table: + if "lm_head.weight" in weights: + if weights["lm_head.weight"] is None: + weights.pop("lm_head.weight") if "lm_head.weight" in weights and "transformer.vocab_embedding.weight" in weights: if (weights["lm_head.weight"] - weights["transformer.vocab_embedding.weight"]).any(): diff --git a/tensorrt_llm/models/nemotron_nas/__init__.py b/tensorrt_llm/models/nemotron_nas/__init__.py new file mode 100644 index 000000000..71bf6d298 --- /dev/null +++ b/tensorrt_llm/models/nemotron_nas/__init__.py @@ -0,0 +1,14 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA 
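preprocess_weights now buckets checkpoint tensors per layer before choosing a (possibly per-layer) quantization algorithm; names that contain none of the known sub-module markers fall back to their parent prefix (lm_head, ln_f, vocab_embedding). A standalone sketch of that bucketing with made-up weight names:

    import re

    pattern_info = ['fc', 'gate', 'proj', 'qkv', 'dense']

    def group_per_layer(names):
        per_layer = {}
        for name in names:
            for info in pattern_info:
                m = re.match(rf'(.*?{info}.*?)', name)
                if m:
                    per_layer.setdefault(m.group(1), []).append(name)
                    break
            else:
                # e.g. lm_head.weight, ln_f.weight, vocab_embedding.weight
                per_layer.setdefault(name.rsplit('.', 1)[0], []).append(name)
        return per_layer

    names = ['transformer.layers.0.mlp.fc.weight',
             'transformer.layers.0.attention.qkv.weight',
             'lm_head.weight']
    print(group_per_layer(names))
    # keys: 'transformer.layers.0.mlp.fc', 'transformer.layers.0.attention.qkv', 'lm_head'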
CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/tensorrt_llm/models/deci/config.py b/tensorrt_llm/models/nemotron_nas/config.py similarity index 86% rename from tensorrt_llm/models/deci/config.py rename to tensorrt_llm/models/nemotron_nas/config.py index b9accc61e..ca3b4fb1b 100644 --- a/tensorrt_llm/models/deci/config.py +++ b/tensorrt_llm/models/nemotron_nas/config.py @@ -21,11 +21,11 @@ from tensorrt_llm.functional import PositionEmbeddingType from tensorrt_llm.logger import logger from tensorrt_llm.mapping import Mapping -from tensorrt_llm.models.deci.convert import hf_block_config_to_layer_config -from tensorrt_llm.models.deci.layer_config import (AttentionConfig, - AttentionImplementation, - DeciLayerConfig, FFNConfig) from tensorrt_llm.models.modeling_utils import PretrainedConfig, QuantConfig +from tensorrt_llm.models.nemotron_nas.convert import \ + hf_block_configs_to_layer_configs +from tensorrt_llm.models.nemotron_nas.layer_config import ( + AttentionConfig, AttentionImplementation, DeciLayerConfig, FFNConfig) class DeciConfig(PretrainedConfig): @@ -60,6 +60,7 @@ def __init__(self, Dict[str, Dict[str, Any]]]]] = None, + block_configs: Optional[object] = None, **kwargs): super().__init__(architecture=architecture, dtype=dtype, @@ -86,7 +87,13 @@ def __init__(self, self.rotary_base = rotary_base self.rotary_scaling = rotary_scaling - if layer_configs is not None: + if block_configs is not None: + assert layer_configs is None + self.layer_configs = hf_block_configs_to_layer_configs( + block_configs, + num_attention_heads=num_attention_heads, + hidden_size=hidden_size) + elif layer_configs is not None: assert len( layer_configs ) == num_hidden_layers, f"num_hidden_layers ({num_hidden_layers}) must match len(layer_configs) ({len(layer_configs)})" @@ -102,6 +109,14 @@ def __init__(self, for layer_idx in range(self.num_hidden_layers) ] + # HACK: this is here since the runtime doesn't parse the layer_configs yet + self.num_kv_heads_per_layer = [] + for layer_idx in range(self.num_hidden_layers): + layer_config = self.get_layer_config(layer_idx) + if layer_config.is_attention_layer: + self.num_kv_heads_per_layer.append( + layer_config.attention.num_key_value_heads) + def _ensure_layer_configs( self, layer_configs: List[Union[DeciLayerConfig, Dict[str, Any]]] ) -> List[DeciLayerConfig]: @@ -154,16 +169,16 @@ def from_hugging_face( hf_config = transformers.AutoConfig.from_pretrained( hf_config_or_dir, trust_remote_code=trust_remote_code) - assert hf_config.model_type == "deci", f"Unsupported model type: {hf_config.model_type}" + assert hf_config.model_type in ( + "deci", + "nemotron-nas"), f"Unsupported model type: {hf_config.model_type}" block_configs = getattr(hf_config, "block_configs", None) if block_configs is not None: - layer_configs = [ - hf_block_config_to_layer_config(block_config, - hf_config.num_attention_heads, - hf_config.hidden_size) - for block_config in block_configs - ] + 
layer_configs = hf_block_configs_to_layer_configs( + block_configs, + num_attention_heads=hf_config.num_attention_heads, + hidden_size=hf_config.hidden_size) else: # older deci arch num_key_value_heads_per_layer = getattr( diff --git a/tensorrt_llm/models/deci/convert.py b/tensorrt_llm/models/nemotron_nas/convert.py similarity index 77% rename from tensorrt_llm/models/deci/convert.py rename to tensorrt_llm/models/nemotron_nas/convert.py index c6bff772a..06ca34b61 100644 --- a/tensorrt_llm/models/deci/convert.py +++ b/tensorrt_llm/models/nemotron_nas/convert.py @@ -17,8 +17,9 @@ import time from abc import ABC, abstractmethod from contextlib import contextmanager +from dataclasses import asdict from pathlib import Path -from typing import Any, Dict, Iterator, Optional, TypedDict, Union +from typing import Any, Dict, Iterator, List, Optional, TypedDict, Union import safetensors import torch @@ -26,10 +27,9 @@ from tensorrt_llm._utils import pad_vocab_size from tensorrt_llm.logger import logger from tensorrt_llm.models.convert_utils import dup_kv_weight, split -from tensorrt_llm.models.deci.layer_config import (AttentionConfig, - AttentionImplementation, - DeciLayerConfig, FFNConfig, - FFNImplementation) +from tensorrt_llm.models.nemotron_nas.layer_config import ( + AttentionConfig, AttentionImplementation, DeciLayerConfig, FFNConfig, + FFNImplementation) from tensorrt_llm.quantization.mode import QuantAlgo @@ -45,35 +45,39 @@ def _find_multiple(n: int, k: int) -> int: # BlockConfig is a custom class defined inside deci huggingface checkpoints, we can't import it -def hf_block_config_to_layer_config(block_config: "BlockConfig", +def hf_block_config_to_layer_config(block_config: Union["BlockConfig", dict], num_attn_heads: int, hidden_size: int) -> DeciLayerConfig: - attn = block_config.attention - if attn.no_op: + """`block_config` (`Union[BlockConfig, dict]`): A `dict` when exported from `ModelOpt`; A `dataclass` at the HF phase + """ + block_config = block_config if isinstance(block_config, + dict) else asdict(block_config) + attn = block_config["attention"] + if attn["no_op"]: attn_impl = AttentionImplementation.NO_OP num_key_value_heads = None - elif attn.replace_with_linear: + elif attn["replace_with_linear"]: attn_impl = AttentionImplementation.LINEAR num_key_value_heads = None - elif attn.sparsify: + elif attn.get("sparsify", None): raise NotImplementedError("Sparsification is not supported") else: attn_impl = AttentionImplementation.ATTENTION - num_key_value_heads = num_attn_heads // attn.n_heads_in_group + num_key_value_heads = num_attn_heads // attn["n_heads_in_group"] - ffn = block_config.ffn - if ffn.no_op: + ffn = block_config["ffn"] + if ffn["no_op"]: ffn_impl = FFNImplementation.NO_OP intermediate_size = None - elif ffn.replace_with_linear: + elif ffn["replace_with_linear"]: ffn_impl = FFNImplementation.LINEAR intermediate_size = None - elif ffn.sparsify: + elif ffn.get("sparsify", None): raise NotImplementedError("Sparsification is not supported") else: ffn_impl = FFNImplementation.MLP intermediate_size = _ffn_mult_to_intermediate_size( - ffn.ffn_mult, hidden_size) + ffn["ffn_mult"], hidden_size) return DeciLayerConfig(attention=AttentionConfig( impl=attn_impl, num_key_value_heads=num_key_value_heads), @@ -81,6 +85,16 @@ def hf_block_config_to_layer_config(block_config: "BlockConfig", intermediate_size=intermediate_size)) +def hf_block_configs_to_layer_configs( + block_configs: Union["BlockConfig", dict], *, num_attention_heads: int, + hidden_size: int) -> 
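hf_block_config_to_layer_config now accepts the block config either as the dataclass found in Hugging Face checkpoints or as the plain dict produced by a ModelOpt export, normalizing both with dataclasses.asdict. A tiny illustration of that normalization pattern with a stand-in dataclass:

    from dataclasses import asdict, dataclass

    @dataclass
    class AttnCfg:
        no_op: bool = False
        replace_with_linear: bool = False
        n_heads_in_group: int = 4

    def normalize(cfg):
        # Accept either a dataclass instance or an already-plain dict.
        return cfg if isinstance(cfg, dict) else asdict(cfg)

    print(normalize(AttnCfg())['n_heads_in_group'])          # 4
    print(normalize({'no_op': True}).get('sparsify', None))  # None, via dict.get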
List[DeciLayerConfig]: + return [ + hf_block_config_to_layer_config(block_config, num_attention_heads, + hidden_size) + for block_config in block_configs + ] + + @contextmanager def timed_loading() -> Iterator[None]: tik = time.time() @@ -105,12 +119,31 @@ class SafetensorsIndex(TypedDict): class WeightsLoader(ABC): @abstractmethod + def read_weight(self, name: str) -> torch.Tensor: + ... + def get_weight(self, name: str, tp_dim: TpDim = TpDim.NO_TP, tp_size: int = 1, tp_rank: int = 0) -> torch.Tensor: - ... + weight = self.read_weight(name) + if tp_dim != TpDim.NO_TP: + weight = split(weight, tp_size, tp_rank, dim=tp_dim) + return weight + + def get_kv_weight(self, + name: str, + num_heads: int, + tp_size: int = 1, + tp_rank: int = 0) -> torch.Tensor: + weight = self.read_weight(name) + if tp_size > num_heads: + weight = dup_kv_weight(weight, num_heads, tp_size) + if tp_size > 1: + weight = split(weight, tp_size, tp_rank, dim=0) + + return weight class HFModelWeightsLoader(WeightsLoader): @@ -120,18 +153,11 @@ def __init__(self, *, hf_model: "transformers.PreTrainedModel", self.model_params = dict(hf_model.named_parameters()) self.dtype = getattr(torch, dtype) - def get_weight(self, - name: str, - tp_dim: TpDim = TpDim.NO_TP, - tp_size: int = 1, - tp_rank: int = 0) -> torch.Tensor: + def read_weight(self, name: str) -> torch.Tensor: weight = self.model_params[name] if weight.dtype != self.dtype: weight = weight.to(self.dtype) weight = weight.detach() - - if tp_dim != TpDim.NO_TP: - weight = split(weight, tp_size, tp_rank, dim=tp_dim) return weight @@ -163,37 +189,10 @@ def __init__(self, *, model_dir: Path, dtype: str) -> None: for shard_file in shard_files } - def get_weight(self, - name: str, - tp_dim: TpDim = TpDim.NO_TP, - tp_size: int = 1, - tp_rank: int = 0) -> torch.Tensor: + def read_weight(self, name: str) -> torch.Tensor: shard_filename = self.sharding_map['weight_map'].get( name, self.shard_files[0]) - if tp_dim == TpDim.NO_TP: - res = self.safetensors_files[shard_filename].get_tensor(name) - else: - tensor_slice = self.safetensors_files[shard_filename].get_slice( - name) - tensor_shape = tensor_slice.get_shape() - if len(tensor_shape) == 1: - if tp_dim == TpDim.COLWISE: - slice_width = tensor_shape[0] // tp_size - res = tensor_slice[slice_width * tp_rank:slice_width * - (tp_rank + 1)] - else: # row-wise, but 1-dimensional ==> no tp - res = tensor_slice[:] - else: - assert tensor_shape[ - tp_dim] % tp_size == 0, f"Current weight shape is invalid for tp_size={tp_size}" - slice_width = tensor_shape[tp_dim] // tp_size - if tp_dim == TpDim.COLWISE: - res = tensor_slice[slice_width * tp_rank:slice_width * - (tp_rank + 1), :] - else: - res = tensor_slice[:, slice_width * tp_rank:slice_width * - (tp_rank + 1)] - + res = self.safetensors_files[shard_filename].get_tensor(name) return res.to(self.dtype).contiguous() @@ -245,24 +244,20 @@ def load_weight(name: str, tp_dim: TpDim = TpDim.NO_TP) -> torch.Tensor: f"model.layers.{l}.input_layernorm.weight" ) # input_layernorm - qkv = {} - for comp in ["q", "k", "v"]: - weight_part = load_weight( - f"model.layers.{l}.self_attn.{comp}_proj.weight", - TpDim.COLWISE) - qkv[comp] = weight_part - - if layer_config.attention.num_key_value_heads < mapping.tp_size: - # duplicate the KV heads up to tensor_parallel - qkv["k"] = dup_kv_weight( - qkv["k"], layer_config.attention.num_key_value_heads, - mapping.tp_size) - qkv["v"] = dup_kv_weight( - qkv["v"], layer_config.attention.num_key_value_heads, - mapping.tp_size) - + q = 
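After the refactor, get_weight and get_kv_weight live on the WeightsLoader base class and only read_weight is loader-specific, so new loaders implement a single method. A hedged sketch of that split with a toy dict-backed loader; the TpDim enum and split helper are replaced with simple stand-ins:

    import torch

    def tp_split(weight: torch.Tensor, tp_size: int, tp_rank: int, dim: int) -> torch.Tensor:
        # Rough stand-in for the repo's convert_utils.split helper.
        return torch.chunk(weight, tp_size, dim=dim)[tp_rank].contiguous()

    class DictWeightsLoader:
        """Toy loader: only read_weight is backend-specific; TP handling stays shared."""

        def __init__(self, tensors):
            self._tensors = tensors

        def read_weight(self, name):
            return self._tensors[name]

        def get_weight(self, name, tp_dim=-1, tp_size=1, tp_rank=0):
            weight = self.read_weight(name)
            if tp_dim < 0:  # mirrors TpDim.NO_TP
                return weight
            return tp_split(weight, tp_size, tp_rank, dim=tp_dim)

    loader = DictWeightsLoader({'w': torch.arange(16.).reshape(8, 2)})
    print(loader.get_weight('w', tp_dim=0, tp_size=2, tp_rank=1).shape)  # torch.Size([4, 2])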
load_weight(f"model.layers.{l}.self_attn.q_proj.weight", + TpDim.COLWISE) + k = loader.get_kv_weight( + f"model.layers.{l}.self_attn.k_proj.weight", + num_heads=layer_config.attention.num_key_value_heads, + tp_size=mapping.tp_size, + tp_rank=mapping.tp_rank) + v = loader.get_kv_weight( + f"model.layers.{l}.self_attn.v_proj.weight", + num_heads=layer_config.attention.num_key_value_heads, + tp_size=mapping.tp_size, + tp_rank=mapping.tp_rank) weights[f'{tllm_prex}.attention.qkv.weight'] = torch.cat( - [qkv["q"], qkv["k"], qkv["v"]], 0) + [q, k, v], 0) weights[f'{tllm_prex}.attention.dense.weight'] = load_weight( f"model.layers.{l}.self_attn.o_proj.weight", TpDim.ROWWISE) # attention.dense @@ -363,3 +358,23 @@ def load_weights_from_hf_safetensors( loader = SafetensorsWeightsLoader(model_dir=model_dir, dtype=config.dtype) logger.info('Loading weights from Huggingface safetensors...') return load_model_weights(loader=loader, config=config) + + +def update_weights_following_modelopt_optimization( + weights: dict[str, torch.Tensor]) -> dict[str, torch.Tensor]: + # Rename MLPs to FFNs to match TRTLLM implementation expectation + weights = {k.replace('.mlp.', '.ffn.'): v for k, v in weights.items()} + + # Move all linear attentions to their expected locations + weights = { + k.replace('.attn_replacing_linear.', '.attention.'): v + for k, v in weights.items() + } + + # Move all linear MLPs to their expected locations + weights = { + k.replace('.mlp_replacing_linear.', '.ffn.'): v + for k, v in weights.items() + } + + return weights diff --git a/tensorrt_llm/models/deci/layer_config.py b/tensorrt_llm/models/nemotron_nas/layer_config.py similarity index 100% rename from tensorrt_llm/models/deci/layer_config.py rename to tensorrt_llm/models/nemotron_nas/layer_config.py diff --git a/tensorrt_llm/models/deci/model.py b/tensorrt_llm/models/nemotron_nas/model.py similarity index 67% rename from tensorrt_llm/models/deci/model.py rename to tensorrt_llm/models/nemotron_nas/model.py index b0d0ded0e..a3c3e2388 100644 --- a/tensorrt_llm/models/deci/model.py +++ b/tensorrt_llm/models/nemotron_nas/model.py @@ -16,9 +16,10 @@ from typing import List, Optional, Tuple, Type, Union from tensorrt_llm.bindings import KVCacheType -from tensorrt_llm.functional import (AllReduceFusionParams, AttentionMaskType, - PositionEmbeddingType, Tensor, - gather_last_token_logits, recv, send) +from tensorrt_llm.functional import (AllReduceFusionOp, AllReduceFusionParams, + AttentionMaskType, PositionEmbeddingType, + Tensor, gather_last_token_logits, recv, + send) from tensorrt_llm.layers.attention import (Attention, AttentionParams, KeyValueCacheParams, SpecDecodingParams) @@ -29,16 +30,17 @@ from tensorrt_llm.layers.normalization import RmsNorm from tensorrt_llm.mapping import Mapping from tensorrt_llm.models.convert_utils import has_safetensors -from tensorrt_llm.models.deci.config import DeciConfig -from tensorrt_llm.models.deci.convert import (load_weights_from_hf_model, - load_weights_from_hf_safetensors) from tensorrt_llm.models.modeling_utils import DecoderModelForCausalLM +from tensorrt_llm.models.nemotron_nas.config import DeciConfig +from tensorrt_llm.models.nemotron_nas.convert import ( + load_weights_from_hf_model, load_weights_from_hf_safetensors, + update_weights_following_modelopt_optimization) from tensorrt_llm.module import Module, ModuleList from tensorrt_llm.plugin.plugin import init_all_reduce_helper from ..._common import default_net from ..._utils import pad_vocab_size -from ..modeling_utils import QuantConfig, 
preprocess_weights +from ..modeling_utils import PretrainedConfig, QuantConfig, preprocess_weights @dataclass @@ -123,21 +125,106 @@ def __init__(self, config: DeciConfig, layer_idx: int): self.layer_config = self.config.get_layer_config(self.layer_idx) - layer_type_len = len(config.layer_types) - layer_types = config.layer_types * ((layer_idx + 1) // layer_type_len) - layer_types = layer_types + config.layer_types[0:( - (layer_idx + 1) % layer_type_len)] - - attention_layer_idx = layer_types.count('attention') - 1 - self._init_attention(attention_layer_idx) + self._init_attention() self._init_ffn() - def _init_attention(self, attention_layer_idx) -> None: + @property + def input_layernorm_was_fused(self) -> bool: + """ + The previous layer ran our input_layernorm for us if: + 1. The reduce_fusion plugin is enabled and + 2. We are not the first local model layer and + 3. The previous layer is an MLP layer + """ + return default_net( + ).plugin_config.reduce_fusion and self.local_layer_idx > 0 and self.config.get_layer_config( + self.layer_idx - + 1).is_mlp_layer and self.needs_input_layernorm_fusion + + @property + def needs_input_layernorm_fusion(self) -> bool: + """ + This layer needs the previous layer to perform input_layernorm fusion if: + 1. The reduce_fusion plugin is enabled and + 2. This is not a NOOP attention layer (otherwise it has no input_layernorm) + """ + return default_net( + ).plugin_config.reduce_fusion and not self.layer_config.is_noop_attention_layer + + @property + def can_fuse_post_layernorm(self) -> bool: + """ + This layer can fuse attention and post_layernorm if: + 1. The reduce_fusion plugin is enabled and + 2. It is an attention layer and + 3. It is not a NOOP FFN layer (othrewise it has no post_layernorm) + """ + return default_net( + ).plugin_config.reduce_fusion and self.layer_config.is_attention_layer and not self.layer_config.is_noop_ffn_layer + + @property + def can_fuse_input_layernorm(self) -> bool: + """ + This layer can run the next layer's input_layernorm if: + 1. The reduce_fusion plugin is enable and + 2. It is an MLP layer + """ + return default_net( + ).plugin_config.reduce_fusion and self.layer_config.is_mlp_layer + + def _init_attention(self) -> None: """ Initialize some attention alternative """ # normal attention if self.layer_config.is_attention_layer: + # according to recurrentgemma, len(layer_types) can be less than num_hidden_layers + # in this case, the list should wrap-around + # for example, if layer_types = ["attention", "recurrent", "recurrent"], and we have 5 layers, we get: + # layer 0 ==> attention + # layer 1 ==> recurrent + # layer 2 ==> recurrent + # layer 3 ==> attention + # layer 4 ==> recurrent + # we check which layers are local to our rank + layers_range = self.config.mapping.pp_layers( + self.config.num_hidden_layers) + # then take the size of layer_types in the config + layer_type_len = len(self.config.layer_types) + # collect the layer types of all the local layers + local_layer_types = [ + self.config.layer_types[layer_id % layer_type_len] + for layer_id in layers_range + ] + # and see how many of them are attention layers to determine our local attention layer idx + local_attn_layer_idx = local_layer_types[:self. 
+ local_layer_idx].count( + "attention") + + # Iterate over all local layer configs, getting num_kv_heads of the attention ones + num_kv_heads_per_local_layer = [ + layer_config.attention.num_key_value_heads for layer_config in + [self.config.layer_configs[idx] for idx in layers_range] + if layer_config.is_attention_layer + ] + + # adjust num heads according to tp size + num_kv_heads_per_local_layer = [ + (nheads + self.config.mapping.tp_size - 1) // + self.config.mapping.tp_size + for nheads in num_kv_heads_per_local_layer + ] + nheads_tp = (self.layer_config.attention.num_key_value_heads + + self.config.mapping.tp_size - + 1) // self.config.mapping.tp_size + + # local layers with the same number of kv heads share the same cache pool + # we count how many such layers there are before us to determine our index inside that pool + layer_idx_in_cache_pool = num_kv_heads_per_local_layer[: + local_attn_layer_idx].count( + nheads_tp + ) + self.input_layernorm = RmsNorm( normalized_shape=self.config.hidden_size, eps=self.config.norm_epsilon, @@ -145,7 +232,7 @@ def _init_attention(self, attention_layer_idx) -> None: ) self.attention = Attention( - local_layer_idx=attention_layer_idx, + local_layer_idx=local_attn_layer_idx, hidden_size=self.config.hidden_size, attention_head_size=self.config.head_size, num_attention_heads=self.config.num_attention_heads, @@ -161,7 +248,7 @@ def _init_attention(self, attention_layer_idx) -> None: tp_size=self.config.mapping.tp_size, tp_rank=self.config.mapping.tp_rank, quant_mode=self.config.quant_mode, - ) + layer_idx_in_cache_pool=layer_idx_in_cache_pool) elif self.layer_config.is_noop_attention_layer: self.input_layernorm = NoOpLayerNorm() @@ -238,20 +325,34 @@ def _init_ffn(self) -> None: f"FFN of type {str(self.layer_config.ffn.impl)} is not implemented" ) - def forward( - self, - hidden_states: Tensor, - attention_mask: Optional[Tensor] = None, - use_cache: bool = False, - spec_decoding_params=None, - kv_cache_params: Optional[KeyValueCacheParams] = None, - attention_params: Optional[AttentionParams] = None, - lora_layer_params: Optional[LoraParams] = None, - ): - residual = hidden_states - hidden_states = self.input_layernorm(hidden_states) + def forward(self, + hidden_states: Tensor | Tuple[Tensor, Tensor], + attention_mask: Optional[Tensor] = None, + use_cache: bool = False, + spec_decoding_params=None, + kv_cache_params: Optional[KeyValueCacheParams] = None, + attention_params: Optional[AttentionParams] = None, + lora_layer_params: Optional[LoraParams] = None, + next_layer_input_layernorm_args: Optional[Tuple[Tensor, + float]] = None): + if self.input_layernorm_was_fused: + # previous layer already performed our layer norm + assert isinstance(hidden_states, tuple) + hidden_states, residual = hidden_states + else: + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + + if self.can_fuse_post_layernorm: + reduce_fusion_params = AllReduceFusionParams( + fusion_op=AllReduceFusionOp.RESIDUAL_RMS_NORM, + residual=residual, + norm_weight=self.post_layernorm.weight.value, + eps=self.post_layernorm.eps) + else: + reduce_fusion_params = None - attention_output = self.attention( + attention_output = self._run_attention( hidden_states=hidden_states, attention_mask=attention_mask, use_cache=use_cache, @@ -259,23 +360,92 @@ def forward( kv_cache_params=kv_cache_params, attention_params=attention_params, lora_layer_params=lora_layer_params, - ) + reduce_fusion_params=reduce_fusion_params) if use_cache: attention_output, present_kv = 
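The block above derives two indices: the rank-local attention layer index (counting attention layers before this one on the current pipeline rank, with layer_types wrapping around when it is shorter than the layer count) and the index inside the KV-cache pool shared by layers with the same per-rank KV-head count. A small worked example under assumed settings (one pipeline rank, tp_size = 2):

    layer_types = ["attention", "recurrent", "recurrent"]
    num_hidden_layers = 5
    layers_range = range(num_hidden_layers)          # a single PP rank owns all layers

    local_layer_types = [layer_types[i % len(layer_types)] for i in layers_range]
    # ['attention', 'recurrent', 'recurrent', 'attention', 'recurrent']

    local_layer_idx = 3                              # the second attention layer
    local_attn_layer_idx = local_layer_types[:local_layer_idx].count("attention")
    print(local_attn_layer_idx)                      # 1

    # Cache-pool index: count earlier attention layers with the same per-rank KV heads.
    num_kv_heads_per_local_layer = [8, 8]            # assumed, one entry per attention layer
    tp_size = 2
    per_rank = [(h + tp_size - 1) // tp_size for h in num_kv_heads_per_local_layer]  # [4, 4]
    nheads_tp = per_rank[local_attn_layer_idx]
    layer_idx_in_cache_pool = per_rank[:local_attn_layer_idx].count(nheads_tp)
    print(layer_idx_in_cache_pool)                   # 1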
attention_output else: present_kv = None - hidden_states = residual + attention_output - residual = hidden_states - hidden_states = self.post_layernorm(hidden_states) - hidden_states = self.ffn(hidden_states, - lora_layer_params=lora_layer_params) - hidden_states = residual + hidden_states + if self.can_fuse_post_layernorm: + hidden_states, residual = attention_output + else: + hidden_states = residual + attention_output + residual = hidden_states + hidden_states = self.post_layernorm(hidden_states) + + if next_layer_input_layernorm_args is not None: + assert self.can_fuse_input_layernorm + norm_weight, eps = next_layer_input_layernorm_args + reduce_fusion_params = AllReduceFusionParams( + fusion_op=AllReduceFusionOp.RESIDUAL_RMS_NORM, + residual=residual, + norm_weight=norm_weight, + eps=eps) + hidden_states = self._run_ffn( + hidden_states, + lora_layer_params=lora_layer_params, + reduce_fusion_params=reduce_fusion_params) + + else: + hidden_states = self._run_ffn(hidden_states, + lora_layer_params=lora_layer_params) + hidden_states = residual + hidden_states return DeciLMLayerOutput(hidden_states=hidden_states, present_kv=present_kv) + def _run_attention( + self, + hidden_states: Tensor, + attention_mask: Optional[Tensor] = None, + use_cache: bool = False, + spec_decoding_params=None, + kv_cache_params: Optional[KeyValueCacheParams] = None, + attention_params: Optional[AttentionParams] = None, + lora_layer_params: Optional[LoraParams] = None, + reduce_fusion_params: Optional[AllReduceFusionParams] = None + ) -> Union[Tensor, Tuple[Tensor, None]]: + """ + Ideally, this functionality would be encapsulated in a LinearAttention class, but during + FP8 and lower quantization, our linear classes get overrun by ModelOpt, thus we must + control the attention inputs at the DecoderLayer level. + """ + if self.layer_config.is_linear_attention_layer: + out = self.attention(hidden_states) + return out, None if use_cache else out + else: + if not self.layer_config.is_attention_layer: + assert reduce_fusion_params is None, f"Layer with attention of type {self.layer_config.attention.impl} can't do reduce_fusion" + + return self.attention(hidden_states=hidden_states, + attention_mask=attention_mask, + use_cache=use_cache, + spec_decoding_params=spec_decoding_params, + kv_cache_params=kv_cache_params, + attention_params=attention_params, + lora_layer_params=lora_layer_params, + reduce_fusion_params=reduce_fusion_params) + + def _run_ffn(self, + hidden_states, + lora_layer_params=None, + reduce_fusion_params: Optional[AllReduceFusionParams] = None): + """ + Ideally, this functionality would be encapsulated in a LinearMLP class, but during + FP8 and lower quantization, our linear classes get overrun by ModelOpt, thus we must + control the MLP inputs at the DecoderLayer level. 
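With reduce_fusion enabled, the all-reduce that finishes attention (or the FFN) also performs the residual add and the following RMSNorm, returning a (normed, residual) pair so the next sub-layer can skip its own layernorm. A rough, communication-free reference of what that fused epilogue computes:

    import torch

    def residual_rmsnorm_epilogue(partial_out, residual, norm_weight, eps):
        # The all-reduce would sum partial_out across TP ranks first; assume that is done.
        new_residual = residual + partial_out
        variance = new_residual.float().pow(2).mean(-1, keepdim=True)
        normed = new_residual.float() * torch.rsqrt(variance + eps)
        return (normed * norm_weight).to(partial_out.dtype), new_residual

    h = torch.randn(2, 8).half()
    r = torch.randn(2, 8).half()
    w = torch.ones(8)
    normed, new_residual = residual_rmsnorm_epilogue(h, r, w, eps=1e-5)
    print(normed.shape, new_residual.shape)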
+ """ + if reduce_fusion_params is not None: + assert self.layer_config.is_mlp_layer, f"Layer with FFN of type {self.layer_config.ffn.impl} can't do reduce_fusion" + + if self.layer_config.is_linear_ffn_layer: + return self.ffn(hidden_states) + else: + return self.ffn(hidden_states, + lora_layer_params=lora_layer_params, + reduce_fusion_params=reduce_fusion_params) + class DeciLMDecoderLayerList(ModuleList): @@ -311,6 +481,17 @@ def forward( past_key_values = [x for x in pkv_iter] for layer_idx, (layer, past) in enumerate(zip(self, past_key_values)): + next_layer_input_layernorm_args = None + if default_net().plugin_config.reduce_fusion: + if layer_idx < self.layer_list[-1]: + # this is not the last layer + next_layer = self[layer_idx + 1] + if layer.can_fuse_input_layernorm and next_layer.needs_input_layernorm_fusion: + # this layer can fuse the next layer's input_layernorm + next_layer_input_layernorm_args = ( + next_layer.input_layernorm.weight.value, + next_layer.input_layernorm.eps) + layer_out = layer( hidden_states=hidden_states, attention_mask=attention_mask, @@ -329,13 +510,16 @@ def forward( host_kv_cache_block_offsets, host_kv_cache_pool_pointers=kv_cache_params. host_kv_cache_pool_pointers, + host_kv_cache_pool_mapping=kv_cache_params. + host_kv_cache_pool_mapping, cache_indirection=kv_cache_params.cache_indirection, ), spec_decoding_params=spec_decoding_params, use_cache=use_cache, lora_layer_params=lora_params.get_layer_config(layer_idx) if lora_params is not None - and lora_params.lora_ranks is not None else None) + and lora_params.lora_ranks is not None else None, + next_layer_input_layernorm_args=next_layer_input_layernorm_args) hidden_states = layer_out.hidden_states if use_cache and layer_out.present_kv is not None: @@ -511,6 +695,19 @@ def from_hugging_face(cls, model.load(weights) return model + @classmethod + def from_checkpoint(cls, + ckpt_dir: str, + rank: Optional[int] = None, + config: Optional["PretrainedConfig"] = None): + return super().from_checkpoint( + ckpt_dir, + rank, + config, + preprocess_weights_hook= + update_weights_following_modelopt_optimization, + ) + def forward( self, input_ids: Tensor, @@ -605,7 +802,6 @@ def prepare_attention_inputs( attn_layer_idx.append(layer_idx) num_kv_heads_per_layer.append( layer_config.attention.num_key_value_heads) - num_layers = len(attn_layer_idx) attention_inputs = super().prepare_attention_inputs( max_batch_size=max_batch_size, @@ -628,16 +824,4 @@ def prepare_attention_inputs( opt_batch_size=opt_batch_size, num_kv_heads_per_layer=num_kv_heads_per_layer) - kv_idx = 0 - past_key_value = [] - for i in range(self.config.num_hidden_layers): - layer_config = self.config.get_layer_config(i) - if layer_config.is_attention_layer: - past_key_value.append( - attention_inputs['past_key_value'][kv_idx]) - kv_idx += 1 - else: - past_key_value.append(None) - attention_inputs['past_key_value'] = past_key_value - return attention_inputs diff --git a/tensorrt_llm/models/phi3/config.py b/tensorrt_llm/models/phi3/config.py index 601603fd2..558196930 100644 --- a/tensorrt_llm/models/phi3/config.py +++ b/tensorrt_llm/models/phi3/config.py @@ -18,6 +18,7 @@ import torch from ..._utils import torch_dtype_to_str +from ...layers import MoeConfig from ...logger import logger from ...mapping import Mapping from ..modeling_utils import PretrainedConfig, QuantConfig @@ -103,9 +104,23 @@ def from_hugging_face( hf_config, "blocksparse_vert_stride", None) kwargs['dense_attention_every_n_layers'] = getattr( hf_config, 
"dense_attention_every_n_layers", None) + kwargs['norm_epsilon'] = hf_config.layer_norm_epsilon else: kwargs['rotary_base'] = hf_config.rope_theta kwargs['norm_epsilon'] = hf_config.rms_norm_eps + moe_variant = hf_config.architectures[0] == "PhiMoEForCausalLM" + if moe_variant: + kwargs.update({ + 'moe': { + 'num_experts': hf_config.num_local_experts, + 'top_k': hf_config.num_experts_per_tok, + 'normalization_mode': + MoeConfig.ExpertScaleNormalizationMode.SPARSE_MIXER, + 'sparse_mixer_epsilon': hf_config.router_jitter_noise, + }, + 'attention_bias': hf_config.attention_bias + }) + kwargs['position_embedding_type'] = 'rope_gpt_neox' if hf_config.max_position_embeddings >= 128000: kwargs[ @@ -115,7 +130,7 @@ def from_hugging_face( "short_factor"] kwargs['longrope_scaling_long_factors'] = hf_config.rope_scaling[ "long_factor"] - if small_variant: + if small_variant or moe_variant: kwargs['longrope_long_mscale'] = hf_config.rope_scaling[ "long_mscale"] kwargs['longrope_short_mscale'] = hf_config.rope_scaling[ diff --git a/tensorrt_llm/models/phi3/convert.py b/tensorrt_llm/models/phi3/convert.py index 9ee6821db..5a2bf59ec 100644 --- a/tensorrt_llm/models/phi3/convert.py +++ b/tensorrt_llm/models/phi3/convert.py @@ -34,6 +34,12 @@ def load_weights_from_hf_model(hf_model, config): key = key.replace("mlp.down_proj.", "mlp.proj.") #128k key = key.replace("mlp.gate_proj.", "mlp.fc.") #128k key = key.replace("o_proj.", "dense.") #128k + + #MoE + key = key.replace("block_sparse_moe.gate", "mlp.router") + key = key.replace("block_sparse_moe.experts.0.w3", "mlp.fc") + key = key.replace("block_sparse_moe.experts.0.w2", "mlp.proj") + #Layer norm key = key.replace("post_attention_layernorm.", "post_layernorm.") #128k @@ -54,16 +60,44 @@ def load_weights_from_hf_model(hf_model, config): # Swap the halves value = torch.cat((second_half, first_half), dim=0) + if config.architecture == "PhiMoEForCausalLM": + num_experts = config.moe["num_experts"] + mlp_hidden_size = config.intermediate_size + num_hidden = config.hidden_size + rank_experts = list(range(num_experts)) + if config.mapping.has_moe_ep(): + rank_experts = config.mapping.ep_experts(num_experts) + + def get_moe_weight(key, suffix): + param = [] + for expert in rank_experts: + name = key.replace(f"0.{suffix}", f"{expert}.{suffix}") + fc_value = hf_state_dict[name] + param.append(fc_value) + w = torch.stack(param) + return w.reshape(-1, mlp_hidden_size, num_hidden) + + if ".0.w3" in orig_key: + w3 = get_moe_weight(orig_key, 'w3') + w1 = get_moe_weight(orig_key.replace("w3", "w1"), 'w1') + value = torch.concat([w3, w1], dim=-2) + elif ".0.w2" in orig_key: + w2 = get_moe_weight(orig_key, 'w2') + value = w2.reshape(-1, num_hidden, mlp_hidden_size) + elif any([k in orig_key for k in ["w1", "w2", "w3"]]): + continue + if "q_proj" in key: #128k q_param = value k_param = hf_state_dict[orig_key.replace("q_proj", "k_proj")] v_param = hf_state_dict[orig_key.replace("q_proj", "v_proj")] value = torch.cat([q_param, k_param, v_param], dim=0) - key = key.replace("q_proj.weight", "qkv.weight") + key = key.replace("q_proj", "qkv") elif "k_proj" in key or "v_proj" in key: continue - weights[key] = value.to(torch_dtype).cpu() + dtype = torch.float if "router" in key else torch_dtype + weights[key] = value.to(dtype).cpu() if config.architecture == 'Phi3SmallForCausalLM': weights['lm_head.weight'] = weights[ @@ -74,6 +108,8 @@ def load_weights_from_hf_model(hf_model, config): if "qkv." 
in key: weights[key] = shuffle_qkv_weights(weights[key], config) + if config.architecture in ['Phi3SmallForCausalLM', "PhiMoEForCausalLM" + ] and config.mapping.has_tp(): weights = split_weights_tp(config, weights, torch_dtype) return weights diff --git a/tensorrt_llm/models/phi3/model.py b/tensorrt_llm/models/phi3/model.py index 82abec886..ac29ab9a0 100644 --- a/tensorrt_llm/models/phi3/model.py +++ b/tensorrt_llm/models/phi3/model.py @@ -5,8 +5,9 @@ from ..._utils import pad_vocab_size from ...functional import PositionEmbeddingType, Tensor -from ...layers import (MLP, Attention, AttentionMaskType, BlockSparseAttnParams, - ColumnLinear, Embedding, LayerNorm, RmsNorm) +from ...layers import (MLP, MOE, Attention, AttentionMaskType, + BlockSparseAttnParams, ColumnLinear, Embedding, + LayerNorm, MoeConfig, RmsNorm) from ...lora_manager import LoraConfig, use_lora from ...mapping import Mapping from ...module import Module @@ -31,6 +32,7 @@ def __init__(self, config: PretrainedConfig, layer_idx: int): self.gegelu_limit = None self.small_variant = config.architecture == "Phi3SmallForCausalLM" + self.moe_variant = config.architecture == "PhiMoEForCausalLM" if self.small_variant: self.gegelu_limit = config.gegelu_limit @@ -51,10 +53,14 @@ def __init__(self, config: PretrainedConfig, layer_idx: int): config.blocksparse_num_local_blocks, config.blocksparse_vertical_stride) + if self.small_variant or self.moe_variant: self.input_layernorm = LayerNorm( - normalized_shape=config.hidden_size, dtype=config.dtype) + normalized_shape=config.hidden_size, + dtype=config.dtype, + eps=config.norm_epsilon) self.post_layernorm = LayerNorm(normalized_shape=config.hidden_size, - dtype=config.dtype) + dtype=config.dtype, + eps=config.norm_epsilon) else: self.input_layernorm = RmsNorm(normalized_shape=config.hidden_size, eps=config.norm_epsilon, @@ -80,7 +86,7 @@ def __init__(self, config: PretrainedConfig, layer_idx: int): original_max_position_embeddings = config.original_max_position_embeddings position_embedding_type = PositionEmbeddingType.long_rope - if self.small_variant: + if self.small_variant or self.moe_variant: rope_scaling_short_mscale = config.longrope_short_mscale rope_scaling_long_mscale = config.longrope_long_mscale @@ -94,7 +100,7 @@ def __init__(self, config: PretrainedConfig, layer_idx: int): max_position_embeddings=config.max_position_embeddings, dtype=config.dtype, attention_mask_type=attention_mask_type, - bias=self.small_variant, + bias=self.small_variant or self.moe_variant, q_scaling=q_scaling, tp_group=tp_group, tp_size=tp_size, @@ -106,14 +112,27 @@ def __init__(self, config: PretrainedConfig, layer_idx: int): original_max_position_embeddings=original_max_position_embeddings, block_sparse_params=block_sparse_attn_params) - self.mlp = MLP(hidden_size=config.hidden_size, - ffn_hidden_size=config.intermediate_size, - hidden_act=config.hidden_act, - dtype=config.dtype, - tp_group=tp_group, - tp_size=tp_size, - quant_mode=config.quant_mode, - bias=self.small_variant) + ClsMLP = MLP + mlp_kwargs = {} + if hasattr(config, "moe"): + ClsMLP = MOE + moe_config = MoeConfig() + for key, value in config.moe.items(): + setattr(moe_config, key, value) + mlp_kwargs = { + "moe_config": moe_config, + "mapping": config.mapping, + } + + self.mlp = ClsMLP(hidden_size=config.hidden_size, + ffn_hidden_size=config.intermediate_size, + hidden_act=config.hidden_act, + dtype=config.dtype, + tp_group=tp_group, + tp_size=tp_size, + quant_mode=config.quant_mode, + bias=self.small_variant, + **mlp_kwargs) def 
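The MoE variant reuses the generic MOE layer by copying the config.moe dict onto a MoeConfig attribute by attribute. A hedged sketch of that dict-to-config step with a stand-in config class; the real MoeConfig lives in tensorrt_llm.layers and uses an enum for the normalization mode:

    class ToyMoeConfig:
        num_experts = 0
        top_k = 0
        normalization_mode = None
        sparse_mixer_epsilon = 0.0

    moe_dict = {
        'num_experts': 16,                       # stand-in values
        'top_k': 2,
        'normalization_mode': 'SPARSE_MIXER',    # an enum member in the real config
        'sparse_mixer_epsilon': 0.01,
    }

    moe_config = ToyMoeConfig()
    for key, value in moe_dict.items():
        setattr(moe_config, key, value)

    print(moe_config.num_experts, moe_config.top_k)   # 16 2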
forward( self, @@ -141,10 +160,14 @@ def forward( post_attention_input = hidden_states + attention_output post_attention_output = self.post_layernorm(post_attention_input) - feed_forward_hidden_states = self.mlp( - post_attention_output, - gegelu_limit=self.gegelu_limit, - lora_layer_params=lora_layer_params) + if self.small_variant: + feed_forward_hidden_states = self.mlp( + post_attention_output, + gegelu_limit=self.gegelu_limit, + lora_layer_params=lora_layer_params) + else: + feed_forward_hidden_states = self.mlp( + post_attention_output, lora_layer_params=lora_layer_params) hidden_states = post_attention_input + feed_forward_hidden_states if use_cache: return (hidden_states, presents) @@ -161,10 +184,13 @@ def __init__(self, config: PretrainedConfig): self.layers = DecoderLayerList(Phi3DecoderLayer, config) self.small_variant = config.architecture == "Phi3SmallForCausalLM" - if self.small_variant: + self.moe_variant = config.architecture == "PhiMoEForCausalLM" + if self.small_variant or self.moe_variant: self.ln_f = LayerNorm(normalized_shape=config.hidden_size, + eps=config.norm_epsilon, dtype=config.dtype) - self.mup_embedding_multiplier = config.mup_embedding_multiplier + if self.small_variant: + self.mup_embedding_multiplier = config.mup_embedding_multiplier else: self.ln_f = RmsNorm(normalized_shape=config.hidden_size, eps=config.norm_epsilon, @@ -216,9 +242,10 @@ def __init__(self, config: PretrainedConfig): vocab_size_padded = pad_vocab_size(config.vocab_size, config.mapping.tp_size) + self.moe_variant = config.architecture == "PhiMoEForCausalLM" lm_head = ColumnLinear(config.hidden_size, vocab_size_padded, - bias=False, + bias=self.moe_variant, dtype=config.dtype, tp_group=config.mapping.tp_group, tp_size=config.mapping.tp_size, diff --git a/tensorrt_llm/models/phi3/split_weights.py b/tensorrt_llm/models/phi3/split_weights.py index fcc4d735b..62a889123 100644 --- a/tensorrt_llm/models/phi3/split_weights.py +++ b/tensorrt_llm/models/phi3/split_weights.py @@ -15,8 +15,8 @@ import torch -from tensorrt_llm.models.convert_utils import (get_weight_and_bias, split, - split_matrix_tp, +from tensorrt_llm.models.convert_utils import (get_weight, get_weight_and_bias, + split, split_matrix_tp, split_qkv_bias_tp, split_qkv_tp) from ..._utils import pad_vocab_size @@ -110,10 +110,13 @@ def split_weights_tp(config, weights, dtype): num_heads = config.num_attention_heads num_kv_heads = config.num_key_value_heads hidden_size = config.hidden_size + moe_variant = config.architecture == "PhiMoEForCausalLM" mha_mode = num_heads == num_kv_heads tp_size = config.mapping.tp_size rank = config.mapping.tp_rank + moe_tp_size = config.mapping.moe_tp_size + moe_tp_rank = config.mapping.moe_tp_rank use_weight_only = config.quant_mode.is_weight_only() plugin_weight_only_quant_type = None if use_weight_only and config.quant_mode.is_int8_weight_only() == 'int8': @@ -121,8 +124,7 @@ def split_weights_tp(config, weights, dtype): elif use_weight_only and config.quant_mode.is_int4_weight_only() == 'int4': plugin_weight_only_quant_type = torch.quint4x2 - # Helper - def get_weight(weight, prefix, bias): + def get_quant_weight(weight, prefix, bias): return get_tllm_linear_weight(weight, prefix, bias, use_weight_only, plugin_weight_only_quant_type) @@ -156,25 +158,43 @@ def get_weight(weight, prefix, bias): split_bias = split_qkv_bias_tp(qkv_bias, num_heads, hidden_size, tp_size, rank) - weights.update(get_weight(split_weight, prefix, split_bias)) + weights.update(get_quant_weight(split_weight, prefix, split_bias)) 
prefix = layer_prefix + 'attention.dense' attn_dense_weight, attn_dense_bias = get_weight_and_bias( weights, prefix, dtype) split_v = split_matrix_tp(attn_dense_weight, tp_size, rank, dim=1) - weights.update(get_weight(split_v, prefix, attn_dense_bias)) + weights.update(get_quant_weight(split_v, prefix, attn_dense_bias)) prefix = layer_prefix + 'mlp.fc' - mlp_fc_weight, mlp_fc_bias = get_weight_and_bias(weights, prefix, dtype) - split_v = split_matrix_tp(mlp_fc_weight, tp_size, rank, dim=0) - bias = split_matrix_tp(mlp_fc_bias, tp_size, rank, dim=0) - weights.update(get_weight(split_v, prefix, bias)) + if not moe_variant: + mlp_fc_weight, mlp_fc_bias = get_weight_and_bias( + weights, prefix, dtype) + split_v = split_matrix_tp(mlp_fc_weight, tp_size, rank, dim=0) + bias = split_matrix_tp(mlp_fc_bias, tp_size, rank, dim=0) + weights.update(get_quant_weight(split_v, prefix, bias)) + else: + mlp_fc_weight = get_weight(weights, prefix, dtype) + w3 = split_matrix_tp(mlp_fc_weight, 2, 0, dim=1) + split_w3 = split_matrix_tp(w3, moe_tp_size, moe_tp_rank, dim=1) + w1 = split_matrix_tp(mlp_fc_weight, 2, 1, dim=1) + split_w1 = split_matrix_tp(w1, moe_tp_size, moe_tp_rank, dim=1) + split_v = torch.concat([split_w3, split_w1], dim=-2) + weights.update(get_quant_weight(split_v, prefix, None)) prefix = layer_prefix + 'mlp.proj' - mlp_proj_weight, mlp_proj_bias = get_weight_and_bias( - weights, prefix, dtype) - split_v = split_matrix_tp(mlp_proj_weight, tp_size, rank, dim=1) - weights.update(get_weight(split_v, prefix, mlp_proj_bias)) + if not moe_variant: + mlp_proj_weight, mlp_proj_bias = get_weight_and_bias( + weights, prefix, dtype) + split_v = split_matrix_tp(mlp_proj_weight, tp_size, rank, dim=1) + weights.update(get_quant_weight(split_v, prefix, mlp_proj_bias)) + else: + mlp_proj_weight = get_weight(weights, prefix, dtype) + split_v = split_matrix_tp(mlp_proj_weight, + moe_tp_size, + moe_tp_rank, + dim=2) + weights.update(get_quant_weight(split_v, prefix, None)) weights['transformer.vocab_embedding.weight'] = split_embedding( weights['transformer.vocab_embedding.weight'], tp_size, rank) @@ -182,5 +202,10 @@ def get_weight(weight, prefix, bias): tp_size, rank, dim=0) + if moe_variant: + weights['lm_head.bias'] = split_matrix_tp(weights['lm_head.bias'], + tp_size, + rank, + dim=0) return weights diff --git a/tensorrt_llm/models/qwen/model.py b/tensorrt_llm/models/qwen/model.py index 2eec6e62d..c3dd5b305 100644 --- a/tensorrt_llm/models/qwen/model.py +++ b/tensorrt_llm/models/qwen/model.py @@ -25,6 +25,7 @@ from ...layers import (MLP, MOE, Attention, AttentionMaskType, ColumnLinear, Embedding, GatedMLP, RmsNorm, RowLinear) from ...layers.moe import MOEWeightWrapper +from ...logger import logger from ...lora_manager import (LoraConfig, get_default_trtllm_modules_to_hf_modules, use_lora) from ...mapping import Mapping @@ -428,12 +429,18 @@ def concat_gate_up_proj(weights): else: if not use_preloading: hf_model = load_hf_qwen(hf_model_dir, load_model_on_cpu) + + logger.debug(f"HuggingFace model: {hf_model}") + + model = QWenForCausalLM(config) + + logger.debug(f"TensorRT-LLM model: {model}") + if use_hf_gptq_checkpoint: weights = load_weights_from_hf_gptq_model(hf_model, config) else: weights = load_weights_from_hf_model(hf_model, config) check_share_embedding(weights, config) - model = QWenForCausalLM(config) model.load(weights) return model diff --git a/tensorrt_llm/models/recurrentgemma/model.py b/tensorrt_llm/models/recurrentgemma/model.py index d555fc5c3..e0cbe77b1 100644 --- 
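Tensor-parallel splitting of the MoE fused gate/up weight keeps the w3/w1 halves separate: each half is split across the MoE TP ranks and the rank-local halves are concatenated back together, while the down projection is split along its last dimension. A shape-level sketch using torch.chunk as a stand-in for split_matrix_tp:

    import torch

    num_experts, mlp_hidden, hidden = 4, 32, 16
    moe_tp_size, moe_tp_rank = 2, 0

    fused_fc = torch.randn(num_experts, 2 * mlp_hidden, hidden)

    w3, w1 = torch.chunk(fused_fc, 2, dim=1)                        # (4, 32, 16) each
    split_w3 = torch.chunk(w3, moe_tp_size, dim=1)[moe_tp_rank]     # (4, 16, 16)
    split_w1 = torch.chunk(w1, moe_tp_size, dim=1)[moe_tp_rank]     # (4, 16, 16)
    rank_fc = torch.concat([split_w3, split_w1], dim=-2)            # (4, 32, 16)

    proj = torch.randn(num_experts, hidden, mlp_hidden)
    rank_proj = torch.chunk(proj, moe_tp_size, dim=2)[moe_tp_rank]  # (4, 16, 16)
    print(rank_fc.shape, rank_proj.shape)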
a/tensorrt_llm/models/recurrentgemma/model.py +++ b/tensorrt_llm/models/recurrentgemma/model.py @@ -57,6 +57,7 @@ def __init__(self, config: PretrainedConfig, layer_idx: int): layer_types = layer_types + config.layer_types[0:( (layer_idx + 1) % layer_type_len)] attention_layer_idx = layer_types.count('attention') - 1 + self.attention = Attention( local_layer_idx=attention_layer_idx, hidden_size=config.hidden_size, @@ -209,6 +210,8 @@ def forward(self, host_kv_cache_block_offsets, host_kv_cache_pool_pointers=kv_cache_params. host_kv_cache_pool_pointers, + host_kv_cache_pool_mapping=kv_cache_params. + host_kv_cache_pool_mapping, cache_indirection=kv_cache_params.cache_indirection), attention_params=attention_params, conv_state=past_conv, @@ -499,7 +502,6 @@ def prepare_inputs( mapping, num_profiles) # attention inputs - num_attention_layers = self.layer_types.count('attention') attn_layer_idx = [] for i in range(self.config.num_hidden_layers): if self.layer_types[i] == 'attention': @@ -511,7 +513,7 @@ def prepare_inputs( max_seq_len=max_seq_len, num_kv_heads=self.config.num_key_value_heads, head_size=self.config.head_size, - num_layers=num_attention_layers, + num_layers=self.config.num_hidden_layers, kv_dtype=str_dtype_to_trt(self.config.kv_dtype), num_profiles=num_profiles, enable_ctx_gen_opt_profiles=enable_ctx_gen_opt_profiles, @@ -523,17 +525,6 @@ def prepare_inputs( streamingllm=streamingllm, attn_layer_idx=attn_layer_idx) - kv_idx = 0 - past_key_value = [] - for i in range(self.config.num_hidden_layers): - if self.layer_types[i] == 'attention' and not paged_kv_cache: - past_key_value.append( - attention_inputs['past_key_value'][kv_idx]) - kv_idx += 1 - else: - past_key_value.append(None) - attention_inputs['past_key_value'] = past_key_value - # recurrent inputs recurrent_inputs = self.prepare_recurrent_inputs( max_batch_size=max_batch_size, @@ -601,6 +592,8 @@ def prepare_inputs( 'host_kv_cache_block_offsets'], host_kv_cache_pool_pointers=attention_inputs[ 'host_kv_cache_pool_pointers'], + host_kv_cache_pool_mapping=attention_inputs[ + 'host_kv_cache_pool_mapping'], cache_indirection=attention_inputs['cache_indirection'], ), 'attention_params': diff --git a/tensorrt_llm/models/redrafter/redrafter_helper.py b/tensorrt_llm/models/redrafter/redrafter_helper.py index 9604f40af..4cbd0b05c 100644 --- a/tensorrt_llm/models/redrafter/redrafter_helper.py +++ b/tensorrt_llm/models/redrafter/redrafter_helper.py @@ -1,3 +1,4 @@ +import warnings from typing import Tuple import numpy as np @@ -11,7 +12,7 @@ div, eq, exp, expand, expand_dims, floordiv, gather, gather_nd, index_select, int32_array, log_softmax, lt, max, maximum, masked_select, minimum, nonzero, not_op, op_and, rand, relu, scatter, select, shape, slice, - softmax, squeeze, stack, sum, topk, transpose, unsqueeze, view, where) + silu, softmax, squeeze, stack, sum, topk, transpose, unsqueeze, view, where) # isort: on from tensorrt_llm.layers import Embedding from tensorrt_llm.module import Module @@ -358,9 +359,133 @@ def _unflatten_decoding_dim(x: Tensor, num_beams: int) -> Tensor: return x -def _beam_search_candidates(x: Tensor, init_token: Tensor, embedding: Embedding, - drafter: Module, num_beams: int, beam_length: int, +def _beam_search_candidates(prompt_state: Tensor, init_token: Tensor, + embedding: Embedding, drafter: Module, + num_beams: int, beam_length: int, is_rnn: bool) -> Tuple[Tensor, Tensor]: + """ + This version of beam search matches with ReDrafter GitHub version as of 10/02/2024. 
+ Link: https://github.com/apple/ml-recurrent-drafter/releases/tag/v1.1 + """ + + LOG_0 = -50000.0 + LOG_1 = 0.0 + + def maintain_logits(logits: Tensor) -> Tensor: + max_logits = max(logits, -1, keepdim=True) + max_logits = expand(max_logits, + shape(logits, cast_to_dtype=INT_DTYPE_STR)) + return logits - max_logits + + def warp_logits(logits: Tensor, + top_k: int = 50, + mask_value: float = LOG_0) -> Tensor: + top_k = minimum(top_k, shape(logits, + dim=-1, + cast_to_dtype=INT_DTYPE_STR)) + top_values, _ = topk(logits, k=top_k, dim=-1) # [bs, nb, top_k] + starts = concat([0, 0, top_k - 1]) + sizes = concat([shape(logits, 0), shape(logits, 1), 1]) + lt_mask = logits < slice(top_values, starts=starts, sizes=sizes) + logits = where(lt_mask, + constant_to_tensor_(mask_value, dtype=logits.dtype), + logits) + return logits + + def compute_logits(x: Tensor) -> Tensor: + """ + x: [bs, nb, 2*H] + """ + logits = drafter(x) # [bs, nb, 2*H] => [bs, nb, V] + logits = maintain_logits(logits) # [bs, nb, V] + logits = warp_logits(logits) # [bs, nb, V] + return logits + + assert prompt_state.ndim() == 2 + assert init_token.ndim() == 1 + assert beam_length > 1 + batch_size = shape(prompt_state, 0, INT_DTYPE_STR) + vocab_size = embedding.num_embeddings + dtype = prompt_state.dtype + + log_p_beam = expand( + unsqueeze( + constant( + numpy_array([LOG_1] + [LOG_0] * (num_beams - 1), + trt_dtype=dtype)), 0), # [1, nb] + concat([batch_size, num_beams])) # [bs, nb] + context = _add_decoding_dim(prompt_state, num_beams) # [bs, nb, H] + if init_token.ndim() == 1: + init_token = unsqueeze(init_token, -1) # [bs] => [bs, 1] + beams = _add_decoding_dim(init_token, num_beams) # [bs, nb, 1] + + last_tokens = squeeze(beams, -1) # [bs, nb] + state_shape = shape(context, cast_to_dtype=INT_DTYPE_STR) # [bs, nb, H] + state = expand(expand_dims(constant_to_tensor_(0.0, dtype=dtype), [0, 1]), + state_shape) # [bs, nb, H] + logits_token_in_beam = None + candidate_length = beam_length - 1 + for _ in range(candidate_length): + state = ( + silu(drafter.rnn_w(embedding(last_tokens)) + + drafter.rnn_u(state)) if is_rnn else embedding(last_tokens) + + state) # [bs, nb, H] + + logits_new_token = compute_logits(concat([context, state], + -1)) # [bs, nb, V] + log_p_new_token = log_softmax(logits_new_token, -1) # [bs, nb, V] + + log_p_beam_new_token = log_p_new_token + unsqueeze(log_p_beam, + 2) # [bs, nb, V] + + tokens_times_beams = view(log_p_beam_new_token, + concat([batch_size, num_beams * vocab_size + ])) # [bs, nb*V] + log_p_beam, topk_indices = topk(tokens_times_beams, k=num_beams, + dim=-1) # [bs, nb] + top_beam_indices = topk_indices // vocab_size # [bs, nb] + # Avoid repeated division for: top_token_ids = topk_indices % vocab_size + top_token_ids = topk_indices - (top_beam_indices * vocab_size + ) # [bs, nb] + + # get the common indices to gather beams + gather_indices = _get_indices_for_gather_beams(batch_size, + top_beam_indices, + num_beams) + + # update running beams, state, logits, and last_tokens + prev_top_beams = _gather_beams(beams, gather_indices, batch_size, + num_beams) # [bs, nb] OR [bs, nb, 1+i] + if prev_top_beams.ndim() == 2: + prev_top_beams = unsqueeze(prev_top_beams, -1) # [bs, nb, 1] + new_tokens = unsqueeze(top_token_ids, -1) # [bs, nb, 1] + beams = concat([prev_top_beams, new_tokens], dim=-1) # [bs, nb, 1+i+1] + + state = _gather_beams(state, gather_indices, batch_size, + num_beams) # [bs, nb, H] + + cur_logits_token_in_beam = unsqueeze( + _gather_beams(logits_new_token, gather_indices, batch_size, + 
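The candidate update flattens the [batch, num_beams, vocab] scores into [batch, num_beams * vocab] before a single top-k, then recovers the source beam and token id from each flat index; the modulo is rewritten as a subtraction to avoid a second division. A small numeric check of that decomposition:

    import torch

    num_beams, vocab_size = 3, 10
    scores = torch.randn(1, num_beams, vocab_size)

    flat = scores.view(1, num_beams * vocab_size)
    _, topk_indices = torch.topk(flat, k=num_beams, dim=-1)

    top_beam_indices = topk_indices // vocab_size
    top_token_ids = topk_indices - top_beam_indices * vocab_size    # same as % vocab_size

    assert torch.equal(top_token_ids, topk_indices % vocab_size)
    print(top_beam_indices, top_token_ids)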
num_beams), 2) # [bs, nb, 1, V] + if logits_token_in_beam is None: # first iteration + logits_token_in_beam = cur_logits_token_in_beam + else: + logits_token_in_beam = concat( + [ + _gather_beams(logits_token_in_beam, gather_indices, + batch_size, + num_beams), # prev_top_logits [bs, nb, i, V] + cur_logits_token_in_beam + ], + dim=2) # [bs, nb, i+1, V] + last_tokens = top_token_ids # [bs, nb] + return beams, logits_token_in_beam + + +def _beam_search_candidates_v0(x: Tensor, init_token: Tensor, + embedding: Embedding, drafter: Module, + num_beams: int, beam_length: int, + is_rnn: bool) -> Tuple[Tensor, Tensor]: ''' x: [bs, H] init_token: [bs] @@ -372,6 +497,9 @@ def _beam_search_candidates(x: Tensor, init_token: Tensor, embedding: Embedding, draft_probs: (batch, num_beams, beam_length - 1, vocab_size) Probabilities for the draft_tokens. ''' + warnings.warn( + "This version of beam search is deprecated and will be removed in the future." + ) NEG_INF = -50000.0 batch_size = shape(x, 0, INT_DTYPE_STR) vocab_size = embedding.num_embeddings @@ -408,7 +536,7 @@ def _beam_search_candidates(x: Tensor, init_token: Tensor, embedding: Embedding, h)) # [bs, nb, 2H] => [bs*nb, 2H] => [bs*nb, V] new_flat_log_probs = log_softmax(new_flat_logits, dim=-1) # [bs*nb, V] - # compute probabilties and flatten the beams for topk + # compute probabilities and flatten the beams for topk candidate_log_probs = _unflatten_decoding_dim( new_flat_log_probs, num_beams) # [bs*nb, V] => [bs, nb, V] log_probs = candidate_log_probs + unsqueeze(scores, 2) # [bs, nb, V] diff --git a/tensorrt_llm/module.py b/tensorrt_llm/module.py index 96575c0d8..106974973 100644 --- a/tensorrt_llm/module.py +++ b/tensorrt_llm/module.py @@ -18,6 +18,18 @@ from .logger import logger +def _addindent(s_, numSpaces): + s = s_.split('\n') + # don't do anything for single-line stuff + if len(s) == 1: + return s_ + first = s.pop(0) + s = [(numSpaces * ' ') + line for line in s] + s = '\n'.join(s) + s = first + '\n' + s + return s + + class Module(object): def __init__(self) -> None: @@ -191,6 +203,23 @@ def update_parameters(self, torch_module): for k, v in self.named_parameters(): v.value = tm[k].detach().cpu().numpy() + def _get_name(self): + return self.__class__.__name__ + + def __repr__(self): + # We treat the extra repr like the sub-module, one item per line + child_lines = [] + for key, module in self._modules.items(): + mod_str = repr(module) + mod_str = _addindent(mod_str, 2) + child_lines.append('(' + key + '): ' + mod_str) + main_str = self._get_name() + '(' + if child_lines: + # simple one-liner info, which most builtin Modules will use + main_str += '\n ' + '\n '.join(child_lines) + '\n' + main_str += ')' + return main_str + class ModuleList(Module): @@ -221,3 +250,35 @@ def __setitem__(self, idx, module) -> None: def __len__(self): return len(self._modules) + + def __repr__(self): + """Return a custom repr for ModuleList that compresses repeated module representations.""" + list_of_reprs = [repr(item) for item in self] + if len(list_of_reprs) == 0: + return self._get_name() + "()" + + start_end_indices = [[0, 0]] + repeated_blocks = [list_of_reprs[0]] + for i, r in enumerate(list_of_reprs[1:], 1): + if r == repeated_blocks[-1]: + start_end_indices[-1][1] += 1 + continue + + start_end_indices.append([i, i]) + repeated_blocks.append(r) + + lines = [] + main_str = self._get_name() + "(" + for (start_id, end_id), b in zip(start_end_indices, repeated_blocks): + local_repr = f"({start_id}): {b}" # default repr + + if start_id != end_id: + n 
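The new ModuleList.__repr__ collapses runs of identical child representations into a single "(<start>-<end>): <count> x <repr>" line, which keeps deep decoder stacks readable. The same compression applied to a plain list of strings, for illustration:

    def compress(reprs):
        spans = [[0, 0]]
        blocks = [reprs[0]]
        for i, r in enumerate(reprs[1:], 1):
            if r == blocks[-1]:
                spans[-1][1] = i
            else:
                spans.append([i, i])
                blocks.append(r)
        lines = []
        for (start, end), b in zip(spans, blocks):
            if start == end:
                lines.append(f"({start}): {b}")
            else:
                lines.append(f"({start}-{end}): {end - start + 1} x {b}")
        return lines

    print(compress(['DecoderLayer()'] * 32 + ['LMHead()']))
    # ['(0-31): 32 x DecoderLayer()', '(32): LMHead()']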
= end_id - start_id + 1 + local_repr = f"({start_id}-{end_id}): {n} x {b}" + + local_repr = _addindent(local_repr, 2) + lines.append(local_repr) + + main_str += "\n " + "\n ".join(lines) + "\n" + main_str += ")" + return main_str diff --git a/tensorrt_llm/parameter.py b/tensorrt_llm/parameter.py index f405387a0..47f719eff 100644 --- a/tensorrt_llm/parameter.py +++ b/tensorrt_llm/parameter.py @@ -86,11 +86,13 @@ def _create_managed_tensor(self, network, need_transpose=False) -> Tensor: if self._value is None or (isinstance(self._value, np.ndarray) and not self._value.flags['C_CONTIGUOUS']): + value_old = self._value + self._value = np.empty(self._shape, trt_dtype_to_np(self._dtype)) network._register_unfilled_weights( # use updated self._shape here name, - np.empty(self._shape, trt_dtype_to_np(self._dtype)), - self._value) + self._value, + value_old) return Tensor(name=name, dtype=self._dtype, shape=shape) def get_managed_tensor(self, diff --git a/tensorrt_llm/plugin/plugin.py b/tensorrt_llm/plugin/plugin.py index 6b0a86814..84441597c 100644 --- a/tensorrt_llm/plugin/plugin.py +++ b/tensorrt_llm/plugin/plugin.py @@ -368,7 +368,7 @@ class CustomAllReduceHelper: - Set custom_all_reduce_helper.workspace with the required tensor. Then, each instance of allreduce will reference that tensor automatically. """ - POINTERS_PER_RANK = 4 + POINTERS_PER_RANK = 7 def __init__(self) -> None: self.workspace: Optional[Tensor] = None @@ -377,7 +377,7 @@ def set_workspace_tensor(self, mapping: Mapping, num_profiles: Optional[int] = None): from ..functional import Tensor - workspace_size = self.POINTERS_PER_RANK * mapping.tp_size + 1 + workspace_size = self.POINTERS_PER_RANK * mapping.tp_size + 2 dim_range = None if num_profiles is not None: @@ -412,16 +412,23 @@ def allocate_workspace(mapping: Mapping, ipc_barriers_out = IpcMemory( mapping, IpcMemory.IPC_BARRIERS_SIZE_PER_GPU * mapping.tp_size * 2, is_p2p_supported) + lamport_buffers_0 = IpcMemory(mapping, size * mapping.tp_size, + is_p2p_supported) + lamport_buffers_1 = IpcMemory(mapping, size * mapping.tp_size, + is_p2p_supported) + lamport_buffers_2 = IpcMemory(mapping, size * mapping.tp_size, + is_p2p_supported) buffers = [ - ipc_buffers_ping, - ipc_buffers_pong, - ipc_barriers_in, - ipc_barriers_out, + ipc_buffers_ping, ipc_buffers_pong, ipc_barriers_in, + ipc_barriers_out, lamport_buffers_0, lamport_buffers_1, + lamport_buffers_2 ] return buffers, torch.tensor( ipc_buffers_ping.serialize() + ipc_buffers_pong.serialize() + - ipc_barriers_in.serialize() + ipc_barriers_out.serialize() + [0], + ipc_barriers_in.serialize() + ipc_barriers_out.serialize() + + lamport_buffers_0.serialize() + lamport_buffers_1.serialize() + + lamport_buffers_2.serialize() + [0] + [0], dtype=torch.int64, device="cpu") diff --git a/tensorrt_llm/quantization/layers.py b/tensorrt_llm/quantization/layers.py index c185c9cd9..ac0b14916 100644 --- a/tensorrt_llm/quantization/layers.py +++ b/tensorrt_llm/quantization/layers.py @@ -1495,7 +1495,10 @@ def forward(self, hidden_states, lora_layer_params=None): if self.quant_mode.has_fp8_rowwise(): # Quantize per token outputs tuple: # quantized tensor and scaling factors per token - inter = quantize_fp8_per_token(inter, self.clamp_val.val) + if hasattr(self.clamp_val, "val"): + inter = quantize_fp8_per_token(inter, self.clamp_val.val) + else: + inter = quantize_fp8_per_token(inter) output = self.proj(inter) return output @@ -1619,32 +1622,31 @@ def forward(self, hidden_states, lora_layer_params=None): class SmoothQuantAttention(Module): - 
def __init__( - self, - *, - local_layer_idx, - hidden_size, - num_attention_heads, - num_kv_heads=None, - max_position_embeddings=1024, - num_layers=1, - apply_query_key_layer_scaling=False, - attention_head_size=None, - attention_mask_type=AttentionMaskType.padding, - bias=True, - dense_bias=None, - dtype=None, - position_embedding_type=PositionEmbeddingType.learned_absolute, - rotary_embedding_base=10000.0, - rotary_embedding_scaling=None, - rotary_embedding_percentage=1.0, - tp_group=None, - tp_size=1, - tp_rank=0, - scale_alibi_bias=False, - paged_kv_cache=False, - quant_mode=QuantMode(0), - ): + def __init__(self, + *, + local_layer_idx, + hidden_size, + num_attention_heads, + num_kv_heads=None, + max_position_embeddings=1024, + num_layers=1, + apply_query_key_layer_scaling=False, + attention_head_size=None, + attention_mask_type=AttentionMaskType.padding, + bias=True, + dense_bias=None, + dtype=None, + position_embedding_type=PositionEmbeddingType.learned_absolute, + rotary_embedding_base=10000.0, + rotary_embedding_scaling=None, + rotary_embedding_percentage=1.0, + tp_group=None, + tp_size=1, + tp_rank=0, + scale_alibi_bias=False, + paged_kv_cache=False, + quant_mode=QuantMode(0), + layer_idx_in_cache_pool=None): super().__init__() self.local_layer_idx = local_layer_idx self.attention_mask_type = attention_mask_type @@ -1653,6 +1655,7 @@ def __init__( self.num_attention_kv_heads = ( num_kv_heads + tp_size - 1 ) // tp_size if num_kv_heads is not None else self.num_attention_heads + self.layer_idx_in_cache_pool = layer_idx_in_cache_pool self.hidden_size = hidden_size // tp_size self.max_position_embeddings = 0 if max_position_embeddings is None else max_position_embeddings self.tp_size = tp_size @@ -1817,6 +1820,7 @@ def forward( layer_idx=self.local_layer_idx, num_heads=self.num_attention_heads, num_kv_heads=self.num_attention_kv_heads, + layer_idx_in_cache_pool=self.layer_idx_in_cache_pool, hidden_size_per_head=self.attention_head_size, q_scaling=self.q_scaling, rotary_embedding_dim=self.rotary_embedding_dim, @@ -1839,6 +1843,8 @@ def forward( host_kv_cache_block_offsets, host_kv_cache_pool_pointers=kv_cache_params. host_kv_cache_pool_pointers, + host_kv_cache_pool_mapping=kv_cache_params. + host_kv_cache_pool_mapping, host_context_lengths=attention_params.host_context_lengths, use_cache=use_cache, spec_decoding_generation_lengths=spec_decoding_params. diff --git a/tensorrt_llm/quantization/mode.py b/tensorrt_llm/quantization/mode.py index 04ffe2fe7..0ececc424 100644 --- a/tensorrt_llm/quantization/mode.py +++ b/tensorrt_llm/quantization/mode.py @@ -34,6 +34,8 @@ class QuantAlgo(StrEnum, metaclass=BaseEnumMeta): FP8 = auto() FP8_PER_CHANNEL_PER_TOKEN = auto() INT8 = auto() + MIXED_PRECISION = auto() + NO_QUANT = auto() QUANT_ALGO_LIST = list(set(QuantAlgo) - {QuantAlgo.INT8}) @@ -82,6 +84,9 @@ class QuantMode(IntFlag): # The mask of all valid flags. VALID_FLAGS = COUNT - 1 + def __deepcopy__(self, memo): + return self + # All the bits set? You can restrict the test to the bits indicated by "mask". 
def _all(self, bits, mask=VALID_FLAGS): return (self & mask) == bits @@ -138,6 +143,9 @@ def has_fp8_qdq(self): def has_fp8_rowwise(self): return self._any(self.FP8_ROWWISE) + def has_weight_quant(self): + return self._any(self.INT4_WEIGHTS | self.INT8_WEIGHTS) + def has_any_quant(self): return self._any(self.INT4_WEIGHTS | self.INT8_WEIGHTS | self.ACTIVATIONS @@ -241,7 +249,7 @@ def use_weight_only(use_int4_weights=False, per_group=False): @staticmethod def from_quant_algo( - quant_algo: Optional[QuantAlgo], + quant_algo: Optional[QuantAlgo] = None, kv_cache_quant_algo: Optional[QuantAlgo] = None, ) -> "QuantMode": assert quant_algo is None or quant_algo in QUANT_ALGO_LIST diff --git a/tensorrt_llm/quantization/quantize.py b/tensorrt_llm/quantization/quantize.py index 25b4bdf6c..92cc7bac8 100644 --- a/tensorrt_llm/quantization/quantize.py +++ b/tensorrt_llm/quantization/quantize.py @@ -1,10 +1,11 @@ import fnmatch +from typing import Union from .._utils import get_init_params from ..layers import (MLP, Attention, ColumnLinear, Embedding, GatedMLP, LayerNorm, RmsNorm, RowLinear) from ..layers.moe import MixtureOfExperts -from ..models.modeling_utils import QuantConfig +from ..models.modeling_utils import LayerQuantConfig, QuantConfig from ..parameter import Parameter from .layers import (FP8Linear, FP8RowLinear, Fp8RowwiseGatedMLP, Fp8RowwiseMLP, Fp8RowwiseRmsNorm, Int8SmoothQuantLinear, @@ -79,9 +80,14 @@ def quantize_layers( return model -def weight_only_quantize(model, quant_config: QuantConfig): +def weight_only_quantize(model, quant_config: QuantConfig, model_config=None): assert quant_config.quant_mode.is_weight_only() + try: + model_cfg = model.config + except Exception: + model_cfg = model_config + quant_map = { ColumnLinear: WeightOnlyQuantColumnLinear, RowLinear: WeightOnlyQuantRowLinear, @@ -93,7 +99,7 @@ def preprocess_init_params(init_params, name, module): if isinstance(module, ColumnLinear): module_name = name.rsplit('.', 1)[-1] init_params["transb"] = module_name == "lm_head" - init_params["tp_rank"] = model.config.mapping.tp_rank + init_params["tp_rank"] = model_cfg.mapping.tp_rank model = quantize_layers( model, @@ -104,9 +110,16 @@ def preprocess_init_params(init_params, name, module): return model -def weight_only_groupwise_quantize(model, quant_config: QuantConfig): +def weight_only_groupwise_quantize(model, + quant_config: QuantConfig, + model_config=None): assert quant_config.quant_mode.is_weight_only() + try: + model_cfg = model.config + except Exception: + model_cfg = model_config + quant_map = { ColumnLinear: WeightOnlyGroupwiseQuantColumnLinear, RowLinear: WeightOnlyGroupwiseQuantRowLinear, @@ -118,7 +131,7 @@ def preprocess_init_params(init_params, name, module): init_params["zero"] = quant_config.has_zero_point init_params[ "use_w4a8_awq"] = quant_config.quant_algo == QuantAlgo.W4A8_AWQ - init_params["tp_rank"] = model.config.mapping.tp_rank + init_params["tp_rank"] = model_cfg.mapping.tp_rank model = quantize_layers( model, @@ -207,9 +220,14 @@ def fp8_quantize(model, quant_config: QuantConfig): return model -def fp8_rowwise_quantize(model, quant_config: QuantConfig): +def fp8_rowwise_quantize(model, quant_config: QuantConfig, model_config=None): assert quant_config.quant_mode.has_fp8_rowwise() + try: + model_cfg = model.config + except Exception: + model_cfg = model_config + quant_map = { RmsNorm: Fp8RowwiseRmsNorm, GatedMLP: Fp8RowwiseGatedMLP, @@ -230,8 +248,8 @@ def extract_layer_idx(name): continue # Meta's Fp8 recipe - mapping = model.config.mapping - 
layers_range = mapping.pp_layers(model.config.num_hidden_layers) + mapping = model_cfg.mapping + layers_range = mapping.pp_layers(model_cfg.num_hidden_layers) is_first_layer = mapping.is_first_pp_rank() and layer_idx == 0 is_last_layer = mapping.is_last_pp_rank( ) and layer_idx == len(layers_range) - 1 @@ -259,30 +277,54 @@ def extract_layer_idx(name): return model -def kv_cache_quantize(model, quant_config: QuantConfig): - assert quant_config.quant_mode.has_kv_cache_quant() +# Now consider the kv cache is enabled for all layers +def kv_cache_quantize(model): for name, module in model.named_modules(): if isinstance(module, (Attention, SmoothQuantAttention)): module.kv_cache_scaling_factor = Parameter(shape=(1, ), dtype='float32') + return model -def quantize(model, quant_config: QuantConfig): - quant_mode = quant_config.quant_mode +def quantize(model, quant_config: Union[QuantConfig, LayerQuantConfig]): + quant_mode = quant_config.layer_quant_mode - if quant_mode.has_fp8_qdq(): - model = fp8_quantize(model, quant_config) - elif quant_mode.has_fp8_rowwise(): - model = fp8_rowwise_quantize(model, quant_config) - elif quant_mode.has_act_and_weight_quant(): - model = smooth_quantize(model, quant_config) - elif quant_mode.is_weight_only(): - if quant_mode.has_per_group_scaling(): - model = weight_only_groupwise_quantize(model, quant_config) + for name, module, parent in model.named_modules_with_parent(): + if quant_config.quant_algo == QuantAlgo.MIXED_PRECISION: + if name in quant_mode.keys(): + layer_quant_mode = quant_mode[name] + else: + continue else: - model = weight_only_quantize(model, quant_config) + layer_quant_mode = quant_mode + if layer_quant_mode == QuantMode(0): + continue + + layer_quant_cfg = quant_config.get_quant_cfg(name) + + if layer_quant_mode.has_fp8_qdq(): + module = fp8_quantize(module, layer_quant_cfg) + elif layer_quant_mode.has_fp8_rowwise(): + module = fp8_rowwise_quantize(module, layer_quant_cfg, model.config) + elif layer_quant_mode.has_act_and_weight_quant(): + module = smooth_quantize(module, layer_quant_cfg) + elif layer_quant_mode.is_weight_only(): + if layer_quant_mode.has_per_group_scaling(): + module = weight_only_groupwise_quantize(module, layer_quant_cfg, + model.config) + else: + module = weight_only_quantize(module, layer_quant_cfg, + model.config) - if quant_mode.has_kv_cache_quant(): - model = kv_cache_quantize(model, quant_config) + if parent is not None: # for per layer + module_name = name.rsplit('.', 1)[-1] + setattr(parent, module_name, module) + else: # for all layer + model = module + break + + if quant_config.quant_mode.has_kv_cache_quant(): + model = kv_cache_quantize(model) + setattr(model, 'quant_mode', quant_config.quant_mode) return model diff --git a/tensorrt_llm/quantization/quantize_by_modelopt.py b/tensorrt_llm/quantization/quantize_by_modelopt.py index 2e38acb16..7aae7aec3 100644 --- a/tensorrt_llm/quantization/quantize_by_modelopt.py +++ b/tensorrt_llm/quantization/quantize_by_modelopt.py @@ -26,6 +26,7 @@ import numpy as np import safetensors import torch +from accelerate.hooks import remove_hook_from_module from datasets import load_dataset from safetensors.torch import load_file, save_file from torch.utils.data import DataLoader @@ -126,9 +127,26 @@ def quant_cfg_choices(): "Starcoder2ForCausalLM": "gptnext", "GPTBigCodeForCausalLM": "gptnext", "GLM": "glm", + "DeciLMForCausalLM": "deci", } +class _CustomDataset(torch.utils.data.Dataset): + + def __init__(self, encodings): + self.encodings = encodings + + def __getitem__(self, 
idx): + item = { + key: torch.tensor(val[idx]) + for key, val in self.encodings.items() + } + return item + + def __len__(self): + return len(self.encodings["input_ids"]) + + def get_tokenizer(ckpt_path, max_seq_length=2048, model_type=None): logger.info(f"Initializing tokenizer from {ckpt_path}") tokenizer = AutoTokenizer.from_pretrained( @@ -174,11 +192,20 @@ def get_model(ckpt_path, dtype="fp16", device="cuda"): raise NotImplementedError(f"Unknown dtype {dtype}") # Note: VILA model is not in public HF model zoo yet. We need to explicitly import from the git repo - hf_config = AutoConfig.from_pretrained(ckpt_path, trust_remote_code=True) + if "mpt" in ckpt_path: + # MPT-7B cannot get initialized from AutoConfig + from transformers import MptConfig + hf_config = MptConfig.from_pretrained(ckpt_path) + else: + hf_config = AutoConfig.from_pretrained(ckpt_path, + trust_remote_code=True) model_cls = AutoModelForCausalLM if hf_config.model_type == "llava": from transformers import LlavaForConditionalGeneration model_cls = LlavaForConditionalGeneration + elif hf_config.model_type == "mpt": + from transformers import MptForCausalLM + model_cls = MptForCausalLM if "vila" in ckpt_path: model = _get_vila_model(ckpt_path) elif hf_config.model_type == "glm": @@ -217,7 +244,9 @@ def get_calib_dataloader(dataset_name_or_dir="cnn_dailymail", tokenizer=None, batch_size=1, calib_size=512, - block_size=512): + block_size=512, + device=None, + include_labels=False): logger.info("Loading calibration dataset") if dataset_name_or_dir == "pileval": dataset = load_dataset( @@ -226,7 +255,11 @@ def get_calib_dataloader(dataset_name_or_dir="cnn_dailymail", split="train") dataset = dataset["text"][:calib_size] elif "cnn_dailymail" in dataset_name_or_dir: - dataset = load_dataset(dataset_name_or_dir, name="3.0.0", split="train") + dataset = load_dataset( + dataset_name_or_dir, + name="3.0.0", + split="train", + ) dataset = dataset["article"][:calib_size] elif os.path.isdir(dataset_name_or_dir): logger.info( @@ -245,7 +278,23 @@ def get_calib_dataloader(dataset_name_or_dir="cnn_dailymail", padding=True, truncation=True, max_length=block_size) - batch_encoded = batch_encoded["input_ids"] + + if device: + batch_encoded = batch_encoded.to(device) + + if include_labels: + # Labels are needed when backward is called in the model. + # The labels should be a shifted version of the input_ids. + # However, we should not shift the input_ids here since the labels are shifted by + # Huggingface models during loss calculation as shown here - + # https://github.com/huggingface/transformers/blob/7f79a97399bb52aad8460e1da2f36577d5dccfed/src/transformers/models/llama/modeling_llama.py#L1093-L1095 + batch_encoded["labels"] = torch.where( + batch_encoded["attention_mask"] > 0.5, batch_encoded["input_ids"], + -100) + batch_encoded = _CustomDataset(batch_encoded) + else: + # For backward compatibility, if labels are not needed, we only return input_ids. 
+ batch_encoded = batch_encoded["input_ids"] calib_dataloader = DataLoader(batch_encoded, batch_size=batch_size, @@ -254,7 +303,8 @@ def get_calib_dataloader(dataset_name_or_dir="cnn_dailymail", return calib_dataloader -def quantize_model(model, quant_cfg, calib_dataloader=None): +def quantize_model(model, quant_cfg, calib_dataloader, batch_size, qformat, + weight_compression): import modelopt.torch.quantization as atq def calibrate_loop(): @@ -267,14 +317,40 @@ def calibrate_loop(): data = data.to(model.device) model(data) + QUANT_CFG_CHOICES = { + "int8": "INT8_DEFAULT_CFG", + "int8_sq": "INT8_SMOOTHQUANT_CFG", + "fp8": "FP8_DEFAULT_CFG", + "int4_awq": "INT4_AWQ_CFG", + "w4a8_awq": "W4A8_AWQ_BETA_CFG", + } + logger.info("Starting quantization...") start_time = time.time() - atq.quantize(model, quant_cfg, forward_loop=calibrate_loop) + if weight_compression: + logger.info("Starting mixed precision quantization...") + model, search_history = atq.auto_quantize( + model, + data_loader=calib_dataloader, + loss_func=lambda output, batch: output.loss, + constraints={"weight_compression": weight_compression}, + quantization_formats=[ + QUANT_CFG_CHOICES[item] for item in qformat.split(",") + ] + [None], + collect_func=lambda x: x, + num_calib_steps=len(calib_dataloader), + num_score_steps=min( + len(calib_dataloader), 128 // batch_size + ), # Limit the number of score steps to avoid long calibration time + verbose=True, + ) + atq.print_quant_summary(model) + else: + atq.quantize(model, quant_cfg, forward_loop=calibrate_loop) end_time = time.time() logger.info( "Quantization done. Total time used: {:.2f} s.".format(end_time - start_time)) - return model @@ -366,7 +442,8 @@ def quantize_and_export(*, max_draft_len=None, medusa_hidden_act=None, medusa_model_dir=None, - quant_medusa_head=None): + quant_medusa_head=None, + weight_compression=None): ''' Load model from the model_dir, call Modelopt to quantize the model, and then export the quantized model as TRT-LLM checkpoint @@ -402,7 +479,7 @@ def quantize_and_export(*, ] and kv_cache_dtype is None: logger.info(f"No quantization applied, export {dtype} model") else: - if "awq" in qformat: + if any("awq" in item for item in qformat.split(",")): if calib_size > 32: logger.info( f"AWQ calibration could take longer with calib_size = {calib_size}, Using" @@ -414,34 +491,53 @@ def quantize_and_export(*, " set by adding the argument --batch_size to the command line.\n" ) + # Check if qformat provided is supported. qformat is list of one element for non auto_quant case. 
+ if all(item in quant_cfg_choices() for item in qformat.split(",")): + quant_cfg = quant_cfg_choices()[qformat.split(",")[0]] + else: + raise ValueError(f"Unsupported quantization format: {qformat}") + + # Auto quantize does not use quant_cfg + if not weight_compression and "awq" in qformat: + quant_cfg = copy.deepcopy(quant_cfg_choices()[qformat]) + weight_quantizer = quant_cfg["quant_cfg"]["*weight_quantizer"] + if isinstance(weight_quantizer, list): + weight_quantizer = weight_quantizer[0] + weight_quantizer["block_sizes"][-1] = awq_block_size + + # Coarser optimal scale search seems to resolve the overflow in TRT-LLM for some models + if "w4a8_awq" == qformat and model_type in ["gemma", "mpt"]: + quant_cfg["algorithm"] = {"method": "awq_lite", "alpha_step": 1} + calib_dataloader = get_calib_dataloader( dataset_name_or_dir=calib_dataset, tokenizer=tokenizer, batch_size=batch_size, calib_size=calib_size, block_size=calib_max_seq_length, + device=torch.device("cuda") if weight_compression else None, + include_labels=weight_compression is not None, ) - if qformat in quant_cfg_choices(): - quant_cfg = quant_cfg_choices()[qformat] - else: - raise ValueError(f"Unsupported quantization format: {qformat}") - - if "awq" in qformat: - quant_cfg = copy.deepcopy(quant_cfg_choices()[qformat]) - weight_quantizer = quant_cfg["quant_cfg"][ - "*weight_quantizer"] # type: ignore - if isinstance(weight_quantizer, list): - weight_quantizer = weight_quantizer[0] - weight_quantizer["block_sizes"][-1] = awq_block_size + # Always turn on FP8 kv cache to save memory footprint. + # For int8_sq, we do not quantize kv cache to preserve accuracy. + # We turn off FP8 kv cache for unified_hf checkpoint + enable_quant_kv_cache = "int8" not in qformat + print( + f'{"Enable" if enable_quant_kv_cache else "Disable"} KV cache quantization' + ) + quant_cfg["quant_cfg"]["*output_quantizer"] = { + "num_bits": 8 if qformat == "int8_sq" else (4, 3), + "axis": None, + "enable": enable_quant_kv_cache, + } - if kv_cache_dtype is not None: - if kv_cache_dtype == "fp8": - for value in KV_CACHE_CFG.values(): - value.update({"num_bits": (4, 3)}) # type: ignore - quant_cfg["quant_cfg"].update(KV_CACHE_CFG) # type: ignore + # Gemma 7B has accuracy regression using alpha 1. We set 0.5 instead. + if model_type == "gemma" and "int8_sq" in qformat.split(","): + quant_cfg["algorithm"] = {"method": "smoothquant", "alpha": 0.5} - model = quantize_model(model, quant_cfg, calib_dataloader) + model = quantize_model(model, quant_cfg, calib_dataloader, batch_size, + qformat, weight_compression) with torch.inference_mode(): if model_type is None: @@ -453,12 +549,37 @@ def quantize_and_export(*, export_path = output_dir start_time = time.time() - export_tensorrt_llm_checkpoint(model, - model_type, - getattr(torch, dtype), - export_dir=export_path, - inference_tensor_parallel=tp_size, - inference_pipeline_parallel=pp_size) + # Move meta tensor back to device before exporting. 
+ remove_hook_from_module(model, recurse=True) + + QUANT_ALGO = { + "int8": "INT8", + "int8_sq": "W8A8_SQ_PER_CHANNEL", + "fp8": "FP8", + "int4_awq": "W4A16_AWQ", + "w4a8_awq": "W4A8_AWQ", + } + + # workaround for old API version + if weight_compression: + export_tensorrt_llm_checkpoint( + model, + model_type, + getattr(torch, dtype), + export_dir=export_path, + inference_tensor_parallel=tp_size, + inference_pipeline_parallel=pp_size, + auto_quant=weight_compression is not None, + ) + else: + export_tensorrt_llm_checkpoint( + model, + model_type, + getattr(torch, dtype), + export_dir=export_path, + inference_tensor_parallel=tp_size, + inference_pipeline_parallel=pp_size, + ) with open(f"{export_path}/config.json", "r") as f: tensorrt_llm_config = json.load(f) diff --git a/tensorrt_llm/runtime/generation.py b/tensorrt_llm/runtime/generation.py index f843772eb..983d458b8 100755 --- a/tensorrt_llm/runtime/generation.py +++ b/tensorrt_llm/runtime/generation.py @@ -16,13 +16,13 @@ import copy import math import platform +from collections import Counter from dataclasses import dataclass, field from functools import reduce, wraps from pathlib import Path from typing import Dict, Iterable, List, Optional, Sequence, Set, Union import numpy as np -import tensorrt as trt # isort: off import torch @@ -30,6 +30,10 @@ # isort: on from cuda import cudart +from tensorrt_llm.runtime.memory_pools.memory_pools_allocator import \ + MemoryPoolsAllocator +from tensorrt_llm.runtime.memory_pools.pools_kv_cache_manager import \ + PoolsKVCacheManager from tensorrt_llm.runtime.redrafter_utils import * from .._utils import (pad_vocab_size, str_dtype_to_torch, torch_to_numpy, @@ -40,7 +44,7 @@ from ..mapping import Mapping from ..plugin.plugin import CustomAllReduceHelper from ..quantization import QuantMode -from .kv_cache_manager import GenerationSequence, KVCacheManager, KVCacheUpdater +from .kv_cache_manager import GenerationSequence, KVCacheUpdater from .session import _scoped_stream @@ -810,10 +814,12 @@ def __init__(self, expected_tensor_names += [f'kv_cache_block_offsets'] expected_tensor_names += [f'host_kv_cache_block_offsets'] expected_tensor_names += [f'host_kv_cache_pool_pointers'] + expected_tensor_names += [f'host_kv_cache_pool_mapping'] if self.cross_attention: expected_tensor_names += [f'cross_kv_cache_block_offsets'] expected_tensor_names += [f'host_cross_kv_cache_block_offsets'] expected_tensor_names += [f'host_cross_kv_cache_pool_pointers'] + expected_tensor_names += [f'host_cross_kv_cache_pool_mapping'] else: # Refer to gpt_attention() inside functional.py if self.use_kv_cache and not self.paged_kv_cache: @@ -1696,40 +1702,42 @@ def setup(self, num_blocks, _ = self._get_num_paged_blocks( self.max_attention_window_size, self.sink_token_length, self.use_one_more_block) - cache_shape = ( - num_blocks, - self.num_attn_layers, - 2, - self.get_num_heads_kv(), - self.tokens_per_block, - self.head_size, - ) - self.kv_cache_pool = torch.empty(cache_shape, - dtype=kv_cache_type, - device=self.device) + self._memory_pool_allocator = MemoryPoolsAllocator( + num_blocks=num_blocks, + tokens_per_block=self.tokens_per_block, + head_size=self.head_size) + if self._model_config.num_kv_heads_per_layer is None: + num_kv_heads_per_layer = MemoryPoolsAllocator.prepare_num_kv_heads_per_layer( + self.get_num_heads_kv(), self.num_attn_layers) + else: + num_kv_heads_per_layer = self._model_config.num_kv_heads_per_layer + + self._memory_pool_allocator.allocate(kv_cache_type, + num_kv_heads_per_layer) + if 
self.cross_attention: # As for now we enable cross paged kv and self paged kv to share the same tokens_per_block cross_num_blocks, _ = self._get_num_paged_blocks( self.encoder_max_input_length, sink_token_length=0, use_one_more_block=False) - cross_cache_shape = ( - cross_num_blocks, - self.num_layers, - 2, - self.get_num_heads_kv(), - self.tokens_per_block, - self.head_size, - ) - self.cross_kv_cache_pool = torch.empty(cross_cache_shape, - dtype=kv_cache_type, - device=self.device) + + num_kv_heads_per_layer = MemoryPoolsAllocator.prepare_num_kv_heads_per_layer( + self.get_num_heads_kv(), self.num_layers) + + self._cross_memory_pool_allocator = MemoryPoolsAllocator( + num_blocks=cross_num_blocks, + tokens_per_block=self.tokens_per_block, + head_size=self.head_size) + self._cross_memory_pool_allocator.allocate( + kv_cache_type, num_kv_heads_per_layer) + elif self.has_attn_layers: for i in range(self.first_layer, self.last_layer): if self.layer_types[i] == 'attention': cache_shape = ( batch_size, 2, - self.get_num_heads_kv(self.general_to_attn_idx[i]), + self.get_num_heads_kv(i), self.max_attention_window_size, self.head_size, ) @@ -1845,6 +1853,43 @@ def setup(self, if self.is_medusa_mode: return self.num_draft_tokens + def _allocate_empty_kv_cache_pools(self, kv_cache_type, num_blocks): + # Layers are homogeneous, use old kv cache shape + unique_cache_pools = [] + if self._model_config.num_kv_heads_per_layer is None: + cache_shape = ( + num_blocks, + self.num_attn_layers, + 2, + self.get_num_heads_kv(), + self.tokens_per_block, + self.head_size, + ) + unique_cache_pools.append( + torch.empty(cache_shape, + dtype=kv_cache_type, + device=self.device)) + + # Layers are not homogeneous, use new kv cache shape + else: + kv_heads_unique_counter = Counter( + self._model_config.num_kv_heads_per_layer) + for kv_head, num_layers in kv_heads_unique_counter.items(): + cache_shape = ( + num_blocks, + num_layers, + 2, + kv_head, + self.tokens_per_block, + self.head_size, + ) + unique_cache_pools.append( + torch.empty(cache_shape, + dtype=kv_cache_type, + device=self.device)) + + return unique_cache_pools + def _get_context_shape_buffer( self, input_ids: torch.Tensor, @@ -1963,17 +2008,20 @@ def add_tensor_with_bs(x, name, bs): if self.paged_kv_cache and self.has_attn_layers: buffer = kv_cache_block_offsets.contiguous() shape = kv_cache_block_offsets.shape - shape = [shape[0] * shape[1], *shape[2:]] + shape = [shape[0], shape[1] * shape[2], *shape[3:]] add_tensor_with_shape(buffer, f'kv_cache_block_offsets', shape) add_tensor_with_shape(host_kv_cache_block_offsets, f'host_kv_cache_block_offsets', shape) pool_pointers = f'host_kv_cache_pool_pointers' + pool_mapping = f'host_kv_cache_pool_mapping' add_tensor(self.buffer[pool_pointers], pool_pointers) + add_tensor(self.buffer[pool_mapping], pool_mapping) if self.cross_attention: cross_buffer = cross_kv_cache_block_offsets.contiguous() cross_shape = cross_kv_cache_block_offsets.shape cross_shape = [ - cross_shape[0] * cross_shape[1], *cross_shape[2:] + cross_shape[0], cross_shape[1] * cross_shape[2], + *cross_shape[3:] ] add_tensor_with_shape(cross_buffer, f'cross_kv_cache_block_offsets', @@ -1982,8 +2030,10 @@ def add_tensor_with_bs(x, name, bs): f'host_cross_kv_cache_block_offsets', cross_shape) cross_pool_pointers = f'host_cross_kv_cache_pool_pointers' + cross_pool_mapping = f'host_cross_kv_cache_pool_mapping' add_tensor(self.buffer[cross_pool_pointers], cross_pool_pointers) + add_tensor(self.buffer[cross_pool_mapping], cross_pool_mapping) batch_size = 
context_lengths.shape[0] if self.use_kv_cache and not self.paged_kv_cache: @@ -2246,17 +2296,20 @@ def add_tensor_with_shape(x, name, shape): if self.paged_kv_cache and self.has_attn_layers: shape = kv_cache_block_offsets.shape - shape = [shape[0] * shape[1], *shape[2:]] + shape = [shape[0], shape[1] * shape[2], *shape[3:]] add_tensor_with_shape(kv_cache_block_offsets, f'kv_cache_block_offsets', shape) add_tensor_with_shape(host_kv_cache_block_offsets, f'host_kv_cache_block_offsets', shape) pool_pointers = f'host_kv_cache_pool_pointers' + pool_mapping = f'host_kv_cache_pool_mapping' add_tensor(self.buffer[pool_pointers], pool_pointers) + add_tensor(self.buffer[pool_mapping], pool_mapping) if self.cross_attention: cross_shape = cross_kv_cache_block_offsets.shape cross_shape = [ - cross_shape[0] * cross_shape[1], *cross_shape[2:] + cross_shape[0], cross_shape[1] * cross_shape[2], + *cross_shape[3:] ] add_tensor_with_shape(cross_kv_cache_block_offsets, f'cross_kv_cache_block_offsets', @@ -2265,8 +2318,10 @@ def add_tensor_with_shape(x, name, shape): f'host_cross_kv_cache_block_offsets', cross_shape) cross_pool_pointers = f'host_cross_kv_cache_pool_pointers' + cross_pool_mapping = f'host_cross_kv_cache_pool_mapping' add_tensor(self.buffer[cross_pool_pointers], cross_pool_pointers) + add_tensor(self.buffer[cross_pool_mapping], cross_pool_mapping) if prompt_embedding_table is not None: add_tensor(prompt_embedding_table, 'prompt_embedding_table') @@ -3055,11 +3110,11 @@ def handle_per_step( 'host_runtime_perf_knobs', None) if self.paged_kv_cache and self.has_attn_layers: - host_kv_cache_block_offsets = self.kv_cache_manager.get_block_offsets( + host_kv_cache_block_offsets = self.pools_kv_cache_manager.get_block_offsets( beam_width=1) kv_cache_block_offsets = host_kv_cache_block_offsets.to('cuda') if self.cross_attention: - host_cross_kv_cache_block_offsets = self.cross_kv_cache_manager.get_block_offsets( + host_cross_kv_cache_block_offsets = self.cross_pools_kv_cache_manager.get_block_offsets( beam_width=1) cross_kv_cache_block_offsets = host_cross_kv_cache_block_offsets.to( 'cuda') @@ -3236,7 +3291,7 @@ def handle_per_step( self.accept_lengths).item() assert add_token_count > 0 for _ in range(add_token_count): - self.kv_cache_manager.step([False] * batch_size) + self.pools_kv_cache_manager.step([False] * batch_size) if self.is_medusa_mode and self.num_draft_tokens > 0: # Allocate kv cache token slots for next step. # Make sure there are always > (num_draft_tokens + 1) free token slots. 
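[Editor's note on the multi-pool KV cache changes above: layers are grouped by their KV-head count, one memory pool is allocated per distinct count, and a per-layer pool-mapping tensor records which pool each layer's blocks live in. The snippet below is a minimal, self-contained sketch of that grouping idea only; it is not part of the patch, and the helper name make_kv_cache_pools is hypothetical.]

from collections import Counter
import torch

def make_kv_cache_pools(num_kv_heads_per_layer, num_blocks, tokens_per_block,
                        head_size, dtype=torch.float16, device="cpu"):
    # One pool per distinct KV-head count; layers sharing a count share a pool.
    counts = Counter(num_kv_heads_per_layer)
    pool_index = {kv_heads: i for i, kv_heads in enumerate(counts)}
    pools = [
        # Shape mirrors the paged layout used above:
        # [num_blocks, num_layers_in_pool, 2 (K and V), kv_heads, tokens_per_block, head_size]
        torch.empty((num_blocks, n_layers, 2, kv_heads, tokens_per_block, head_size),
                    dtype=dtype, device=device)
        for kv_heads, n_layers in counts.items()
    ]
    # Per-layer mapping: index of the pool that holds each layer's KV blocks.
    pool_mapping = torch.tensor([pool_index[h] for h in num_kv_heads_per_layer],
                                dtype=torch.int32)
    return pools, pool_mapping

# Example: first two layers use 8 KV heads, last two use 4 -> two pools, mapping [0, 0, 1, 1].
pools, mapping = make_kv_cache_pools([8, 8, 4, 4], num_blocks=16,
                                     tokens_per_block=64, head_size=128)

[End of editor's note; the patch resumes below.]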
@@ -3246,16 +3301,16 @@ def handle_per_step( self.accept_lengths).item() assert add_token_count > 0 for _ in range(add_token_count): - self.kv_cache_manager.step([False] * batch_size) + self.pools_kv_cache_manager.step([False] * batch_size) else: - self.kv_cache_manager.step([False] * batch_size) + self.pools_kv_cache_manager.step([False] * batch_size) torch.cuda.nvtx.range_pop() torch.cuda.nvtx.range_push("paged_kv_post_alloc") - host_kv_cache_block_offsets = self.kv_cache_manager.get_block_offsets( + host_kv_cache_block_offsets = self.pools_kv_cache_manager.get_block_offsets( beam_width) kv_cache_block_offsets = host_kv_cache_block_offsets.to('cuda') if self.cross_attention: - host_cross_kv_cache_block_offsets = self.cross_kv_cache_manager.get_block_offsets( + host_cross_kv_cache_block_offsets = self.cross_pools_kv_cache_manager.get_block_offsets( beam_width) cross_kv_cache_block_offsets = host_cross_kv_cache_block_offsets.to( 'cuda') @@ -3386,9 +3441,9 @@ def handle_per_step( and should_stop.item()): # Free all blocks in all sequences. # With in-flight batching and while loop we'll free some sequences, when they are done - self.kv_cache_manager.step([True] * batch_size) + self.pools_kv_cache_manager.step([True] * batch_size) if self.cross_attention: - self.cross_kv_cache_manager.step([True] * batch_size) + self.cross_pools_kv_cache_manager.step([True] * batch_size) if self.debug_mode: self.dump_debug_buffers(step) @@ -3764,21 +3819,23 @@ def decode(self, num_blocks, max_blocks_per_seq = self._get_num_paged_blocks( self.max_attention_window_size, self.sink_token_length, self.use_one_more_block) - self.buffer[f'host_kv_cache_pool_pointers'] = torch.tensor( - [self.kv_cache_pool.data_ptr(), 0], dtype=torch.int64) - - block_size = self.get_num_heads_kv( - ) * self.tokens_per_block * self.head_size - self.kv_cache_manager = KVCacheManager( - num_layers=self.num_attn_layers, - num_blocks=num_blocks, - block_size=block_size, - tokens_per_block=self.tokens_per_block, - max_blocks_per_seq=max_blocks_per_seq, + + self.buffer[ + f'host_kv_cache_pool_pointers'] = self._memory_pool_allocator.get_kv_cache_pool_pointers( + ) + self.buffer[ + f'host_kv_cache_pool_mapping'] = self._memory_pool_allocator.pool_mapping + + self.pools_kv_cache_manager = PoolsKVCacheManager( + self._memory_pool_allocator.pools_metadata, + max_blocks_per_seq, + num_blocks, + self.tokens_per_block, + self.head_size, max_attention_window_size=self.max_attention_window_size, - sink_token_len=self.sink_token_length, beam_width=beam_width, - use_one_more_block=self.use_one_more_block) + use_one_more_block=self.use_one_more_block, + sink_token_len=self.sink_token_length) if self.cross_attention: cross_num_blocks, max_cross_blocks_per_seq = self._get_num_paged_blocks( @@ -3786,33 +3843,32 @@ def decode(self, sink_token_length=0, use_one_more_block=False) self.buffer[ - f'host_cross_kv_cache_pool_pointers'] = torch.tensor( - [self.cross_kv_cache_pool.data_ptr(), 0], - dtype=torch.int64) - - cross_block_size = self.get_num_heads_kv( - ) * self.tokens_per_block * self.head_size - self.cross_kv_cache_manager = KVCacheManager( - num_layers=self.num_layers, - num_blocks=cross_num_blocks, - block_size=cross_block_size, - tokens_per_block=self.tokens_per_block, - max_blocks_per_seq=max_cross_blocks_per_seq, + f'host_cross_kv_cache_pool_pointers'] = self._cross_memory_pool_allocator.get_kv_cache_pool_pointers( + ) + self.buffer[ + f'host_cross_kv_cache_pool_mapping'] = self._cross_memory_pool_allocator.pool_mapping + + 
self.cross_pools_kv_cache_manager = PoolsKVCacheManager( + self._memory_pool_allocator.pools_metadata, + max_cross_blocks_per_seq, + cross_num_blocks, + self.tokens_per_block, + self.head_size, max_attention_window_size=self.encoder_max_input_length, - sink_token_len=self.sink_token_length, beam_width=beam_width, - use_one_more_block=False) + use_one_more_block=False, + sink_token_len=self.sink_token_length) # Add sequences to the manager for bi in range(batch_size): generation_sequence = GenerationSequence(seq_idx=bi, batch_idx=bi) - self.kv_cache_manager.add_sequence(generation_sequence, - max_context_length) + self.pools_kv_cache_manager.add_sequence( + generation_sequence, max_context_length) if self.cross_attention: cross_generation_sequence = GenerationSequence(seq_idx=bi, batch_idx=bi) - self.cross_kv_cache_manager.add_sequence( + self.cross_pools_kv_cache_manager.add_sequence( cross_generation_sequence, self.encoder_max_input_length, always_share_across_beam=True) @@ -3834,7 +3890,7 @@ def decode(self, if self.paged_kv_cache: self.kv_cache_updater.init_paged_kv_cache( self.num_layers, self.get_num_heads_kv(), self.head_size, - kv_cache_type, self.kv_cache_manager, + kv_cache_type, self.pools_kv_cache_manager, self.buffer[f'host_kv_cache_pool_pointers']) else: past_key_value_list = [ diff --git a/tensorrt_llm/runtime/kv_cache_manager.py b/tensorrt_llm/runtime/kv_cache_manager.py index f7b33c336..c2b6c3f9b 100644 --- a/tensorrt_llm/runtime/kv_cache_manager.py +++ b/tensorrt_llm/runtime/kv_cache_manager.py @@ -79,7 +79,8 @@ def __init__(self, max_blocks_per_seq: int = 128, beam_width: int = 1): """ - expected block pool shape: [num_blocks, num_layers, 2, block_size] + If layers are homogeneous then the expected block pool shape is: [num_blocks, num_layers, 2, block_size] + Otherwise, the expected block pool shape is: [num_blocks, 2, block_size] """ self.max_blocks_per_seq = max_blocks_per_seq @@ -263,6 +264,7 @@ def __init__(self, block_size=block_size, max_blocks_per_seq=max_blocks_per_seq, beam_width=beam_width) + self.tokens_per_block = tokens_per_block self.max_attention_window_size = max_attention_window_size self.sink_token_len = sink_token_len @@ -422,8 +424,15 @@ def update(self, accepted_draft_token_offsets, int) else 0 assert self.use_paged_kv_cache is not None if self.use_paged_kv_cache: - host_kv_cache_block_offsets = self.kv_cache_manager.get_block_offsets( - 1) + if self.kv_cache_manager.has_single_pool(): + kv_cache_manager = self.kv_cache_manager.get_single_kv_cache_manager( + ) + else: + raise RuntimeError( + "Currently, using KVCacheUpdater with more then single memory pool is not supported" + ) + + host_kv_cache_block_offsets = kv_cache_manager.get_block_offsets(1) kv_cache_block_offsets = host_kv_cache_block_offsets.to('cuda') torch.ops.tensorrt_llm.update_kv_cache_draft_token_location( accepted_draft_token_offsets, @@ -434,13 +443,13 @@ def update(self, accepted_draft_token_offsets, self.num_kv_heads, self.head_dim * self.elt_size, rewind_tokens_count, - self.kv_cache_manager.max_attention_window_size, + kv_cache_manager.max_attention_window_size, rewind_tokens_tensor, None, self.host_kv_cache_pool_pointers, kv_cache_block_offsets, - self.kv_cache_manager.blocks_manager.max_blocks_per_seq, - self.kv_cache_manager.tokens_per_block, + kv_cache_manager.blocks_manager.max_blocks_per_seq, + kv_cache_manager.tokens_per_block, None, ) else: diff --git a/tensorrt_llm/runtime/memory_pools/__init__.py b/tensorrt_llm/runtime/memory_pools/__init__.py new file mode 100644 
index 000000000..e69de29bb diff --git a/tensorrt_llm/runtime/memory_pools/memory_pools_allocator.py b/tensorrt_llm/runtime/memory_pools/memory_pools_allocator.py new file mode 100644 index 000000000..d24d8d68b --- /dev/null +++ b/tensorrt_llm/runtime/memory_pools/memory_pools_allocator.py @@ -0,0 +1,80 @@ +from collections import Counter +from typing import List + +import torch + +import tensorrt_llm +from tensorrt_llm.runtime.memory_pools.pool import Pool + + +class MemoryPoolsAllocator(object): + + def __init__(self, num_blocks, tokens_per_block, head_size): + self._pools_metadata = [] + self._pool_pointers = [] + self._pool_mapping = None + + self._num_blocks = num_blocks + self._tokens_per_block = tokens_per_block + self._head_size = head_size + + def allocate(self, dtype, num_kv_heads_per_layer: List[int], device="cuda"): + self._num_kv_heads_per_layer = num_kv_heads_per_layer + + if isinstance(dtype, str): + dtype = tensorrt_llm._utils.str_dtype_to_torch(dtype) + kv_heads_unique_counter = Counter(self._num_kv_heads_per_layer) + keys_to_indices = {} + + for idx, (kv_head, + num_layers) in enumerate(kv_heads_unique_counter.items()): + keys_to_indices[kv_head] = idx + cache_shape = ( + self._num_blocks, + num_layers, + 2, + kv_head, + self._tokens_per_block, + self._head_size, + ) + self._pool_pointers.append( + torch.empty(cache_shape, dtype=dtype, device=device)) + self._pools_metadata.append( + Pool(num_kv_heads=kv_head, num_layers=num_layers)) + + self._set_layers_mapping(keys_to_indices) + + def get_kv_cache_pool_pointers(self): + return self._get_primarmy_secondary_pool_pointers() + + def _set_layers_mapping(self, keys_to_indices): + layers_mapping = [] + for kv_size in self._num_kv_heads_per_layer: + layers_mapping.append(keys_to_indices[kv_size]) + + self._pool_mapping = torch.tensor(layers_mapping, dtype=torch.int32) + + def _get_primarmy_secondary_pool_pointers(self): + assert len(self._pool_pointers + ) >= 1, "pool pointers haven't been initiated yet" + data_ptr_pointers = torch.tensor(list( + map(lambda x: x.data_ptr(), self._pool_pointers)), + dtype=torch.int64) + host_kv_cache_pool_pointers = torch.cat( + (data_ptr_pointers.view(-1, 1), + torch.zeros(len(self._pool_pointers), 1, dtype=torch.int64)), + dim=1) + + return host_kv_cache_pool_pointers + + @classmethod + def prepare_num_kv_heads_per_layer(cls, kv_head, num_layers): + return [kv_head] * num_layers + + @property + def pools_metadata(self): + return self._pools_metadata + + @property + def pool_mapping(self): + return self._pool_mapping diff --git a/tensorrt_llm/runtime/memory_pools/pool.py b/tensorrt_llm/runtime/memory_pools/pool.py new file mode 100644 index 000000000..63308ad0d --- /dev/null +++ b/tensorrt_llm/runtime/memory_pools/pool.py @@ -0,0 +1,7 @@ +from dataclasses import dataclass + + +@dataclass +class Pool(object): + num_kv_heads: int + num_layers: int diff --git a/tensorrt_llm/runtime/memory_pools/pools_kv_cache_manager.py b/tensorrt_llm/runtime/memory_pools/pools_kv_cache_manager.py new file mode 100644 index 000000000..4baf86ad3 --- /dev/null +++ b/tensorrt_llm/runtime/memory_pools/pools_kv_cache_manager.py @@ -0,0 +1,67 @@ +from typing import List + +import torch + +from tensorrt_llm.runtime.kv_cache_manager import (GenerationSequence, + KVCacheManager) +from tensorrt_llm.runtime.memory_pools.pool import Pool + + +class PoolsKVCacheManager(object): + + def __init__(self, + pools_metadata: List[Pool], + max_blocks_per_seq, + num_blocks, + tokens_per_block, + head_size, + max_attention_window_size, + 
beam_width, + sink_token_len, + use_one_more_block: bool = False) -> None: + self._num_pools = len(pools_metadata) + self._kv_cache_managers = [] + + for pool in pools_metadata: + block_size = pool.num_kv_heads * tokens_per_block * head_size + self._kv_cache_managers.append( + KVCacheManager( + num_layers=pool.num_layers, + num_blocks=num_blocks, + block_size=block_size, + tokens_per_block=tokens_per_block, + max_blocks_per_seq=max_blocks_per_seq, + max_attention_window_size=max_attention_window_size, + sink_token_len=sink_token_len, + use_one_more_block=use_one_more_block, + beam_width=beam_width, + )) + + def add_sequence(self, + sequence: GenerationSequence, + context_len: int, + always_share_across_beam: bool = False): + for kv_cache_manager in self._kv_cache_managers: + kv_cache_manager.add_sequence(sequence, context_len, + always_share_across_beam) + + def step(self, finished: List[bool]): + for kv_cache_manager in self._kv_cache_managers: + kv_cache_manager.step(finished) + + def get_block_offsets(self, beam_width: int) -> torch.Tensor: + offsets = [] + for kv_cache_manager in self._kv_cache_managers: + block_offset = kv_cache_manager.get_block_offsets(beam_width) + offsets.append(block_offset) + + return torch.stack(offsets) + + def get_single_kv_cache_manager(self): + assert len(self._kv_cache_managers + ) == 1, f"More then one kv cache manager exists" + + return self._kv_cache_managers[0] + + def has_single_pool(self): + return len(self._kv_cache_managers) == 1 diff --git a/tensorrt_llm/runtime/model_runner_cpp.py b/tensorrt_llm/runtime/model_runner_cpp.py index d715c2da3..269ca2f37 100644 --- a/tensorrt_llm/runtime/model_runner_cpp.py +++ b/tensorrt_llm/runtime/model_runner_cpp.py @@ -15,7 +15,7 @@ import copy from pathlib import Path -from typing import List, Optional, Union +from typing import Dict, List, Optional, Union import torch @@ -24,6 +24,7 @@ from ..bindings import (DataType, GptJsonConfig, KVCacheType, ModelConfig, WorldConfig) from ..bindings import executor as trtllm +from ..bindings.executor import ExternalDraftTokensConfig, ParallelConfig from ..builder import EngineConfig from ..logger import logger from ..mapping import Mapping @@ -87,6 +88,7 @@ def from_dir( max_attention_window_size: Optional[list[int]] = None, sink_token_length: Optional[int] = None, kv_cache_free_gpu_memory_fraction: Optional[float] = None, + cross_kv_cache_fraction: Optional[float] = None, medusa_choices: list[list[int]] | None = None, lookahead_config: list[int] | None = None, debug_mode: bool = False, @@ -97,7 +99,10 @@ def from_dir( enable_chunked_context: bool = False, is_enc_dec: bool = False, multi_block_mode: bool = True, - enable_context_fmha_fp32_acc: Optional[bool] = None + enable_context_fmha_fp32_acc: Optional[bool] = None, + cuda_graph_mode: Optional[bool] = None, + logits_processor_map: Optional[Dict[str, LogitsProcessor]] = None, + device_ids: List[int] | None = None, ) -> 'ModelRunnerCpp': """ Create a ModelRunnerCpp instance from an engine directory. @@ -131,6 +136,8 @@ def from_dir( The sink token length, default=0. kv_cache_free_gpu_memory_fraction (float) : Free GPU memory fraction that KV cache used. + cross_kv_cache_fraction (float) : + KV Cache fraction reserved for cross attention, should only be used with enc-dec models. debug_mode (bool): Whether or not to turn on the debug mode. medusa_choices (List[List[int]]): @@ -149,6 +156,13 @@ def from_dir( Whether to distribute the work across multiple CUDA thread-blocks on the GPU for masked MHA kernel. 
enable_context_fmha_fp32_acc (bool): Enable FMHA runner FP32 accumulation. + cuda_graph_mode (bool): + Whether to use cuda graph for inference. + logits_processor_map (Dict[str, LogitsProcessor]) + A map of logits processor functions indexed by names. A name can be provided later to + the generate() function to specify which logits processor to run. + device_ids (List[int]): + Device indices to run the Executor on. Returns: ModelRunnerCpp: An instance of ModelRunnerCpp. """ @@ -158,6 +172,8 @@ def from_dir( extended_runtime_perf_knob_config.multi_block_mode = multi_block_mode if enable_context_fmha_fp32_acc is not None: extended_runtime_perf_knob_config.enable_context_fmha_fp32_acc = enable_context_fmha_fp32_acc + if cuda_graph_mode is not None: + extended_runtime_perf_knob_config.cuda_graph_mode = cuda_graph_mode if is_enc_dec: encoder_config_path = Path(engine_dir) / "encoder" / "config.json" @@ -182,8 +198,8 @@ def from_dir( profiler.start('load tensorrt_llm engine') kv_cache_config = trtllm.KvCacheConfig( - free_gpu_memory_fraction=kv_cache_free_gpu_memory_fraction / - 2, # hardcoded as half self kv & half cross kv for now + free_gpu_memory_fraction=kv_cache_free_gpu_memory_fraction, + cross_kv_cache_fraction=cross_kv_cache_fraction, max_attention_window=max_attention_window_size, sink_token_length=sink_token_length) @@ -215,6 +231,7 @@ def from_dir( json_config = GptJsonConfig.parse_file(config_path) model_config = json_config.model_config use_kv_cache = model_config.kv_cache_type != KVCacheType.DISABLED + assert cross_kv_cache_fraction is None, "cross_kv_cache_fraction should only be used with enc-dec models." if not use_kv_cache: assert max_output_len == 1 or max_output_len is None, 'Disabled KV cache is intended for context phase only now.' @@ -320,8 +337,8 @@ def from_dir( debug_tensor_names: List[str] = [ ] # modify this list for specific tensor dump debug_config = trtllm.DebugConfig( - dump_input_tensors=True, - dump_output_tensors=True, + debug_input_tensors=True, + debug_output_tensors=True, debug_tensor_names=debug_tensor_names) trtllm_config = trtllm.ExecutorConfig( @@ -333,6 +350,16 @@ def from_dir( gpu_weights_percent=gpu_weights_percent) trtllm_config.enable_chunked_context = enable_chunked_context trtllm_config.extended_runtime_perf_knob_config = extended_runtime_perf_knob_config + trtllm_config.parallel_config = ParallelConfig( + trtllm.CommunicationType.MPI, + trtllm.CommunicationMode.LEADER, + device_ids=device_ids, + orchestrator_config=None) + + logits_proc_config = trtllm.LogitsPostProcessorConfig() + if logits_processor_map is not None: + logits_proc_config.processor_map = logits_processor_map + trtllm_config.logits_post_processor_config = logits_proc_config executor = trtllm.Executor(engine_dir, trtllm.ModelType.DECODER_ONLY, trtllm_config) @@ -434,7 +461,7 @@ def generate( lookahead_config: list[int] | None = None, streaming: bool = False, stopping_criteria: Optional[StoppingCriteria] = None, - logits_processor: Optional[LogitsProcessor] = None, + logits_processor_names: list[str] | None = None, max_new_tokens: int = 1, num_return_sequences: int = 1, end_id: int | None = None, @@ -447,6 +474,7 @@ def generate( output_cum_log_probs: bool = False, prompt_table: Optional[Union[str, torch.Tensor]] = None, prompt_tasks: Optional[str] = None, + input_token_extra_ids: List[List[int]] = None, return_all_generated_tokens: bool = False, **kwargs) -> Union[torch.Tensor, dict]: """ @@ -473,14 +501,16 @@ def generate( The file path of prompt table (.npy format, exported by 
nemo_prompt_convert.py) or the prompt table itself. prompt_tasks (str): The prompt tuning task ids for the input batch, in format of comma-separated list (e.g., 0,3,1,0). + input_token_extra_ids (List[List[int]]): + Input token extra ids for using p-tuning and KV Cache reuse together lora_uids (list): The uids of LoRA weights for the input batch. Use -1 to disable the LoRA module. streaming (bool): Whether or not to use streaming mode for generation. stopping_criteria (StoppingCriteria): Custom stopping criteria. - logits_processor (LogitsProcessor): - Custom logits processors. + logits_processor_names (List[str]): + Custom logits processor names. return_all_generated_tokens (bool): Whether the full output is returned at each streaming step num_return_sequences (int): @@ -501,9 +531,6 @@ def generate( if stopping_criteria is not None: raise RuntimeError( "Stopping criteria is not supported in C++ session.") - if logits_processor is not None: - raise RuntimeError( - "Logits processor is not supported in C++ session.") if not self.use_kv_cache and max_new_tokens > 1: raise RuntimeError( @@ -554,12 +581,15 @@ def generate( ) prompt_tuning_configs = self._prepare_ptuning_executor( - batch_input_ids_list, prompt_table, prompt_tasks) + batch_input_ids_list, prompt_table, prompt_tasks, + input_token_extra_ids) stop_words_list = self._prepare_words_list(stop_words_list, len(batch_input_ids_list)) bad_words_list = self._prepare_words_list(bad_words_list, len(batch_input_ids_list)) + logits_processor_names = self._prepare_names_list( + logits_processor_names, len(batch_input_ids_list)) lora_configs = self._prepare_lora_configs(lora_uids, len(batch_input_ids_list)) @@ -568,6 +598,29 @@ def generate( [w, n, g] = lookahead_config request_lookahead_config = trtllm.LookaheadDecodingConfig(w, n, g) + # Draft-Target-Model speculative decoding + if "draft_tokens_list" in kwargs.keys() and kwargs[ + "draft_tokens_list"] is not None and "draft_logits_list" in kwargs.keys( + ) and kwargs["draft_logits_list"] is not None: + # Use logits to accept + external_draft_tokens_configs = [ + ExternalDraftTokensConfig(draft_tokens, draft_logits, 1.0e-8) + for draft_tokens, draft_logits in zip( + kwargs["draft_tokens_list"], kwargs["draft_logits_list"]) + ] + is_draft_target_model = True + elif "draft_tokens_list" in kwargs.keys( + ) and kwargs["draft_tokens_list"] is not None: + # Use tokens to accept + external_draft_tokens_configs = [ + ExternalDraftTokensConfig(draft_tokens) + for draft_tokens in kwargs["draft_tokens_list"] + ] + is_draft_target_model = True + else: + external_draft_tokens_configs = [None] * len(batch_input_ids_list) + is_draft_target_model = False + requests = [ trtllm.Request( input_token_ids=input_ids, @@ -591,11 +644,16 @@ def generate( output_config=output_config, prompt_tuning_config=prompt_tuning_config, lora_config=lora_config, - return_all_generated_tokens=return_all_generated_tokens) for i, + return_all_generated_tokens=return_all_generated_tokens, + logits_post_processor_name=logits_post_processor_name, + external_draft_tokens_config=external_draft_tokens_config, + ) for i, (input_ids, stop_words, bad_words, prompt_tuning_config, - lora_config) in enumerate( + lora_config, logits_post_processor_name, + external_draft_tokens_config) in enumerate( zip(batch_input_ids_list, stop_words_list, bad_words_list, - prompt_tuning_configs, lora_configs)) + prompt_tuning_configs, lora_configs, + logits_processor_names, external_draft_tokens_configs)) ] request_ids = 
self.session.enqueue_requests(requests) @@ -603,14 +661,15 @@ def generate( return self._initialize_and_fill_output( request_ids, end_id, return_dict, output_sequence_lengths, output_log_probs, output_cum_log_probs, batch_input_ids, - streaming, max_new_tokens, num_return_sequences) + streaming, max_new_tokens, num_return_sequences, + is_draft_target_model) else: return self._stream(request_ids, end_id, return_dict, output_sequence_lengths, output_log_probs, output_cum_log_probs, batch_input_ids, batch_input_ids_list, streaming, return_all_generated_tokens, max_new_tokens, - num_return_sequences) + num_return_sequences, is_draft_target_model) def _prepare_words_list(self, words_list: List[List[List[int]]], batch_size: int): @@ -618,8 +677,16 @@ def _prepare_words_list(self, words_list: List[List[List[int]]], return [None] * batch_size return words_list + def _prepare_names_list(self, names_list: List[str], batch_size: int): + if names_list is None: + return [None] * batch_size + return names_list + def _prepare_ptuning_executor(self, batch_input_ids_list, prompt_table, - prompt_tasks): + prompt_tasks, input_token_extra_ids): + if input_token_extra_ids: + assert len(batch_input_ids_list) == len(input_token_extra_ids), \ + f"Batch size of input_token_extra_ids ({len(input_token_extra_ids)}) must be the same as input batch size ({len(batch_input_ids_list)})" prompt_tuning_configs = len(batch_input_ids_list) * [None] if prompt_table is not None: prompt_table_data = self._prepare_embedding_table( @@ -630,14 +697,18 @@ def _prepare_ptuning_executor(self, batch_input_ids_list, prompt_table, f"Number of supplied tasks ({len(task_indices)}) must match input batch size ({len(batch_input_ids_list)})" prompt_tuning_configs = [ trtllm.PromptTuningConfig( - embedding_table=prompt_table_data[task_indices[i]]) + embedding_table=prompt_table_data[task_indices[i]], + input_token_extra_ids=input_token_extra_ids[i] + if input_token_extra_ids else None) for i in range(len(batch_input_ids_list)) ] else: prompt_tuning_configs = [ trtllm.PromptTuningConfig( - embedding_table=prompt_table_data[0]) - for _ in range(len(batch_input_ids_list)) + embedding_table=prompt_table_data[0], + input_token_extra_ids=input_token_extra_ids[i] + if input_token_extra_ids else None) + for i in range(len(batch_input_ids_list)) ] return prompt_tuning_configs @@ -652,17 +723,20 @@ def _prepare_lora_configs(self, lora_uids, batch_size): if int(uid) >= 0 else None for uid in lora_uids ] - def _initialize_and_fill_output(self, - request_ids, - end_id, - return_dict, - output_sequence_lengths, - output_log_probs, - output_cum_log_probs, - batch_input_ids, - streaming, - max_new_tokens: int, - num_return_sequences: int = 1): + def _initialize_and_fill_output( + self, + request_ids, + end_id, + return_dict, + output_sequence_lengths, + output_log_probs, + output_cum_log_probs, + batch_input_ids, + streaming, + max_new_tokens: int, + num_return_sequences: int = 1, + is_draft_target_model: bool = False, + ): output_ids = [[[] for _ in range(self.max_beam_width)] for _ in range(len(request_ids) * num_return_sequences)] @@ -675,21 +749,24 @@ def _initialize_and_fill_output(self, output_sequence_lengths, output_log_probs, output_cum_log_probs, batch_input_ids, [], streaming, request_ids, False, max_new_tokens, - num_return_sequences) - - def _stream(self, - request_ids, - end_id, - return_dict, - output_sequence_lengths, - output_log_probs, - output_cum_log_probs, - batch_input_ids, - batch_input_ids_list, - streaming, - 
return_all_generated_tokens, - max_new_tokens: int, - num_return_sequences: int = 1): + num_return_sequences, is_draft_target_model) + + def _stream( + self, + request_ids, + end_id, + return_dict, + output_sequence_lengths, + output_log_probs, + output_cum_log_probs, + batch_input_ids, + batch_input_ids_list, + streaming, + return_all_generated_tokens, + max_new_tokens: int, + num_return_sequences: int = 1, + is_draft_target_model: bool = False, + ): output_ids = [[] for _ in range(len(request_ids) * num_return_sequences)] @@ -712,14 +789,15 @@ def _stream(self, output_cum_log_probs, batch_input_ids, batch_input_ids_list, streaming, request_ids, return_all_generated_tokens, - max_new_tokens, num_return_sequences) + max_new_tokens, num_return_sequences, + is_draft_target_model) def _fill_output(self, responses, output_ids, end_id, return_dict, output_sequence_lengths, output_log_probs, output_cum_log_probs, batch_input_ids, batch_input_ids_list, streaming, request_ids, return_all_generated_tokens, max_new_tokens, - num_return_sequences): + num_return_sequences, is_draft_target_model): cuda_device = torch.device("cuda") # Total number of output sequences = batch_size * num_return_sequences. @@ -792,32 +870,39 @@ def req_idx(response: trtllm.Response): outputs['context_logits'] = context_logits if self.gather_generation_logits: - if not streaming: - gen_shape = (num_beams, max_new_tokens, vocab_size) - elif streaming and return_all_generated_tokens: - gen_shape = (max_new_tokens, num_beams, vocab_size) - else: # streaming and not return_all_generated_tokens - gen_shape = (1, num_beams, vocab_size) - gen_logits = None - for response in responses: - # gen logits shape: (beam, seq, vocab) - logits = response.result.generation_logits - if logits is None: - continue - num_beams, seq_len, vocab_size = logits.shape - if gen_logits is None: - gen_logits = torch.zeros( - (num_output_sequences, *gen_shape), - dtype=logits.dtype, - device=cuda_device) - batch_idx = request_ids.index(response.request_id) - seq_idx = response.result.sequence_index - reqid_pos = batch_idx * num_return_sequences + seq_idx - if streaming: - gen_logits[reqid_pos, :seq_len, ...] = logits[0] - else: - gen_logits[reqid_pos, :, :seq_len, ...] = logits[0] + if is_draft_target_model: + # Put the outputs in a list rather than a tensor since their + # length may vary among requests in a batch + gen_logits = [ + a.result.generation_logits.cuda() for a in responses + if a.result.generation_logits is not None + ] + else: + for response in responses: + # gen logits shape: (beam, seq, vocab) + logits = response.result.generation_logits + if logits is None: + continue + num_beams, seq_len, vocab_size = logits.shape + if not streaming: + gen_shape = (num_beams, max_new_tokens, vocab_size) + elif streaming and return_all_generated_tokens: + gen_shape = (max_new_tokens, num_beams, vocab_size) + else: # streaming and not return_all_generated_tokens + gen_shape = (1, num_beams, vocab_size) + if gen_logits is None: + gen_logits = torch.zeros( + (num_output_sequences, *gen_shape), + dtype=logits.dtype, + device=cuda_device) + batch_idx = request_ids.index(response.request_id) + seq_idx = response.result.sequence_index + reqid_pos = batch_idx * num_return_sequences + seq_idx + if streaming: + gen_logits[reqid_pos, :seq_len, ...] = logits[0] + else: + gen_logits[reqid_pos, :, :seq_len, ...] 
= logits[0] outputs['generation_logits'] = gen_logits if output_log_probs: diff --git a/tensorrt_llm/tools/multimodal_builder.py b/tensorrt_llm/tools/multimodal_builder.py index 3c66b5261..dc3d9d279 100644 --- a/tensorrt_llm/tools/multimodal_builder.py +++ b/tensorrt_llm/tools/multimodal_builder.py @@ -18,7 +18,6 @@ Pix2StructForConditionalGeneration, VisionEncoderDecoderModel) # isort: on -import json import math import torch.nn.functional as F @@ -145,7 +144,8 @@ def build_trt_engine(model_type, config_args = { "precision": str(dtype).split('.')[-1], - "model_type": model_type + "model_type": model_type, + "strongly_typed": False } if num_frames is not None: config_args["num_frames"] = num_frames @@ -325,8 +325,10 @@ def forward(self, image): features = all_hidden_states[self.feature_layer][:, 1:] return self.projector(features) + hf_config = AutoConfig.from_pretrained(args.model_path) + hf_config.vision_config._attn_implementation = "eager" model = LlavaForConditionalGeneration.from_pretrained( - args.model_path, torch_dtype=torch.float16) + args.model_path, torch_dtype=torch.float16, config=hf_config) wrapper = LlavaVisionWrapper( model.vision_tower.to(args.device), model.multi_modal_projector.to(args.device), @@ -352,8 +354,10 @@ def forward(self, pixel_values): image_features = self.projector(selected_image_feature) return image_features # (bs, 576, c) + hf_config = AutoConfig.from_pretrained(args.model_path) + hf_config.vision_config._attn_implementation = "eager" model = LlavaNextForConditionalGeneration.from_pretrained( - args.model_path, torch_dtype=torch.float16) + args.model_path, torch_dtype=torch.float16, config=hf_config) wrapper = LlavaNextVisionWrapper( model.vision_tower.vision_model.to(args.device), model.multi_modal_projector.to(args.device), @@ -644,7 +648,8 @@ def forward(self, images): encoder = AutoModel.from_pretrained(vision_config["from_pretrained"], torch_dtype=torch.bfloat16, - trust_remote_code=True) + trust_remote_code=True, + attn_implementation="eager") vision_encoder = encoder.vision_model hf_config = encoder.config dtype = hf_config.torch_dtype @@ -731,13 +736,6 @@ def build_phi_engine(args): images=raw_image, return_tensors="pt")['pixel_values'].to( args.device, torch.float16) - try: - with open(f"{args.model_path}/preprocessor_config.json", "r") as file: - config = file.read() - config_dict = json.loads(config) - num_crops = config_dict.get("num_crops") - except: - num_crops = 16 class Phi3VisionWrapper(torch.nn.Module): @@ -792,7 +790,8 @@ def apply_img_projection(self, input): tensors = {"glb_GN": glb_GN, "sub_GN": sub_GN} save_file(tensors, args.output_dir + "/image_newlines.safetensors") export_onnx(wrapper, image, f'{args.output_dir}/onnx') - build_trt_engine( - args.model_type, [image.shape[1], image.shape[2], image.shape[3]], - f'{args.output_dir}/onnx', args.output_dir, - args.max_batch_size * (num_crops + 1)) #TODO: Take input from config + num_crops = processor.image_processor.num_crops + build_trt_engine(args.model_type, + [image.shape[1], image.shape[2], image.shape[3]], + f'{args.output_dir}/onnx', args.output_dir, + args.max_batch_size * (num_crops + 1)) diff --git a/tensorrt_llm/version.py b/tensorrt_llm/version.py index 4c97312a5..b425c50e5 100644 --- a/tensorrt_llm/version.py +++ b/tensorrt_llm/version.py @@ -12,4 +12,4 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-__version__ = "0.13.0" +__version__ = "0.14.0" diff --git a/tests/attention/test_gpt_attention.py b/tests/attention/test_gpt_attention.py index 9fdbaec30..ed3e5cadd 100644 --- a/tests/attention/test_gpt_attention.py +++ b/tests/attention/test_gpt_attention.py @@ -40,13 +40,18 @@ RotaryScalingType) from tensorrt_llm.plugin.plugin import ContextFMHAType from tensorrt_llm.quantization import QuantMode -from tensorrt_llm.runtime import GenerationSequence, KVCacheManager +from tensorrt_llm.runtime import GenerationSequence +from tensorrt_llm.runtime.memory_pools.pools_kv_cache_manager import \ + PoolsKVCacheManager sys.path.append(os.path.join(os.path.dirname(__file__), '..')) from utils.util import (getSMVersion, skip_bf16_fp32_accum, skip_bf16_pre_ampere, skip_fp8_pre_ada, skip_fp32_accum_pre_ampere, unittest_name_func) +from tensorrt_llm.runtime.memory_pools.memory_pools_allocator import \ + MemoryPoolsAllocator + class TestFunctional(unittest.TestCase): @@ -399,11 +404,12 @@ def test_gpt_attention(self, def _construct_execution( session, input_tensor, weight, bias, past_key_value, host_kv_cache_block_offsets, host_kv_cache_pool_pointers, - packed_mask_for_fmha, sequence_length, - host_past_key_value_lengths, host_max_attention_window_sizes, - host_sink_token_length, context_lengths, host_context_lengths, - cache_indirection, host_request_types, num_heads, hidden_size, - num_kv_heads, output, dtype, max_context_length, shape_dict, + host_kv_cache_pool_mapping, packed_mask_for_fmha, + sequence_length, host_past_key_value_lengths, + host_max_attention_window_sizes, host_sink_token_length, + context_lengths, host_context_lengths, cache_indirection, + host_request_types, num_heads, hidden_size, num_kv_heads, + output, dtype, max_context_length, shape_dict, kv_int8_quant_scale, kv_int8_dequant_scale, configuration, host_runtime_perf_knobs): kv_cache_block_offsets = None @@ -480,6 +486,7 @@ def _construct_execution( kv_cache_block_offsets_tensor = None host_kv_cache_block_offsets_tensor = None host_kv_cache_pool_pointers_tensor = None + host_kv_cache_pool_mapping_tensor = None if paged_kv_cache: kv_cache_block_offsets_tensor = Tensor( name='kv_cache_block_offsets', @@ -491,8 +498,15 @@ def _construct_execution( dtype=tensorrt_llm.str_dtype_to_trt('int32')) host_kv_cache_pool_pointers_tensor = Tensor( name='host_kv_cache_pool_pointers', - shape=(1, ), + shape=( + 1, + 1, + ), dtype=tensorrt_llm.str_dtype_to_trt('int64')) + host_kv_cache_pool_mapping_tensor = Tensor( + name='host_kv_cache_pool_mapping', + shape=(1, ), + dtype=tensorrt_llm.str_dtype_to_trt('int32')) else: past_key_value_tensor = Tensor( name='past_key_value', @@ -606,6 +620,7 @@ def _construct_execution( host_kv_cache_block_offsets_tensor, host_kv_cache_pool_pointers= host_kv_cache_pool_pointers_tensor, + host_kv_cache_pool_mapping=host_kv_cache_pool_mapping_tensor, max_context_length=max_context_length, qkv_bias=qkv_bias, host_runtime_perf_knobs=host_runtime_perf_knobs_tensor) @@ -639,6 +654,8 @@ def _construct_execution( 'host_kv_cache_block_offsets'] = host_kv_cache_block_offsets inputs[ 'host_kv_cache_pool_pointers'] = host_kv_cache_pool_pointers + inputs[ + 'host_kv_cache_pool_mapping'] = host_kv_cache_pool_mapping else: inputs['past_key_value'] = past_key_value @@ -663,7 +680,6 @@ def _construct_execution( builder_config = builder.create_builder_config( name=attention_type, precision=dtype, - opt_level=0, int8=int8_trt_flag, quant_mode=quant_mode) @@ -725,24 +741,34 @@ def _construct_execution( dtype=torch_kv_cache_dtype, 
device='cuda') host_kv_cache_pool_pointers = None + host_kv_cache_pool_mapping = None # Init KV cache block manager if paged_kv_cache: - block_size = plugin_kv_num_heads * tokens_per_block * head_size - kv_cache_manager = KVCacheManager( - num_layers=1, + memory_pools_allocator = MemoryPoolsAllocator( num_blocks=num_blocks, - block_size=block_size, tokens_per_block=tokens_per_block, - max_blocks_per_seq=max_blocks_per_seq, + head_size=head_size) + + num_kv_heads_per_layer = MemoryPoolsAllocator.prepare_num_kv_heads_per_layer( + plugin_kv_num_heads, 1) + memory_pools_allocator.allocate(dtype, num_kv_heads_per_layer) + pools_kv_cache_manager = PoolsKVCacheManager( + memory_pools_allocator.pools_metadata, + max_blocks_per_seq, + num_blocks, + tokens_per_block, + head_size, max_attention_window_size=max_seq_len, - sink_token_len=sink_token_len, - beam_width=beam_width) + beam_width=beam_width, + sink_token_len=sink_token_len) + host_kv_cache_pool_pointers = torch.tensor( [present_key_value.data_ptr(), 0], dtype=torch.int64) + host_kv_cache_pool_mapping = memory_pools_allocator.pool_mapping # Add sequences to the kv_cache_manager for bi in range(batch_size): - kv_cache_manager.add_sequence( + pools_kv_cache_manager.add_sequence( GenerationSequence(seq_idx=bi, batch_idx=bi), in_len) weight = torch.randn(shape_dict['weight'], @@ -801,7 +827,7 @@ def _construct_execution( # See input_lengths below. configuration.max_position_embeddings = ( in_len // 2) + out_len - (out_len // 2) - attention = AttentionCls(configuration).cuda().eval() + attention = AttentionCls(configuration, layer_idx=0).cuda().eval() if attention_type == 'gpt2_attention': attention.c_attn.weight = torch.nn.parameter.Parameter( data=weight.clone().detach(), requires_grad=False) @@ -969,29 +995,38 @@ def remove_input_padding(tensor): device='cuda') def get_kv_quant_scale(torch_present): - - torch_kv = torch.cat((torch_present[0], torch_present[1])) - kv_dequant_scale = torch.tensor([torch.max(torch_kv).item() / 127], - dtype=torch.float32, - device='cuda').reshape( - shape_dict['kv_dequant_scale']) - - # fp8 kv cache uses 1.0f scale. - if not use_int8_kv_cache: + if torch_present is None: kv_dequant_scale = torch.tensor( [1.0], dtype=torch.float32, device='cuda').reshape(shape_dict['kv_dequant_scale']) + kv_quant_scale = 1.0 / kv_dequant_scale + else: + torch_kv = torch.cat((torch_present[0], torch_present[1])) + kv_dequant_scale = torch.tensor( + [torch.max(torch_kv).item() / 127], + dtype=torch.float32, + device='cuda').reshape(shape_dict['kv_dequant_scale']) + + # fp8 kv cache uses 1.0f scale. 
+ if not use_int8_kv_cache: + kv_dequant_scale = torch.tensor( + [1.0], dtype=torch.float32, + device='cuda').reshape(shape_dict['kv_dequant_scale']) - kv_quant_scale = 1.0 / kv_dequant_scale + kv_quant_scale = 1.0 / kv_dequant_scale return kv_dequant_scale, kv_quant_scale def verify_kv_cache(torch_present): # If enable streamingllm, kv_cache stores keys and values that with no positional embedding applied - if streamingllm: + if streamingllm or torch_present is None: return if not use_int8_kv_cache and not use_fp8_kv_cache and num_kv_heads == num_heads and beam_width == 1: if paged_kv_cache: + assert pools_kv_cache_manager.has_single_pool( + ) is True, f"Current test assuming only one memory pool" + kv_cache_manager = pools_kv_cache_manager.get_single_kv_cache_manager( + ) kv_cache_cont = kv_cache_manager.blocks_manager.get_continuous_caches( present_key_value) kv_cache_cont = kv_cache_cont.permute(1, 0, 2) @@ -1054,9 +1089,12 @@ def verify_kv_cache(torch_present): kv_cache_block_offsets = None if paged_kv_cache: # Get arrays of pointers to the "pages" of KV values + assert pools_kv_cache_manager.has_single_pool( + ) is True, f"Current test assuming only one memory pool" + kv_cache_manager = pools_kv_cache_manager.get_single_kv_cache_manager( + ) kv_cache_block_offsets = kv_cache_manager.get_block_offsets( beam_width) - if step == 0: host_request_types = torch.tensor([0] * batch_size, dtype=torch.int32) @@ -1181,8 +1219,9 @@ def verify_kv_cache(torch_present): session, output, present_key_value = _construct_execution( session, input_tensor, weight_plugin, bias_plugin, present_key_value, kv_cache_block_offsets, - host_kv_cache_pool_pointers, packed_mask_for_fmha, - sequence_length, host_past_key_value_lengths, + host_kv_cache_pool_pointers, host_kv_cache_pool_mapping, + packed_mask_for_fmha, sequence_length, + host_past_key_value_lengths, host_max_attention_window_sizes, host_sink_token_length, input_lengths, host_context_lengths, cache_indirection, host_request_types, num_heads, hidden_size, num_kv_heads, @@ -1191,7 +1230,6 @@ def verify_kv_cache(torch_present): context_host_runtime_perf_knobs) del session session = None - # Note: Volta has larger errors. # We speculate it’s because Volta’s TC is smaller and more calculations are required, # which may lead to more error accumulation. @@ -1353,7 +1391,8 @@ def tile_beam_width(tensor: torch.Tensor, num_beams: int): session, tiled_output, present_key_value = _construct_execution( session, tiled_input_tensor, weight_plugin, bias_plugin, tiled_present_key_value, kv_cache_block_offsets, - host_kv_cache_pool_pointers, None, tiled_sequence_length, + host_kv_cache_pool_pointers, host_kv_cache_pool_mapping, + None, tiled_sequence_length, tiled_host_past_key_value_lengths, host_max_attention_window_sizes, host_sink_token_length, tiled_input_lengths, tiled_host_context_lengths, @@ -1374,7 +1413,7 @@ def tile_beam_width(tensor: torch.Tensor, num_beams: int): if paged_kv_cache: # Iterate to the next step. 
Increase number of tokens for all unfinished sequences # And allocate new blocks if needed - kv_cache_manager.step([False] * batch_size) + pools_kv_cache_manager.step([False] * batch_size) # assert False, "Force fail" return diff --git a/tests/attention/test_gpt_attention_IFB.py b/tests/attention/test_gpt_attention_IFB.py index 08d327fe3..3f0514c44 100644 --- a/tests/attention/test_gpt_attention_IFB.py +++ b/tests/attention/test_gpt_attention_IFB.py @@ -45,13 +45,18 @@ RotaryScalingType) from tensorrt_llm.plugin.plugin import ContextFMHAType from tensorrt_llm.quantization import QuantMode -from tensorrt_llm.runtime import GenerationSequence, KVCacheManager +from tensorrt_llm.runtime import GenerationSequence sys.path.append(os.path.join(os.path.dirname(__file__), '..')) from utils.util import (skip_bf16_fp32_accum, skip_bf16_pre_ampere, skip_fp8_pre_ada, skip_fp32_accum_pre_ampere, unittest_name_func) +from tensorrt_llm.runtime.memory_pools.memory_pools_allocator import \ + MemoryPoolsAllocator +from tensorrt_llm.runtime.memory_pools.pools_kv_cache_manager import \ + PoolsKVCacheManager + class TestFunctional(unittest.TestCase): @@ -217,6 +222,7 @@ def _construct_execution(session, bias, host_kv_cache_block_offsets, host_kv_cache_pool_pointers, + host_kv_cache_pool_mapping, sequence_length, host_past_key_value_lengths, host_max_attention_window_sizes, @@ -291,8 +297,15 @@ def _construct_execution(session, dtype=tensorrt_llm.str_dtype_to_trt('int32')) host_kv_cache_pool_pointers_tensor = Tensor( name='host_kv_cache_pool_pointers', - shape=(1, ), + shape=( + 1, + 1, + ), dtype=tensorrt_llm.str_dtype_to_trt('int64')) + host_kv_cache_pool_mapping_tensor = Tensor( + name='host_kv_cache_pool_mapping', + shape=(1, ), + dtype=tensorrt_llm.str_dtype_to_trt('int32')) host_runtime_perf_knobs_tensor = Tensor( name='host_runtime_perf_knobs', shape=[16], @@ -419,6 +432,7 @@ def _construct_execution(session, host_kv_cache_block_offsets_tensor, host_kv_cache_pool_pointers= host_kv_cache_pool_pointers_tensor, + host_kv_cache_pool_mapping=host_kv_cache_pool_mapping_tensor, host_context_lengths=host_context_lengths_tensor, qkv_bias=qkv_bias, host_runtime_perf_knobs=host_runtime_perf_knobs_tensor) @@ -443,6 +457,7 @@ def _construct_execution(session, 'kv_cache_block_offsets': kv_cache_block_offsets, 'host_kv_cache_block_offsets': host_kv_cache_block_offsets, 'host_kv_cache_pool_pointers': host_kv_cache_pool_pointers, + 'host_kv_cache_pool_mapping': host_kv_cache_pool_mapping, 'host_runtime_perf_knobs': host_runtime_perf_knobs } if use_int8_kv_cache or use_fp8_kv_cache: @@ -465,7 +480,6 @@ def _construct_execution(session, builder_config = builder.create_builder_config( name=attention_type, precision=dtype, - opt_level=0, fp8=use_fp8_context_fmha, int8=int8_trt_flag) if session is None: @@ -570,6 +584,11 @@ def _construct_execution(session, configuration.max_position_embeddings = ( in_len // 2) + out_len - (out_len // 2) attention = AttentionCls(configuration).cuda().eval() + if isinstance(attention, LlamaAttention): + from transformers.models.llama.modeling_llama import \ + LlamaRotaryEmbedding + attention.rotary_emb = LlamaRotaryEmbedding(config=configuration, + device="cuda") if attention_type == 'gpt2_attention': attention.c_attn.weight = torch.nn.parameter.Parameter( data=weight.clone().detach(), requires_grad=False) @@ -780,18 +799,27 @@ def torch_exec(step: int, torch.cuda.synchronize() return torch_output, torch_present - # Init KV cache block manager - block_size = plugin_kv_num_heads * 
tokens_per_block * head_size - kv_cache_manager = KVCacheManager(num_layers=1, - num_blocks=num_blocks, - block_size=block_size, - tokens_per_block=tokens_per_block, - max_blocks_per_seq=max_blocks_per_seq, - max_attention_window_size=max_seq_len, - sink_token_len=sink_token_len, - beam_width=beam_width) - host_kv_cache_pool_pointers = torch.tensor( - [ordered_key_value.data_ptr(), 0], dtype=torch.int64) + # Init Pools KV cache manager + memory_pools_allocator = MemoryPoolsAllocator( + num_blocks=num_blocks, + tokens_per_block=tokens_per_block, + head_size=head_size) + num_kv_heads_per_layer = MemoryPoolsAllocator.prepare_num_kv_heads_per_layer( + plugin_kv_num_heads, 1) + memory_pools_allocator.allocate(dtype, num_kv_heads_per_layer) + pools_kv_cache_manager = PoolsKVCacheManager( + memory_pools_allocator.pools_metadata, + max_blocks_per_seq, + num_blocks, + tokens_per_block, + head_size, + max_attention_window_size=max_seq_len, + beam_width=beam_width, + sink_token_len=sink_token_len) + + host_kv_cache_pool_pointers = memory_pools_allocator.get_kv_cache_pool_pointers( + ) + host_kv_cache_pool_mapping = memory_pools_allocator.pool_mapping print("pool ptr ", ordered_key_value.data_ptr()) torch_cache_list = [None] * num_req @@ -848,11 +876,15 @@ def torch_exec(step: int, # Add sequence to the manager sequence = GenerationSequence(seq_idx=iteration, batch_idx=iteration) - kv_cache_manager.add_sequence(sequence, in_len_req.clone()) + pools_kv_cache_manager.add_sequence(sequence, + in_len_req.clone()) # Get arrays of pointers to the "pages" of KV values - offset_array = kv_cache_manager.get_block_offsets(beam_width) - dense_offset_array = offset_array[sequence_selection] + offset_array = pools_kv_cache_manager.get_block_offsets(beam_width) + assert offset_array.shape[ + 0] == 1, f"test is suppose to use only one pool. sequence_selection is based on a single pool" + # assume only one pool + dense_offset_array = offset_array[0][sequence_selection] host_input_lengths = np.concatenate(input_length_list) host_input_lengths = torch.tensor(host_input_lengths, @@ -1022,11 +1054,11 @@ def torch_exec(step: int, session, output = _construct_execution( session, input_tensor, weight_plugin, bias_plugin, dense_offset_array, host_kv_cache_pool_pointers, - sequence_lengths, host_past_key_value_lengths, - host_max_attention_window_sizes, host_sink_token_length, - context_lengths, max_context_length, cache_indirection, - num_heads, hidden_size, num_kv_heads, output, dtype, - kv_quant_scale, kv_dequant_scale, host_context_lengths, + host_kv_cache_pool_mapping, sequence_lengths, + host_past_key_value_lengths, host_max_attention_window_sizes, + host_sink_token_length, context_lengths, max_context_length, + cache_indirection, num_heads, hidden_size, num_kv_heads, output, + dtype, kv_quant_scale, kv_dequant_scale, host_context_lengths, host_request_types, generation_host_runtime_perf_knobs, use_fp8_context_fmha, atten_output_quant_scale) @@ -1050,7 +1082,7 @@ def torch_exec(step: int, finished = [False for _ in range(cache_num_req)] # Iterate to the next step. 
Increase number of tokens for all unfinished sequences # And allocate new blocks if needed - kv_cache_manager.step(finished) + pools_kv_cache_manager.step(finished) if __name__ == "__main__": diff --git a/tests/bindings/test_bindings_ut.py b/tests/bindings/test_bindings_ut.py index 3d688faa0..3cb1ba598 100644 --- a/tests/bindings/test_bindings_ut.py +++ b/tests/bindings/test_bindings_ut.py @@ -40,9 +40,10 @@ def test_model_config(): num_heads = 16 hidden_size = 768 data_type = _tb.DataType.FLOAT - model_config = _tb.ModelConfig(vocab_size, num_attention_layers, - num_rnn_layers, num_heads, hidden_size, - data_type) + model_config = _tb.ModelConfig(vocab_size, + num_attention_layers + num_rnn_layers, + num_attention_layers, num_rnn_layers, + num_heads, hidden_size, data_type) assert model_config.vocab_size == vocab_size assert model_config.num_attention_layers() == num_attention_layers assert model_config.num_rnn_layers() == num_rnn_layers @@ -53,10 +54,23 @@ def test_model_config(): assert model_config.vocab_size_padded(1) is not None assert model_config.size_per_head == hidden_size // num_heads - assert model_config.num_kv_heads == num_heads + num_kv_heads_per_layer = model_config.num_kv_heads_per_layer + for layer_idx in range(num_attention_layers): + assert model_config.num_kv_heads(layer_idx) == num_heads + assert num_kv_heads_per_layer[layer_idx] == num_heads + num_kv_heads = 1 - model_config.num_kv_heads = num_kv_heads - assert model_config.num_kv_heads == num_kv_heads + model_config.set_num_kv_heads(num_kv_heads) + num_kv_heads_per_layer = model_config.num_kv_heads_per_layer + for layer_idx in range(num_attention_layers): + assert model_config.num_kv_heads(layer_idx) == num_kv_heads + assert num_kv_heads_per_layer[layer_idx] == num_kv_heads + + num_kv_heads_per_layer[-1] = 2 + model_config.num_kv_heads_per_layer = num_kv_heads_per_layer + for nheads, ref in zip(model_config.num_kv_heads_per_layer, + num_kv_heads_per_layer): + assert nheads == ref assert not model_config.use_gpt_attention_plugin model_config.use_gpt_attention_plugin = True @@ -182,6 +196,7 @@ def check_empty_then_set(member, value): def test_gpt_json_config(): model_config = { "vocab_size": 1000, + "num_layers": 18, # >= attn + rnn "num_attention_layers": 12, "num_rnn_layers": 2, "num_heads": 4, @@ -314,7 +329,7 @@ def test_llm_request(): assert llm_request.max_num_generated_tokens == 2 llm_request.pause(0) - assert llm_request.state == _tb.LlmRequestState.REQUEST_STATE_CONTEXT_INIT + assert llm_request.state == _tb.LlmRequestState.CONTEXT_INIT llm_request.max_sent_token_len = 1 assert llm_request.max_sent_token_len == 1 diff --git a/tests/bindings/test_executor_bindings.py b/tests/bindings/test_executor_bindings.py index 89632e3e9..a2f08f5d0 100644 --- a/tests/bindings/test_executor_bindings.py +++ b/tests/bindings/test_executor_bindings.py @@ -102,13 +102,15 @@ def test_shutdown(model_files, model_path): with pytest.raises(Exception): executor.await_responses() with pytest.raises(Exception): - executor.get_latest_iteration_stats()() + executor.get_latest_iteration_stats() with pytest.raises(Exception): - executor.get_latest_request_stats()() + executor.get_latest_request_stats() with pytest.raises(Exception): - executor.cancel_request(req_id)() + executor.get_latest_debug_tensors() with pytest.raises(Exception): - executor.get_num_responses_ready(req_id)() + executor.cancel_request(req_id) + with pytest.raises(Exception): + executor.get_num_responses_ready(req_id) @skip_pre_ampere # ContextFMHAType with fp32 acc is 
not supported in pre-ampere architecture @@ -215,6 +217,7 @@ def test_single_request(streaming: bool, exclude_input_from_output: bool, executor.get_latest_iteration_stats() executor.get_latest_request_stats() + executor.get_latest_debug_tensors() @skip_pre_ampere # ContextFMHAType with fp32 acc is not supported in pre-ampere architecture @@ -475,13 +478,12 @@ def test_get_num_responses_ready(streaming: bool, @pytest.mark.parametrize("return_context_logits", [False, True]) @pytest.mark.parametrize("return_generation_logits", [False, True]) @skip_pre_ampere # ContextFMHAType with fp32 acc is not supported in pre-ampere architecture -def test_token_comparison(batching_type: trtllm.BatchingType, streaming: bool, - beam_width: int, compute_log_probs: bool, - exclude_input_from_output: bool, - return_context_logits: bool, - return_generation_logits: bool, model_files, - model_path, model_path_return_logits, input_data_path, - results_data_path, results_data_path_beam_width_2): +def test_token_comparison( + batching_type: trtllm.BatchingType, streaming: bool, beam_width: int, + compute_log_probs: bool, exclude_input_from_output: bool, + return_context_logits: bool, return_generation_logits: bool, + model_files, model_path, model_path_return_logits, input_data_path, + results_data_path_fmhafp32acc, results_data_path_beam_width_2): if streaming and beam_width > 1: pytest.skip("Test does not support streaming with beam search") @@ -594,7 +596,7 @@ def verify_output(beam_tokens, test_data, given_input_lengths): executor_config) # Load test data - results_path = results_data_path if beam_width == 1 else results_data_path_beam_width_2 + results_path = results_data_path_fmhafp32acc if beam_width == 1 else results_data_path_beam_width_2 given_input, given_input_lengths, max_input_length, test_data = load_test_data( input_data_path, results_path) @@ -1020,6 +1022,11 @@ def test_scheduler_config(): assert config.capacity_scheduler_policy == capacity_scheduler_policy assert config.context_chunking_policy == None + capacity_scheduler_policy = trtllm.CapacitySchedulerPolicy.STATIC_BATCH + config = trtllm.SchedulerConfig(capacity_scheduler_policy) + assert config.capacity_scheduler_policy == capacity_scheduler_policy + assert config.context_chunking_policy == None + context_chunking_policy = trtllm.ContextChunkingPolicy.FIRST_COME_FIRST_SERVED config = trtllm.SchedulerConfig(capacity_scheduler_policy, context_chunking_policy) @@ -1034,6 +1041,7 @@ def test_kv_cache_config(): assert config.max_attention_window is None assert config.sink_token_length is None assert config.free_gpu_memory_fraction is None + assert config.cross_kv_cache_fraction is None assert config.host_cache_size is None assert config.onboard_blocks == True @@ -1042,6 +1050,7 @@ def test_kv_cache_config(): config.max_attention_window = [2] config.sink_token_length = 3 config.free_gpu_memory_fraction = 0.5 + config.cross_kv_cache_fraction = 0.5 config.host_cache_size = 4 config.onboard_blocks = False assert config.enable_block_reuse == True @@ -1049,6 +1058,7 @@ def test_kv_cache_config(): assert config.max_attention_window == [2] assert config.sink_token_length == 3 assert config.free_gpu_memory_fraction == 0.5 + assert config.cross_kv_cache_fraction == 0.5 assert config.host_cache_size == 4 assert config.onboard_blocks == False @@ -1058,6 +1068,7 @@ def test_kv_cache_config(): "max_attention_window": [10], "sink_token_length": 2, "free_gpu_memory_fraction": 0.5, + "cross_kv_cache_fraction": 0.5, "host_cache_size": 1024, "onboard_blocks": 
False, } @@ -1208,8 +1219,8 @@ def test_executor_config(): "extended_runtime_perf_knob_config": trtllm.ExtendedRuntimePerfKnobConfig(multi_block_mode=True), "debug_config": - trtllm.DebugConfig(dump_input_tensors=True, - dump_output_tensors=True, + trtllm.DebugConfig(debug_input_tensors=True, + debug_output_tensors=True, debug_tensor_names=["test"]), "recv_poll_period_ms": 50, @@ -1455,6 +1466,7 @@ def test_request_stats(): "avgNumDecodedTokensPerIter"] == stats.avg_num_decoded_tokens_per_iter assert stats_json["scheduled"] == stats.scheduled assert stats_json["paused"] == stats.paused + assert stats_json["disServingStats"] is None def test_request_stats_per_iteration(): @@ -1480,12 +1492,14 @@ def test_kv_cache_config_pickle(): config = trtllm.KvCacheConfig() config.enable_block_reuse = True config.free_gpu_memory_fraction = 0.3 + config.cross_kv_cache_fraction = 0.5 config_copy = pickle.loads(pickle.dumps(config)) assert config.enable_block_reuse == config_copy.enable_block_reuse assert config.max_tokens == config_copy.max_tokens assert config.max_attention_window == config_copy.max_attention_window assert config.sink_token_length == config_copy.sink_token_length assert config.free_gpu_memory_fraction == config_copy.free_gpu_memory_fraction + assert config.cross_kv_cache_fraction == config_copy.cross_kv_cache_fraction assert config.host_cache_size == config_copy.host_cache_size assert config.onboard_blocks == config_copy.onboard_blocks @@ -1516,13 +1530,15 @@ def test_decoding_config_pickle(): def test_debug_config_pickle(): - config = trtllm.DebugConfig(dump_input_tensors=True, - dump_output_tensors=True, - debug_tensor_names=["test"]) + config = trtllm.DebugConfig(debug_input_tensors=True, + debug_output_tensors=True, + debug_tensor_names=["test"], + debug_tensors_max_iterations=5) config_copy = pickle.loads(pickle.dumps(config)) - assert config.dump_input_tensors == config_copy.dump_input_tensors - assert config.dump_output_tensors == config_copy.dump_output_tensors + assert config.debug_input_tensors == config_copy.debug_input_tensors + assert config.debug_output_tensors == config_copy.debug_output_tensors assert config.debug_tensor_names == config_copy.debug_tensor_names + assert config.debug_tensors_max_iterations == config_copy.debug_tensors_max_iterations def test_logits_post_processor_config_pickle(): @@ -1573,8 +1589,8 @@ def test_executor_config_pickle(): "extended_runtime_perf_knob_config": trtllm.ExtendedRuntimePerfKnobConfig(multi_block_mode=True), "debug_config": - trtllm.DebugConfig(dump_input_tensors=True, - dump_output_tensors=True, + trtllm.DebugConfig(debug_input_tensors=True, + debug_output_tensors=True, debug_tensor_names=["test"]), "recv_poll_period_ms": 50, @@ -1602,7 +1618,7 @@ def test_executor_config_pickle(): assert config.peft_cache_config.num_host_module_layer == config_copy.peft_cache_config.num_host_module_layer assert config_copy.decoding_config.decoding_mode.isTopKandTopP assert config.extended_runtime_perf_knob_config.multi_block_mode == config_copy.extended_runtime_perf_knob_config.multi_block_mode - assert config.debug_config.dump_input_tensors == config_copy.debug_config.dump_input_tensors + assert config.debug_config.debug_input_tensors == config_copy.debug_config.debug_input_tensors assert config.max_seq_idle_microseconds == config_copy.max_seq_idle_microseconds diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 000000000..1afbe2f35 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,44 @@ +# SPDX-FileCopyrightText: 
Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# # Force resource release after test +import pytest + + +@pytest.hookimpl(wrapper=True) +def pytest_runtest_protocol(item, nextitem): + yield + + import sys + for m in sys.modules: + if m == 'torch' or m.startswith('torch.'): + import gc + import os + + import torch + worker_count = int(os.environ.get('PYTEST_XDIST_WORKER_COUNT', 1)) + + if (torch.cuda.memory_reserved(0) + torch.cuda.memory_allocated(0) + ) >= (torch.cuda.get_device_properties(0).total_memory // + worker_count) * 0.9: + gc.collect() + print("torch.cuda.memory_allocated: %fGB" % + (torch.cuda.memory_allocated(0) / 1024 / 1024 / 1024)) + print("torch.cuda.memory_reserved: %fGB" % + (torch.cuda.memory_reserved(0) / 1024 / 1024 / 1024)) + print("torch.cuda.max_memory_reserved: %fGB" % + (torch.cuda.max_memory_reserved(0) / 1024 / 1024 / 1024)) + + torch.cuda.empty_cache() + break diff --git a/tests/functional/test_moe.py b/tests/functional/test_moe.py index c6819f3f1..deedde4f6 100644 --- a/tests/functional/test_moe.py +++ b/tests/functional/test_moe.py @@ -1022,8 +1022,7 @@ def create_trt_session( network, precision=trt_dtype_to_str(dtype), int8=weight_dtype == trt.int8, - quant_mode=quant_mode, - opt_level=4) + quant_mode=quant_mode) return session def generate_reference(self, inputs, k, actfn, weight_dtype, quant_mode, diff --git a/tests/hlapi/apps/_test_llm_server.py b/tests/hlapi/apps/_test_llm_server.py index bf6b97881..73026a26a 100644 --- a/tests/hlapi/apps/_test_llm_server.py +++ b/tests/hlapi/apps/_test_llm_server.py @@ -7,7 +7,7 @@ sys.path.append( os.path.join(os.path.dirname(__file__), "..", "..", "..", "examples", "apps")) -from fastapi_server import LLM, KvCacheConfig, LlmServer +from fastapi_server import LLM, LlmServer sys.path.append(os.path.join(os.path.dirname(__file__), '..')) from test_llm import llama_model_path @@ -16,9 +16,8 @@ @pytest.fixture(scope="module") def client(): llm = LLM(llama_model_path) - kv_cache_config = KvCacheConfig() - app_instance = LlmServer(llm, kv_cache_config) + app_instance = LlmServer(llm) client = TestClient(app_instance.app) yield client @@ -31,6 +30,11 @@ def test_health(client): assert response.status_code == 200 +def test_health(client): + response = client.get("/health") + assert response.status_code == 200 + + def test_generate(client): response = client.post("/generate", json={"prompt": "A B C"}) assert response.status_code == 200 diff --git a/tests/hlapi/test_llm.py b/tests/hlapi/test_llm.py index 5ebed6314..7750b52dc 100644 --- a/tests/hlapi/test_llm.py +++ b/tests/hlapi/test_llm.py @@ -4,17 +4,18 @@ import sys import tempfile import time -from typing import List, Optional +from typing import List, Optional, Union import pytest import torch -from transformers import AutoTokenizer +import transformers from tensorrt_llm._utils import release_gc +from tensorrt_llm.bindings import executor as tllm from 
tensorrt_llm.executor import (ExecutorBindingsWorker, GenerationRequest, GenerationResult, LoRARequest) from tensorrt_llm.hlapi import (LLM, BuildCacheConfig, KvCacheConfig, - SamplingParams) + RequestError, SamplingParams) from tensorrt_llm.hlapi.llm_utils import BuildConfig, _ParallelConfig from tensorrt_llm.hlapi.tokenizer import TokenizerBase, TransformersTokenizer from tensorrt_llm.hlapi.utils import get_total_gpu_memory @@ -24,7 +25,7 @@ from utils.llm_data import llm_models_root from utils.util import force_ampere, similar, skip_less_than_40gb_memory -from tensorrt_llm.models.llama.model import LLaMAForCausalLM +from tensorrt_llm.models.automodel import AutoConfig, AutoModelForCausalLM # The unittests are based on the tiny-llama, which is fast to build and run. # There are other tests based on llama-7B model, such as the end-to-end tests in test_e2e.py, and parallel tests in @@ -79,20 +80,34 @@ def llm_test_harness(model_dir: str, def llm_check_output(llm: LLM, inputs: List[str], references: List[str], + *, + sampling_params: Optional[SamplingParams] = None, similar_threshold: float = 0.8, - *gen_args, + finish_reasons: Optional[List[str]] = None, + stop_reasons: Optional[List[Union[int, str]]] = None, **gen_kwargs): - outputs = llm.generate(inputs, *gen_args, **gen_kwargs) + outputs = llm.generate(inputs, + sampling_params=sampling_params, + **gen_kwargs) assert len(outputs) == len(references) - for output, target_output in zip(outputs, references): + for i, (output, target_output) in enumerate(zip(outputs, references)): if isinstance(target_output, list): # N output assert len(output.outputs) == len(target_output) - for out, ref in zip(output.outputs, target_output): + for j, (out, ref) in enumerate(zip(output.outputs, target_output)): assert similar(out.text, ref, threshold=similar_threshold) + if finish_reasons is not None: + assert out.finish_reason == finish_reasons[i][j] + if stop_reasons is not None: + assert out.stop_reason == stop_reasons[i][j] else: - assert similar(output.outputs[0].text, target_output) + out = output.outputs[0] + assert similar(out.text, target_output, threshold=similar_threshold) + if finish_reasons is not None: + assert out.finish_reason == finish_reasons[i] + if stop_reasons is not None: + assert out.stop_reason == stop_reasons[i] default_model_name = "llama-models-v2/TinyLlama-1.1B-Chat-v1.0" @@ -100,6 +115,9 @@ def llm_check_output(llm: LLM, llama_model_path = get_model_path(default_model_name) llm_engine_dir = os.environ.get('LLM_ENGINE_DIR', './tmp.engine') + +cnn_dailymail_path = str(llm_models_root() / "datasets" / "cnn_dailymail") + prompts = ["A B C"] global_kvcache_config = KvCacheConfig(free_gpu_memory_fraction=0.4) @@ -110,7 +128,6 @@ def test_llm_build_config(): # change some building parameters build_config.max_batch_size = 129 build_config.max_beam_width = 4 - build_config.builder_opt = 3 build_config.max_num_tokens = 888 build_config.strongly_typed = True build_config.max_seq_len = 333 @@ -131,7 +148,6 @@ def test_llm_build_config(): build_config1.plugin_config.nccl_plugin = 'float16' assert build_config1.max_batch_size == build_config.max_batch_size assert build_config1.max_beam_width == build_config.max_beam_width - assert build_config1.builder_opt == build_config.builder_opt assert build_config1.max_num_tokens == build_config.max_num_tokens assert build_config1.strongly_typed == build_config.strongly_typed assert build_config1.max_seq_len == build_config.max_seq_len @@ -149,16 +165,41 @@ def test_llm_loading_from_hf(): def 
test_llm_loading_from_ckpt(): tokenizer = TransformersTokenizer.from_pretrained(llama_model_path) assert tokenizer is not None - with tempfile.TemporaryDirectory() as ckpt_dir: - llama = LLaMAForCausalLM.from_hugging_face(llama_model_path) - llama.save_checkpoint(ckpt_dir) - del llama - llm_test_harness(ckpt_dir, - prompts, ["D E F G H I J K"], - tokenizer=tokenizer, - kv_cache_config=global_kvcache_config, - sampling_params=SamplingParams(max_tokens=8)) + ckpt_dir = tempfile.TemporaryDirectory() + llama = AutoModelForCausalLM.from_hugging_face(llama_model_path) + llama.save_checkpoint(ckpt_dir.name) + del llama + + llm_test_harness(ckpt_dir.name, + prompts, ["D E F G H I J K"], + tokenizer=tokenizer, + kv_cache_config=global_kvcache_config, + sampling_params=SamplingParams(max_tokens=8)) + + +@pytest.mark.parametrize('model_format', ['hf', 'ckpt']) +def test_llm_with_dummy_weights(model_format): + # dummy_dir contains config.json and tokenizer files only + # the test fails if load_format != 'dummy' + dummy_dir = tempfile.TemporaryDirectory() + if model_format == 'hf': + hf_config = transformers.AutoConfig.from_pretrained(llama_model_path) + hf_config.save_pretrained(dummy_dir.name) + else: + config = AutoConfig.from_hugging_face(llama_model_path, dtype='float16') + config.to_json_file(os.path.join(dummy_dir.name, 'config.json')) + tokenizer = transformers.AutoTokenizer.from_pretrained(llama_model_path) + tokenizer.save_pretrained(dummy_dir.name) + + sampling_params = SamplingParams(max_tokens=8) + llm_test_harness(dummy_dir.name, + prompts, + ["A placeholder reference for dummy-weight engine."], + sampling_params=sampling_params, + similar_threshold=0.0, + load_format='dummy', + kv_cache_config=global_kvcache_config) class MyTokenizer(TokenizerBase): @@ -167,8 +208,8 @@ class MyTokenizer(TokenizerBase): @classmethod def from_pretrained(cls, pretrained_model_dir: str, **kwargs): - tokenizer = AutoTokenizer.from_pretrained(pretrained_model_dir, - **kwargs) + tokenizer = transformers.AutoTokenizer.from_pretrained( + pretrained_model_dir, **kwargs) return MyTokenizer(tokenizer) def __init__(self, tokenizer): @@ -326,6 +367,9 @@ async def main(): test_future_async() test_non_streaming_usage_wait() + del llm + release_gc() + @pytest.fixture(scope="module") def llm_for_sampling_params() -> LLM: @@ -475,27 +519,55 @@ def test_generate_with_stop_words(): llm_check_output(llm, prompts, ["D E F G H I J K L M"], - sampling_params=SamplingParams(stop_token_ids=[stop_id])) + sampling_params=SamplingParams(end_id=stop_id), + finish_reasons=['stop'], + stop_reasons=[None]) + + llm_check_output(llm, + prompts, ["D E F G H"], + sampling_params=SamplingParams(max_tokens=5), + finish_reasons=['length'], + stop_reasons=[None]) + + llm_check_output(llm, + prompts, ["D E F G H I J K L M"], + sampling_params=SamplingParams(stop_token_ids=[stop_id]), + finish_reasons=['stop'], + stop_reasons=[stop_id]) llm_check_output(llm, prompts, ["D E F G H I J K L M N"], sampling_params=SamplingParams( stop_token_ids=[stop_id], - include_stop_str_in_output=True)) + include_stop_str_in_output=True), + finish_reasons=['stop'], + stop_reasons=[stop_id]) llm_check_output(llm, prompts, ["D E F G H"], - sampling_params=SamplingParams(stop="I J")) + sampling_params=SamplingParams(stop="I J"), + finish_reasons=['stop'], + stop_reasons=["I J"]) + + llm_check_output(llm, + prompts, ["D E F G H I J K L M"], + sampling_params=SamplingParams(stop="I E", max_tokens=10), + finish_reasons=['length'], + stop_reasons=[None]) 
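# --- Editorial aside: illustrative sketch, not part of the patch ---------------
# The assertions above encode the hlapi stop-criteria semantics: a matched stop
# string, stop token id, or end_id finishes with finish_reason 'stop' (stop_reason
# carries the matched string/id, or None for end_id), while exhausting max_tokens
# finishes with finish_reason 'length' and stop_reason None. The snippet reuses
# the `llm` and `prompts` objects defined in this test.
out = llm.generate(prompts, sampling_params=SamplingParams(stop="I J"))[0].outputs[0]
assert out.finish_reason == 'stop' and out.stop_reason == "I J"
out = llm.generate(prompts, sampling_params=SamplingParams(max_tokens=5))[0].outputs[0]
assert out.finish_reason == 'length' and out.stop_reason is None
# --------------------------------------------------------------------------------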
llm_check_output(llm, prompts, ["D E F G H I J"], sampling_params=SamplingParams( - stop="I J", include_stop_str_in_output=True) + stop="I J", include_stop_str_in_output=True), + finish_reasons=['stop'], + stop_reasons=["I J"]) llm_check_output(llm, prompts, ["D E F G H"], sampling_params=SamplingParams(stop=["F E", "I J"], - stop_token_ids=[stop_id])) + stop_token_ids=[stop_id]), + finish_reasons=['stop'], + stop_reasons=["I J"]) @force_ampere @@ -524,8 +596,7 @@ def test_generate_with_bad_words(): @force_ampere def test_generate_with_embedding_bias(): - - tokenizer = AutoTokenizer.from_pretrained(llama_model_path) + tokenizer = transformers.AutoTokenizer.from_pretrained(llama_model_path) biased_word_id = tokenizer.encode("Z", add_special_tokens=False)[-1] vocab_size_padded = 32000 embedding_bias = torch.zeros(vocab_size_padded) @@ -568,6 +639,7 @@ def llama_v2_13b_lora_test_harness(**llm_kwargs): hf_model_dir = get_model_path("llama-models-v2/llama-v2-13b-hf") hf_lora_dir = get_model_path("llama-models-v2/chinese-llama-2-lora-13b") + # For LoRA checkpoints with finetuned embedding and lm_head, lora_dir must be provided at build time. build_config = BuildConfig(lora_config=LoraConfig(lora_dir=[hf_lora_dir])) llm = LLM(hf_model_dir, tokenizer=hf_lora_dir, @@ -598,8 +670,10 @@ def llama_7b_multi_lora_test_harness(**llm_kwargs): hf_lora_dir1 = get_model_path("llama-models/luotuo-lora-7b-0.1") hf_lora_dir2 = get_model_path("llama-models/Japanese-Alpaca-LoRA-7b-v0") + # For LoRA checkpoints without finetuned embedding and lm_head, we can either: + # (1) specify lora_target_modules, or + # (2) provide a lora_dir to infer the lora_target_modules. build_config = BuildConfig(lora_config=LoraConfig( - lora_dir=[hf_lora_dir1, hf_lora_dir2], lora_target_modules=['attn_q', 'attn_k', 'attn_v'])) llm = LLM(hf_model_dir, enable_lora=True, @@ -734,7 +808,7 @@ def second_run(): prompts, ["D E F G H I J K"], sampling_params=sampling_params) - # the cache should be hitted + # the cache should be hit assert llm.llm_build_stats.cache_hitted, llm.llm_build_stats.cache_info del llm release_gc() @@ -914,4 +988,51 @@ def test_llm_return_generation_logits(): check_llm_return_generation_logits(tp_size=1) +class DummyExecutorWorker3(ExecutorBindingsWorker): + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + self.counter = 0 + + def _engine_response_callback(self, response: tllm.Response): + return tllm.Response(request_id=response.request_id, + error_msg="Test error") + + +DummyExecutor3 = DummyExecutorMeta("DummyExecutor3", (), {}, + worker_cls=DummyExecutorWorker3) + + +def test_llm_handling_per_request_error(): + llm = LLM(model=llama_model_path, + executor_cls=DummyExecutor3, + kv_cache_config=global_kvcache_config) + # The dummy executor will delay the responses + sampling_params = SamplingParams(max_tokens=6) + + # test in streaming mode + async def task(): + with pytest.raises(RequestError): + # 10 requests, each request will get an error, while the whole LLM instance is still alive + for i in range(10): + async for output in llm.generate_async( + prompts[0], streaming=True, + sampling_params=sampling_params): + print(output) + + asyncio.run(task()) + + def batch_task(): + with pytest.raises(RequestError): + for output in llm.generate(prompts, + sampling_params=sampling_params): + print(output) + + batch_task() + + # TODO[chunweiy]: Add test for loading in-memory model + +if __name__ == '__main__': + test_llm_handling_per_request_error() diff --git a/tests/hlapi/test_llm_models.py
b/tests/hlapi/test_llm_models.py index 3097f2a9e..22f6b256f 100644 --- a/tests/hlapi/test_llm_models.py +++ b/tests/hlapi/test_llm_models.py @@ -3,12 +3,12 @@ import pytest from tensorrt_llm import BuildConfig, SamplingParams -from tensorrt_llm.hlapi import QuantAlgo, QuantConfig +from tensorrt_llm.hlapi import CalibConfig, QuantAlgo, QuantConfig try: - from .test_llm import get_model_path, llm_test_harness + from .test_llm import cnn_dailymail_path, get_model_path, llm_test_harness except ImportError: - from test_llm import get_model_path, llm_test_harness + from test_llm import get_model_path, llm_test_harness, cnn_dailymail_path import os import sys @@ -36,7 +36,8 @@ qwen_model_path = get_model_path('Qwen-1_8B-Chat') qwen1_5_model_path = get_model_path('Qwen1.5-0.5B-Chat') qwen2_model_path = get_model_path('Qwen2-7B-Instruct') - +mamba2_370m_model_path = get_model_path('mamba2/mamba2-370m') +gpt_neox_20b_model_path = get_model_path('gpt-neox-20b') sampling_params = SamplingParams(max_tokens=10) @@ -51,11 +52,13 @@ def test_llm_gptj(): @force_ampere def test_llm_gptj_int4_weight_only(): quant_config = QuantConfig(quant_algo=QuantAlgo.W4A16) + calib_config = CalibConfig(calib_dataset=cnn_dailymail_path) llm_test_harness(gptj_model_path, inputs=["A B C"], references=["D E F G H I J K L M"], sampling_params=sampling_params, - quant_config=quant_config) + quant_config=quant_config, + calib_config=calib_config) @force_ampere @@ -71,32 +74,38 @@ def test_llm_gpt2_sq(): quant_config = QuantConfig( quant_algo=QuantAlgo.W8A8_SQ_PER_CHANNEL_PER_TOKEN_PLUGIN, kv_cache_quant_algo=QuantAlgo.INT8) + calib_config = CalibConfig(calib_dataset=cnn_dailymail_path) llm_test_harness(gpt2_model_path, inputs=["A B C"], references=["D E F G H I J K L M"], sampling_params=sampling_params, - quant_config=quant_config) + quant_config=quant_config, + calib_config=calib_config) @force_ampere def test_llm_gpt2_int8_weight_only(): quant_config = QuantConfig(quant_algo=QuantAlgo.W8A16, kv_cache_quant_algo=QuantAlgo.INT8) + calib_config = CalibConfig(calib_dataset=cnn_dailymail_path) llm_test_harness(gpt2_model_path, inputs=["A B C"], references=["D E F G H I J K L M"], sampling_params=sampling_params, - quant_config=quant_config) + quant_config=quant_config, + calib_config=calib_config) @skip_pre_hopper def test_llm_gpt2_fp8(): quant_config = QuantConfig(quant_algo=QuantAlgo.FP8) + calib_config = CalibConfig(calib_dataset=cnn_dailymail_path) llm_test_harness(gpt2_model_path, inputs=["A B C"], references=["D E F G H I J K L M"], sampling_params=sampling_params, - quant_config=quant_config) + quant_config=quant_config, + calib_config=calib_config) @force_ampere @@ -110,11 +119,13 @@ def test_llm_starcoder2(): @skip_pre_hopper def test_llm_starcoder2_fp8(): quant_config = QuantConfig(quant_algo=QuantAlgo.FP8) + calib_config = CalibConfig(calib_dataset=cnn_dailymail_path) llm_test_harness(starcoder2_model_path, inputs=["def print_hello_world():"], references=['\n print("Hello World")\n\ndef print'], sampling_params=sampling_params, - quant_config=quant_config) + quant_config=quant_config, + calib_config=calib_config) def test_llm_phi_1_5(): @@ -173,12 +184,14 @@ def test_llm_falcon(): @force_ampere def test_llm_falcon_int4_weight_only(): quant_config = QuantConfig(quant_algo=QuantAlgo.W4A16) + calib_config = CalibConfig(calib_dataset=cnn_dailymail_path) llm_test_harness(falcon_model_path, inputs=['A B C'], references=['D E F G H I J K L M'], sampling_params=sampling_params, quant_config=quant_config, - 
build_config=BuildConfig(strongly_typed=False)) + build_config=BuildConfig(strongly_typed=False), + calib_config=calib_config) @force_ampere @@ -192,11 +205,13 @@ def test_llm_gemma_2b(): @pytest.mark.skip(reason="https://nvbugspro.nvidia.com/bug/4575937") def test_llm_gemma_2b_int4weight_only(): quant_config = QuantConfig(quant_algo=QuantAlgo.W4A16) + calib_config = CalibConfig(calib_dataset=cnn_dailymail_path) llm_test_harness(gemma_2b_model_path, inputs=['A B C'], references=['D E F G H I J K L M'], sampling_params=sampling_params, - quant_config=quant_config) + quant_config=quant_config, + calib_config=calib_config) @force_ampere @@ -207,6 +222,10 @@ def test_llm_gemma_2_9b_it(): sampling_params=sampling_params) +@pytest.mark.skip( + reason= + "Require further transformers update https://github.com/THUDM/ChatGLM3/issues/1324" +) def test_llm_glm(): print('test GLM....') llm_test_harness(glm_model_path, @@ -257,11 +276,13 @@ def test_llm_baichuan2_13b(): @force_ampere def test_llm_baichuan2_7b_int4weight_only(): quant_config = QuantConfig(quant_algo=QuantAlgo.W4A16) + calib_config = CalibConfig(calib_dataset=cnn_dailymail_path) llm_test_harness(baichuan2_7b_model_path, inputs=['A B C'], references=['D E F G H I J K L M'], sampling_params=sampling_params, quant_config=quant_config, + calib_config=calib_config, trust_remote_code=True) @@ -300,22 +321,39 @@ def test_llm_qwen2(): @skip_pre_ampere def test_llm_qwen2_int4_weight_only(): quant_config = QuantConfig(quant_algo=QuantAlgo.W4A16) + calib_config = CalibConfig(calib_dataset=cnn_dailymail_path) llm_test_harness(qwen2_model_path, inputs=['A B C'], references=['D E F G H I J K L M'], sampling_params=sampling_params, quant_config=quant_config, + calib_config=calib_config, trust_remote_code=True) @skip_pre_hopper def test_llm_qwen2_fp8(): quant_config = QuantConfig(quant_algo=QuantAlgo.FP8) + calib_config = CalibConfig(calib_dataset=cnn_dailymail_path) llm_test_harness(qwen2_model_path, inputs=['A B C'], references=['D E F G H I J K L M'], sampling_params=sampling_params, quant_config=quant_config, + calib_config=calib_config, + trust_remote_code=True) + + +@skip_pre_ampere +def test_llm_mamba2_370m(): + build_config = BuildConfig() + build_config.plugin_config._paged_kv_cache = False + llm_test_harness(mamba2_370m_model_path, + inputs=['A B C'], + references=['D E F G H I J K L M'], + sampling_params=sampling_params, + tokenizer=gpt_neox_20b_model_path, + build_config=build_config, trust_remote_code=True) diff --git a/tests/hlapi/test_llm_multi_gpu.py b/tests/hlapi/test_llm_multi_gpu.py index 782a8037b..9f3aaff47 100644 --- a/tests/hlapi/test_llm_multi_gpu.py +++ b/tests/hlapi/test_llm_multi_gpu.py @@ -9,6 +9,7 @@ import torch from parameterized import parameterized +from tensorrt_llm._utils import release_gc from tensorrt_llm.executor import (ExecutorBindingsProxy, GenerationRequest, GenerationResult) from tensorrt_llm.hlapi import LLM, KvCacheConfig, SamplingParams @@ -110,8 +111,8 @@ def test_llm_return_generation_logits_tp2(): ids=["from_ckpt", "from_hf"]) @skip_single_gpu def test_llm_generate_async_tp2( - engine_from_checkpoint: tempfile.TemporaryDirectory, - use_auto_parallel: bool, from_ckpt: bool): + engine_from_checkpoint: tempfile.TemporaryDirectory, from_ckpt: bool, + use_auto_parallel: bool): if use_auto_parallel and from_ckpt: pytest.skip("Skip auto parallel for TP2 checkpoint") model_dir = engine_from_checkpoint.name if from_ckpt else get_model_path( @@ -264,7 +265,7 @@ def submit(self, request: GenerationRequest) 
-> GenerationResult: self.request_queue.put(request) - req_id = self.request_id_queue.get() + req_id = self.rid_or_err_queue.get() request.set_id(req_id) result = GenerationResult( @@ -354,6 +355,8 @@ def __init__( def dispatch_result_task(self) -> bool: self.counter += 1 + # This will raise an error in dispatch_result_thread in the main process; it will be captured by ManagedThread and + # redirected to the error_queue if self.counter == 2: raise DummyError("Test error") @@ -364,6 +367,29 @@ def dispatch_result_task(self) -> bool: proxy_class=DummyExecutorProxy2) +# TODO[chunweiy]: This test is not stable, need to investigate +def test_executor_handle_background_error_in_dispatch_result_thread(): + llm = LLM(model=llama_model_path, + executor_cls=DummyExecutor2, + kv_cache_config=global_kv_cache_config) + # The dummy executor will delay the responses + sampling_params = SamplingParams(max_tokens=6) + + # test in streaming mode + async def task(): + with pytest.raises(DummyError): + with llm: + async for output in llm.generate_async( + prompts[0], streaming=True, + sampling_params=sampling_params): + print(output) + + asyncio.run(task()) + + del llm + release_gc() + + class DummyExecutorProxy3(ExecutorBindingsProxy): ''' This is for testing the error occur in a Worker process in the Proxy. ''' @@ -373,10 +399,13 @@ def __init__( model_world_size: int = 1, mpi_session=None, ) -> None: - super().__init__(workers_kwargs, - model_world_size, - mpi_session, - worker_cls=DummyExecutorWorker2) + super().__init__( + workers_kwargs, + model_world_size, + mpi_session, + # The worker process will raise an error, which is captured by the mpi4py done handler and redirected to + # the global error queue. + worker_cls=DummyExecutorWorker2) DummyExecutor3 = DummyExecutorMeta("DummyExecutor3", (), {}, @@ -384,45 +413,31 @@ def __init__( # TODO[chunweiy]: This test is not stable, need to investigate
+def test_executor_handle_background_error_in_worker_process(): llm = LLM(model=llama_model_path, - executor_cls=DummyExecutor2, + executor_cls=DummyExecutor3, kv_cache_config=global_kv_cache_config) # The dummy executor will delay the responses sampling_params = SamplingParams(max_tokens=6) # test in streaming mode async def task(): + nonlocal llm with pytest.raises(DummyError): - async for output in llm.generate_async( - prompts[0], streaming=True, - sampling_params=sampling_params): - print(output) + with llm: + async for output in llm.generate_async( + prompts[0], streaming=True, + sampling_params=sampling_params): + print(output) asyncio.run(task()) - -def test_executor_handle_background_error_in_worker(): - llm = LLM(model=llama_model_path, - executor_cls=DummyExecutor2, - kv_cache_config=global_kv_cache_config) - # The dummy executor will delay the responses - sampling_params = SamplingParams(max_tokens=6) - - # test in streaming mode - async def task(): - with pytest.raises(DummyError): - async for output in llm.generate_async( - prompts[0], streaming=True, - sampling_params=sampling_params): - print(output) - - asyncio.run(task()) + del llm + release_gc() if __name__ == '__main__': - #test_llama_v2_13b_lora_tp2() - #test_llm_end2end_tp2({'embedding_parallel_mode': 'NONE'}) - test_llm_return_context_logits_tp2() - test_llm_return_generation_logits_tp2() + test_executor_handle_background_error_in_dispatch_result_thread() + test_executor_handle_background_error_in_worker_process() diff --git a/tests/hlapi/test_llm_quant.py b/tests/hlapi/test_llm_quant.py index 33627cdf0..d7edb3eef 100644 --- a/tests/hlapi/test_llm_quant.py +++ b/tests/hlapi/test_llm_quant.py @@ -2,24 +2,26 @@ import sys from tensorrt_llm.hlapi.llm import LLM, SamplingParams -from tensorrt_llm.hlapi.llm_utils import QuantAlgo, QuantConfig +from tensorrt_llm.hlapi.llm_utils import CalibConfig, QuantAlgo, QuantConfig sys.path.append(os.path.join(os.path.dirname(__file__), '..')) from utils.util import skip_pre_ampere, skip_pre_hopper try: - from .test_llm import llama_model_path + from .test_llm import cnn_dailymail_path, llama_model_path except ImportError: - from test_llm import llama_model_path + from test_llm import cnn_dailymail_path, llama_model_path @skip_pre_ampere def test_llm_int4_awq_quantization(): - quant_config = QuantConfig() - quant_config.quant_algo = QuantAlgo.W4A16_AWQ + quant_config = QuantConfig(quant_algo=QuantAlgo.W4A16_AWQ) assert quant_config.quant_mode.has_any_quant() + calib_config = CalibConfig(calib_dataset=cnn_dailymail_path) - llm = LLM(llama_model_path, quant_config=quant_config) + llm = LLM(llama_model_path, + quant_config=quant_config, + calib_config=calib_config) sampling_params = SamplingParams(max_tokens=6) for output in llm.generate(["A B C"], sampling_params=sampling_params): @@ -29,13 +31,14 @@ def test_llm_int4_awq_quantization(): @skip_pre_hopper def test_llm_fp8_quantization(): - quant_config = QuantConfig() - quant_config.quant_algo = QuantAlgo.FP8 - quant_config.kv_cache_quant_algo = QuantAlgo.FP8 - + quant_config = QuantConfig(quant_algo=QuantAlgo.FP8, + kv_cache_quant_algo=QuantAlgo.FP8) assert quant_config.quant_mode.has_any_quant() + calib_config = CalibConfig(calib_dataset=cnn_dailymail_path) - llm = LLM(llama_model_path, quant_config=quant_config) + llm = LLM(llama_model_path, + quant_config=quant_config, + calib_config=calib_config) sampling_params = SamplingParams(max_tokens=6) for output in llm.generate(["A B C"], sampling_params=sampling_params): print(output) diff 
--git a/tests/hlapi/test_llm_utils.py b/tests/hlapi/test_llm_utils.py index 8bc460725..733b424f4 100644 --- a/tests/hlapi/test_llm_utils.py +++ b/tests/hlapi/test_llm_utils.py @@ -13,7 +13,7 @@ def test_ConfigArbitrator_basic(): - # the performance and functionality have conflict plugins config, keep the functionalies and disable the performance's + # the performance and functionality have conflict plugins config, keep the functionalities and disable the performance's arb = _ConfigArbitrator() arb.claim_perf("chunked_context", config_name="plugin_config", diff --git a/tests/model/test_decilm.py b/tests/model/test_decilm.py deleted file mode 100644 index 083db6671..000000000 --- a/tests/model/test_decilm.py +++ /dev/null @@ -1,602 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import itertools -import os -import sys -import tempfile -import unittest -from pathlib import Path -from typing import Any, Dict, List, Optional, Tuple, Union - -import numpy as np -import tensorrt as trt -import torch -import transformers -from parameterized import parameterized - -import tensorrt_llm -from tensorrt_llm import logger -from tensorrt_llm._utils import str_dtype_to_torch -from tensorrt_llm.builder import Builder -from tensorrt_llm.mapping import Mapping -from tensorrt_llm.models.deci.config import DeciConfig, DeciLayerConfig -from tensorrt_llm.models.deci.convert import _ffn_mult_to_intermediate_size -from tensorrt_llm.models.deci.layer_config import (AttentionImplementation, - FFNImplementation) -from tensorrt_llm.models.deci.model import DeciLMForCausalLM -from tensorrt_llm.network import Network, net_guard -from tensorrt_llm.plugin.plugin import ContextFMHAType -from tensorrt_llm.runtime.generation import _Runtime - -sys.path.append(os.path.join(os.path.dirname(__file__), '..')) -from utils.llm_data import llm_models_root -from utils.util import unittest_name_func - - -class TestDeciLM(unittest.TestCase): - - def _make_decilm_config(self, - layer_configs: List[Union[DeciLayerConfig, - Dict[str, Dict[str, - Any]]]], - dtype: str = 'bfloat16', - num_attention_heads: int = 32, - num_key_value_heads: Optional[int] = None, - hidden_size: int = 4096, - intermediate_size: int = 16384, - vocab_size: int = 32128, - max_positions_embedding: int = 1024, - norm_epsilon: float = 1e-05) -> DeciConfig: - config = { - 'architecture': 'DeciLMForCausalLM', - 'num_hidden_layers': len(layer_configs), - 'num_attention_heads': num_attention_heads, - 'num_key_value_heads': num_key_value_heads, - 'dtype': dtype, - 'logits_dtype': dtype, - 'hidden_size': hidden_size, - 'intermediate_size': intermediate_size, - 'vocab_size': vocab_size, - 'position_embedding_type': 'rope_gpt_neox', - 'max_position_embeddings': max_positions_embedding, - 'hidden_act': 'silu', - 'norm_epsilon': norm_epsilon, - 'layer_configs': layer_configs - } - - config = DeciConfig.from_dict(config) - return 
config - - def _gen_tensorrt_llm_network(self, network: Network, - decilm: DeciLMForCausalLM, batch_size: int, - beam_width: int, input_len: int, - output_len: int, rank: int, - tensor_parallel: int, **opt_flags): - list(range(tensor_parallel)) - - with net_guard(network): - # optimize_model(decilm, **opt_flags) - # Prepare - network.set_named_parameters(decilm.named_parameters()) - inputs = decilm.prepare_inputs(max_batch_size=batch_size, - max_input_len=input_len, - max_seq_len=input_len + output_len, - max_num_tokens=batch_size * - input_len, - use_cache=True, - max_beam_width=beam_width) - # Forward - decilm(**inputs) - return network - - def _gen_tensorrt_llm_engine( - self, - rank: int, - world_size: int, - decilm: DeciLMForCausalLM, - model_name: str, - use_plugin: bool, - batch_size: int, - beam_width: int, - input_len: int, - output_len: int, - use_refit: bool, - use_gemm: bool = False, - context_fmha_flag: ContextFMHAType = ContextFMHAType.disabled, - enable_remove_input_padding: bool = False, - **opt_flags) -> trt.IHostMemory: - - builder = Builder() - dtype = decilm.config.dtype - - with tempfile.TemporaryDirectory(): - builder_config = builder.create_builder_config( - name=model_name, - precision=dtype, - timing_cache='model.cache', - tensor_parallel=world_size, # TP only - use_refit=use_refit, - strongly_typed=True, - ) - network = builder.create_network() - network.plugin_config.to_legacy_setting() - if use_plugin: - network.plugin_config.gpt_attention_plugin = dtype - if use_gemm: - network.plugin_config.gemm_plugin = dtype - if enable_remove_input_padding: - network.plugin_config.remove_input_padding = True - network.plugin_config.set_context_fmha(context_fmha_flag) - - self._gen_tensorrt_llm_network(network=network, - decilm=decilm, - batch_size=batch_size, - beam_width=beam_width, - input_len=input_len, - output_len=output_len, - rank=rank, - tensor_parallel=world_size, - **opt_flags) - engine_buffer = builder.build_engine(network, builder_config) - return engine_buffer - - def _gen_tensorrt_llm_runtime( - self, - log_level: str, - world_size: int, - rank: int, - decilm: DeciLMForCausalLM, - model_name: str, - use_plugin: bool, - batch_size: int, - beam_width: int, - input_len: int, - output_len: int, - use_refit: bool, - use_gemm: bool = False, - context_fmha_flag: ContextFMHAType = ContextFMHAType.disabled, - enable_remove_input_padding: bool = False, - **opt_flags) -> Tuple[_Runtime, trt.IHostMemory]: - logger.set_level(log_level) - mapping = Mapping(world_size, rank, tp_size=world_size) - engine_buffer = self._gen_tensorrt_llm_engine( - rank=rank, - world_size=world_size, - decilm=decilm, - model_name=model_name, - use_plugin=use_plugin, - batch_size=batch_size, - beam_width=beam_width, - input_len=input_len, - output_len=output_len, - use_refit=use_refit, - use_gemm=use_gemm, - context_fmha_flag=context_fmha_flag, - enable_remove_input_padding=enable_remove_input_padding, - **opt_flags) - runtime = _Runtime(engine_buffer, mapping) - return runtime, engine_buffer - - def test_config_to_from_dict(self) -> None: - config = self._make_decilm_config(layer_configs=[{ - "attention": { - "num_key_value_heads": 4 - }, - "ffn": {} - }, { - "attention": { - "num_key_value_heads": 2 - }, - "ffn": { - "impl": "no_op" - } - }, { - "attention": { - "impl": "no_op" - }, - "ffn": { - "intermediate_size": 8192 - } - }]) - - config2 = DeciConfig.from_dict(config.to_dict()) - self.assertListEqual(config.layer_configs, config2.layer_configs) - - def test_save_load_config(self) -> None: 
- config = self._make_decilm_config(layer_configs=[{ - "attention": { - "num_key_value_heads": 4 - }, - "ffn": {} - }, { - "attention": { - "num_key_value_heads": 2 - }, - "ffn": { - "impl": "no_op" - } - }, { - "attention": { - "impl": "no_op" - }, - "ffn": { - "intermediate_size": 8192 - } - }]) - - with tempfile.TemporaryDirectory( - prefix="test_save_load_checkpoint") as ckpt_dir: - config_file = f"{ckpt_dir}/config.json" - config.to_json_file(config_file) - config2 = DeciConfig.from_json_file(config_file) - - self.assertDictEqual(config.to_dict(), config2.to_dict()) - self.assertListEqual(config.layer_configs, config2.layer_configs) - - def get_loader_test_cases(): - model_root = llm_models_root(check=True) - test_models_base_path = Path(model_root, "nvsmall/tests") - - models_path = [ - os.path.join(test_models_base_path, x) - for x in os.listdir(test_models_base_path) - ] - test_cases = list( - itertools.product(models_path, ["bfloat16", "float16"])) - - return test_cases - - @parameterized.expand(get_loader_test_cases, name_func=unittest_name_func) - def test_allclose_to_hf(self, hf_model_dir, dtype): - if hf_model_dir is None: - self.skipTest( - f"Missing nvsmall checkpoint, define a valid checkpoint path with the NVSMALL_CKPT environment variable" - ) - - dtype = tensorrt_llm._utils.str_dtype_to_torch(dtype) - - hf_model = transformers.AutoModelForCausalLM.from_pretrained( - hf_model_dir, trust_remote_code=True, torch_dtype=dtype).cuda() - decilm = DeciLMForCausalLM.from_hugging_face(hf_model) - config = decilm.config - - log_level = "warning" - batch_size = 1 - beam_width = 1 - input_len = 4 - output_len = 2 - max_seq_len = input_len + output_len - dtype = config.dtype - enable_remove_input_padding = False - use_gpt_plugin = True - use_gemm = True - - runtime, engine_buffer = self._gen_tensorrt_llm_runtime( - log_level=log_level, - decilm=decilm, - batch_size=batch_size, - beam_width=beam_width, - input_len=input_len, - output_len=output_len, - rank=0, - world_size=1, - model_name="decilm", - use_gemm=use_gemm, - use_plugin=use_gpt_plugin, - use_refit=False) - - key_value_cache_buffers = [] - head_size = config.hidden_size // config.num_attention_heads - - attn_layer_idx = [ - i for i in range(config.num_hidden_layers) - if config.get_layer_config(i).attention.needs_kv_cache - ] - for layer_idx in attn_layer_idx: - layer_config = config.get_layer_config(layer_idx) - new_cache = torch.zeros(( - batch_size, - 2, - layer_config.attention.num_key_value_heads, - max_seq_len, - head_size, - ), - dtype=str_dtype_to_torch(dtype), - device='cuda') - key_value_cache_buffers.append(new_cache) - - # compare context - ctx_ids = torch.randint(100, (batch_size, input_len)).int().cuda() - ctx_context_lengths = input_len * torch.ones( - (batch_size), dtype=torch.int32, device='cuda') - ctx_position_ids = torch.tensor(range(input_len), - dtype=torch.int32).reshape([ - 1, input_len - ]).expand([batch_size, - input_len]).cuda() - ctx_last_token_ids = ctx_context_lengths.clone() - ctx_host_request_types = torch.tensor([0] * batch_size, - dtype=torch.int32) - - # We need sequence_lengths start as context_lengths for step 0, - # and it will be added one after each step. 
- sequence_length_buffer = ctx_context_lengths.detach().clone() - - with torch.no_grad(): - hf_outputs = hf_model.forward(ctx_ids, - output_hidden_states=True, - output_attentions=True) - - torch.cuda.synchronize() - ref = hf_outputs.logits[:, -1, :] - - if enable_remove_input_padding: - ctx_ids = ctx_ids.view([batch_size * input_len]) - ctx_position_ids = ctx_position_ids.view([batch_size * input_len]) - ctx_last_token_ids = torch.cumsum(ctx_last_token_ids, dim=0).int() - - cache_indirections = [ - torch.full(( - batch_size, - beam_width, - max_seq_len, - ), - 0, - dtype=torch.int32, - device='cuda'), - torch.full(( - batch_size, - beam_width, - max_seq_len, - ), - 0, - dtype=torch.int32, - device='cuda') - ] # ping-pong buffers - - perf_knob_tensor_size = 16 - # runtime_perf_knobs is not used in context phase - context_runtime_perf_knobs = torch.tensor([-1] * perf_knob_tensor_size, - dtype=torch.int64) - - ctx_buffer = { - 'input_ids': ctx_ids, - 'context_lengths': ctx_context_lengths, - 'position_ids': ctx_position_ids, - 'last_token_ids': ctx_last_token_ids, - 'cache_indirection': cache_indirections[0], - 'host_request_types': ctx_host_request_types, - 'host_runtime_perf_knobs': context_runtime_perf_knobs, - } - if enable_remove_input_padding: - ctx_buffer['host_context_lengths'] = ctx_context_lengths.cpu() - - ctx_shape = {k: v.shape for k, v in ctx_buffer.items()} - - ctx_buffer[f'host_max_attention_window_sizes'] = torch.tensor( - [max_seq_len] * len(attn_layer_idx), dtype=torch.int32) - ctx_shape[f'host_max_attention_window_sizes'] = (len(attn_layer_idx), ) - for layer_idx, buf in zip(attn_layer_idx, key_value_cache_buffers): - layer_config = config.get_layer_config(layer_idx) - kv_shape = (batch_size, 2, - layer_config.attention.num_key_value_heads, max_seq_len, - head_size) - ctx_shape[f'past_key_value_{layer_idx}'] = kv_shape - ctx_buffer[f'past_key_value_{layer_idx}'] = buf - ctx_buffer[f'present_key_value_{layer_idx}'] = buf - - ctx_buffer['sequence_length'] = sequence_length_buffer - ctx_shape['sequence_length'] = ctx_buffer['sequence_length'].shape - ctx_shape['host_past_key_value_lengths'] = (batch_size, ) - ctx_buffer['host_past_key_value_lengths'] = torch.tensor( - [0] * batch_size, dtype=torch.int32) - ctx_shape['host_sink_token_length'] = (1, ) - ctx_buffer['host_sink_token_length'] = torch.tensor([0], - dtype=torch.int32) - - context = runtime.ctx_context - runtime._set_shape(context, ctx_shape) - runtime._set_buffer(context, ctx_buffer) - runtime._run(context) - torch.cuda.synchronize() - - res = ctx_buffer['logits'] - np.testing.assert_allclose(ref.to(torch.float32).cpu().numpy(), - res.to(torch.float32).cpu().numpy(), - atol=0.12) - - # compare generation - step = 1 - step1_id = torch.randint(100, (batch_size, 1)).int().cuda() - gen_context_lengths = ctx_context_lengths.clone() - gen_position_ids = torch.ones_like(step1_id).int().cuda() * input_len - gen_last_token_ids = torch.zeros_like(gen_context_lengths).int().cuda() - gen_host_request_types = torch.tensor([1] * batch_size, - dtype=torch.int32) - gen_runtime_perf_knobs = torch.tensor([-1] * perf_knob_tensor_size, - dtype=torch.int64) - - with torch.no_grad(): - hf_outputs = hf_model.forward( - step1_id, - past_key_values=hf_outputs.past_key_values, - use_cache=True, - output_hidden_states=True) - - torch.cuda.synchronize() - ref = hf_outputs.logits[:, -1, :] - - if enable_remove_input_padding: - step1_id = step1_id.view([batch_size]) - gen_position_ids = gen_position_ids.view([batch_size]) - gen_last_token_ids = 
torch.ones_like( - gen_context_lengths).int().cuda() - gen_last_token_ids = torch.cumsum(gen_last_token_ids, dim=0).int() - - step1_buffer = { - 'input_ids': step1_id, - 'context_lengths': gen_context_lengths, - 'position_ids': gen_position_ids, - 'last_token_ids': gen_last_token_ids, - 'host_request_types': gen_host_request_types, - 'cache_indirection': cache_indirections[1], - 'host_runtime_perf_knobs': gen_runtime_perf_knobs, - } - if enable_remove_input_padding: - step1_buffer['host_context_lengths'] = gen_context_lengths.cpu() - - step1_shape = {k: v.shape for k, v in step1_buffer.items()} - - sequence_length_buffer = torch.add(sequence_length_buffer, step) - step1_buffer[f'host_max_attention_window_sizes'] = torch.tensor( - [max_seq_len] * len(attn_layer_idx), dtype=torch.int32) - step1_shape[f'host_max_attention_window_sizes'] = ( - len(attn_layer_idx), ) - for layer_idx, buf in zip(attn_layer_idx, key_value_cache_buffers): - layer_config = config.get_layer_config(layer_idx) - kv_shape = (batch_size, 2, - layer_config.attention.num_key_value_heads, max_seq_len, - head_size) - step1_shape[f"past_key_value_{layer_idx}"] = kv_shape - step1_buffer[f"past_key_value_{layer_idx}"] = buf - step1_buffer[f"present_key_value_{layer_idx}"] = buf - - step1_buffer['sequence_length'] = sequence_length_buffer - step1_shape['sequence_length'] = ctx_buffer['sequence_length'].shape - step1_shape['sequence_length'] = (batch_size, ) - step1_shape['host_past_key_value_lengths'] = (batch_size, ) - step1_buffer[ - 'host_past_key_value_lengths'] = sequence_length_buffer.cpu() - step1_shape['host_sink_token_length'] = (1, ) - step1_buffer['host_sink_token_length'] = torch.tensor([0], - dtype=torch.int32) - - context = runtime.context_1 - runtime._set_shape(context, step1_shape) - runtime._set_buffer(context, step1_buffer) - runtime._run(context) - torch.cuda.synchronize() - res = step1_buffer['logits'] - - np.testing.assert_allclose(ref.to(torch.float32).cpu().numpy(), - res.to(torch.float32).cpu().numpy(), - atol=0.12) - - @parameterized.expand( - itertools.product( - (os.getenv("NVSMALL_CKPT"), ), # "deci/decilm-7b"), - (True, False), - (1, 2), - (1, 2), - ("auto", "float16", "bfloat16"))) - def test_convert_config_from_hf(self, ckpt_path: Optional[str], - preloaded: bool, tp_size: int, pp_size: int, - dtype: str) -> None: - if ckpt_path is None: - self.skipTest( - f"Missing nvsmall checkpoint, define a valid checkpoint path with the NVSMALL_CKPT environment variable" - ) - - hf_config = transformers.AutoConfig.from_pretrained( - ckpt_path, trust_remote_code=True) - - mapping = Mapping(world_size=(tp_size * pp_size), - rank=0, - gpus_per_node=1, - tp_size=tp_size, - pp_size=pp_size) - - config = DeciConfig.from_hugging_face( - hf_config if preloaded else ckpt_path, - dtype=dtype, - mapping=mapping, - trust_remote_code=not preloaded) - - if getattr(hf_config, "num_key_value_heads_per_layer", - None) is not None: - # verify layers for old config - for layer_idx, num_kv_heads in enumerate( - hf_config.num_key_value_heads_per_layer): - layer_config = config.get_layer_config(layer_idx) - self.assertEqual(layer_config.attention.impl, - AttentionImplementation.ATTENTION) - self.assertEqual(num_kv_heads, - layer_config.attention.num_key_value_heads) - self.assertEqual(layer_config.ffn.impl, FFNImplementation.MLP) - self.assertEqual(layer_config.ffn.intermediate_size, - config.intermediate_size) - - elif getattr(hf_config, "block_configs", None) is not None: - # verify layers for new config - for layer_idx, 
block_config in enumerate(hf_config.block_configs): - layer_config = config.get_layer_config(layer_idx) - if layer_config.attention.impl == AttentionImplementation.ATTENTION: - self.assertFalse(block_config.attention.no_op) - self.assertFalse(block_config.attention.replace_with_linear) - self.assertEqual( - config.num_attention_heads // - block_config.attention.n_heads_in_group, - layer_config.attention.num_key_value_heads) - elif layer_config.attention.impl == AttentionImplementation.NO_OP: - self.assertTrue(block_config.attention.no_op) - elif layer_config.attention.impl == AttentionImplementation.LINEAR: - self.assertTrue(block_config.attention.replace_with_linear) - - if layer_config.ffn.impl == FFNImplementation.MLP: - self.assertFalse(block_config.ffn.no_op) - self.assertFalse(block_config.ffn.replace_with_linear) - self.assertEqual( - _ffn_mult_to_intermediate_size( - block_config.ffn.ffn_mult, config.hidden_size), - layer_config.ffn.intermediate_size) - elif layer_config.ffn.impl == FFNImplementation.NO_OP: - self.assertTrue(block_config.ffn.no_op) - elif layer_config.ffn.impl == FFNImplementation.LINEAR: - self.assertTrue(block_config.ffn.replace_with_linear) - - # verify config is valid enough for model creation - DeciLMForCausalLM(config) - - @parameterized.expand( - itertools.product( - (os.getenv("NVSMALL_CKPT"), ), # "deci/decilm-7b"), - (True, False), - (1, 2), - (1, 2), - ("auto", "float16", "bfloat16"))) - def test_convert_model_from_hf(self, ckpt_path: Optional[str], - preloaded: bool, tp_size: int, pp_size: int, - dtype: str) -> None: - if ckpt_path is None: - self.skipTest( - f"Missing nvsmall checkpoint, define a valid checkpoint path with the NVSMALL_CKPT environment variable" - ) - - if preloaded: - hf_model_or_dir = transformers.AutoModelForCausalLM.from_pretrained( - ckpt_path, trust_remote_code=True) - else: - hf_model_or_dir = ckpt_path - - mapping = Mapping(world_size=(tp_size * pp_size), - rank=0, - gpus_per_node=1, - tp_size=tp_size, - pp_size=pp_size) - - DeciLMForCausalLM.from_hugging_face(hf_model_or_dir=hf_model_or_dir, - dtype=dtype, - mapping=mapping, - trust_remote_code=not preloaded) diff --git a/tests/model/test_gpt.py b/tests/model/test_gpt.py index 54b2822b4..7123ab296 100644 --- a/tests/model/test_gpt.py +++ b/tests/model/test_gpt.py @@ -38,12 +38,16 @@ from tensorrt_llm.plugin.plugin import ContextFMHAType from tensorrt_llm.runtime import ModelConfig, SamplingConfig from tensorrt_llm.runtime.generation import _prepare_attention_mask -from tensorrt_llm.runtime.kv_cache_manager import (GenerationSequence, - KVCacheManager) +from tensorrt_llm.runtime.kv_cache_manager import GenerationSequence +from tensorrt_llm.runtime.memory_pools.pools_kv_cache_manager import \ + PoolsKVCacheManager sys.path.append(os.path.join(os.path.dirname(__file__), '..')) from utils.util import skip_fp32_accum_pre_ampere, unittest_name_func +from tensorrt_llm.runtime.memory_pools.memory_pools_allocator import \ + MemoryPoolsAllocator + class TestGPT(unittest.TestCase): @@ -513,27 +517,50 @@ def test_gpt_plugin(self, test_partition, use_refit, fast_building, if enable_paged_kv_cache: max_blocks_per_seq = math.ceil(total_length / tokens_per_block) num_blocks = batch_size * beam_width * max_blocks_per_seq - block_size = gpt_config.n_head * tokens_per_block * head_size - kv_cache_manager = KVCacheManager( - num_layers=gpt_config.n_layer, + + memory_pools_allocator = MemoryPoolsAllocator( num_blocks=num_blocks, - block_size=block_size, tokens_per_block=tokens_per_block, - 
max_blocks_per_seq=max_blocks_per_seq, + head_size=head_size) + num_kv_heads_per_layer = MemoryPoolsAllocator.prepare_num_kv_heads_per_layer( + gpt_config.n_head, gpt_config.n_layer) + memory_pools_allocator.allocate(dtype, num_kv_heads_per_layer) + pools_kv_cache_manager = PoolsKVCacheManager( + memory_pools_allocator.pools_metadata, + max_blocks_per_seq, + num_blocks, + tokens_per_block, + head_size, max_attention_window_size=total_length, - sink_token_len=0, - beam_width=beam_width) - host_kv_cache_pool_pointers = torch.tensor( - [key_value_cache_buffers[0].data_ptr(), 0], dtype=torch.int64) + beam_width=beam_width, + sink_token_len=0) + + host_kv_cache_pool_pointers = memory_pools_allocator.get_kv_cache_pool_pointers( + ) + host_kv_cache_pool_mapping = memory_pools_allocator.pool_mapping + + # block_size = gpt_config.n_head * tokens_per_block * head_size + # kv_cache_manager = KVCacheManager( + # num_layers=gpt_config.n_layer, + # num_blocks=num_blocks, + # block_size=block_size, + # tokens_per_block=tokens_per_block, + # max_blocks_per_seq=max_blocks_per_seq, + # max_attention_window_size=total_length, + # sink_token_len=0, + # beam_width=beam_width) + # host_kv_cache_pool_pointers = torch.tensor( + # [key_value_cache_buffers[0].data_ptr(), 0], dtype=torch.int64) # Add sequences to the manager for bi in range(batch_size): generation_sequence = GenerationSequence(seq_idx=bi, batch_idx=bi) - kv_cache_manager.add_sequence(generation_sequence, seq_len) + pools_kv_cache_manager.add_sequence(generation_sequence, + seq_len) # Pre allocate the kv cache for the generated tokens. - kv_cache_manager.step([False] * batch_size) + pools_kv_cache_manager.step([False] * batch_size) def run_engine(context, input_ids, @@ -570,7 +597,7 @@ def run_engine(context, if enable_paged_kv_cache: assert beam_width == 1 # for beam_width > 1 the argument must be '1' in ctx phase and 'beam_width' in gen phase - host_kv_cache_block_offsets = kv_cache_manager.get_block_offsets( + host_kv_cache_block_offsets = pools_kv_cache_manager.get_block_offsets( beam_width=1) kv_cache_block_offsets = host_kv_cache_block_offsets.to('cuda') @@ -585,6 +612,10 @@ def run_engine(context, ctx_buffer[ f'host_kv_cache_pool_pointers'] = host_kv_cache_pool_pointers.contiguous( ) + ctx_buffer[ + f'host_kv_cache_pool_mapping'] = memory_pools_allocator.pool_mapping.contiguous( + ) + ctx_buffer[ f'host_max_attention_window_sizes'] = host_max_attention_window_sizes else: diff --git a/tests/model/test_gpt_e2e.py b/tests/model/test_gpt_e2e.py index 936298085..82ee3eb76 100644 --- a/tests/model/test_gpt_e2e.py +++ b/tests/model/test_gpt_e2e.py @@ -62,7 +62,6 @@ def build_engine(checkpoint_dir: str, engine_dir: str, *args): '--max_input_len=40', '--max_seq_len=60', '--max_beam_width=2', - '--builder_opt=0', ] legacy_args = [ "--gpt_attention_plugin=disable", diff --git a/tests/model/test_llama.py b/tests/model/test_llama.py index 344d52172..586522517 100644 --- a/tests/model/test_llama.py +++ b/tests/model/test_llama.py @@ -244,6 +244,7 @@ def test_llama(self, use_refit, fast_building, context_fmha_flag, llama_config.vocab_size = 128 llama_config.num_attention_heads = 2 if num_kv_heads <= 1 else 2 * num_kv_heads llama_config.hidden_size = llama_config.num_attention_heads * head_size + llama_config.head_dim = head_size llama_config.intermediate_size = (( (llama_config.hidden_size * 4 * 2 // 3) + head_size - 1) // head_size) * head_size @@ -256,7 +257,7 @@ def test_llama(self, use_refit, fast_building, context_fmha_flag, llama_config.eos_token_id = 
self.EOS_TOKEN seed_idx = random.randint(0, len(PRECHECKED_GOOD_RANDOM_SEEDS) - 1) torch.manual_seed(PRECHECKED_GOOD_RANDOM_SEEDS[seed_idx]) - hf_llama = LlamaForCausalLM(llama_config).cuda() + hf_llama = LlamaForCausalLM(llama_config).cuda().eval() runtime, _ = self._gen_tensorrt_llm_runtime( log_level, dtype, world_size, rank, llama_config, hf_llama, model, use_plugin, batch_size, beam_width, input_len, output_len, diff --git a/tests/model/test_mamba.py b/tests/model/test_mamba.py index 9f16397d7..d7ae52456 100644 --- a/tests/model/test_mamba.py +++ b/tests/model/test_mamba.py @@ -32,7 +32,7 @@ sys.path.append(os.path.join(os.path.dirname(__file__), '../..')) -from examples.mamba.convert_checkpoint import (convert_from_hf_checkpoint, +from tensorrt_llm.models.mamba.convert import (convert_from_hf_checkpoint, convert_hf_mamba) sys.path.append(os.path.join(os.path.dirname(__file__), '..')) @@ -74,13 +74,12 @@ def _gen_tensorrt_llm_mamba(self, hf_config, hf_path, hf_mamba, load_mode, 'pp_size': 1 }, } + config = tensorrt_llm.models.PretrainedConfig.from_dict(config) if load_mode == 'from_checkpoint': weights = convert_from_hf_checkpoint(mamba_config=config, - model_dir=hf_path, - dtype=dtype) + model_dir=hf_path) else: - weights = convert_hf_mamba(hf_mamba, rank=0, dtype=dtype) - config = tensorrt_llm.models.PretrainedConfig.from_dict(config) + weights = convert_hf_mamba(hf_mamba, dtype=dtype) tensorrt_llm_mamba = tensorrt_llm.models.MambaForCausalLM(config) tensorrt_llm_mamba.load(weights) return tensorrt_llm_mamba @@ -251,7 +250,7 @@ def test_mamba(self, gemm_plugin, mamba_conv1d_plugin, dtype, device=step1_id.device)) gen_ref = hf_outputs.logits[:, -1, :] - # get tensorrt llm mamba rumtime + # get tensorrt llm mamba runtime runtime, _ = self._gen_tensorrt_llm_runtime( log_level, model_name, gemm_plugin, mamba_conv1d_plugin, hf_config, hf_path, hf_mamba, load_mode, batch_size, input_len, output_len, diff --git a/tests/model/test_mistral.py b/tests/model/test_mistral.py index 5000566aa..f66a83c31 100644 --- a/tests/model/test_mistral.py +++ b/tests/model/test_mistral.py @@ -211,6 +211,7 @@ def test_mistral(self, use_refit, fast_building, context_fmha_flag, mistral_config.max_position_embeddings = 64 mistral_config.vocab_size = 128 mistral_config.num_attention_heads = 2 * num_kv_heads + mistral_config.head_dim = head_size mistral_config.hidden_size = mistral_config.num_attention_heads * head_size mistral_config.intermediate_size = (( (mistral_config.hidden_size * 4 * 2 // 3) + head_size - 1) // @@ -222,7 +223,7 @@ def test_mistral(self, use_refit, fast_building, context_fmha_flag, mistral_config.eos_token_id = self.EOS_TOKEN seed_idx = random.randint(0, len(PRECHECKED_GOOD_RANDOM_SEEDS) - 1) torch.manual_seed(PRECHECKED_GOOD_RANDOM_SEEDS[seed_idx]) - hf_mistral = MistralForCausalLM(mistral_config).cuda() + hf_mistral = MistralForCausalLM(mistral_config).cuda().eval() runtime, _ = self._gen_tensorrt_llm_runtime( log_level, dtype, world_size, rank, mistral_config, hf_mistral, model, use_plugin, batch_size, beam_width, input_len, output_len, diff --git a/tests/model/test_nemotron_nas.py b/tests/model/test_nemotron_nas.py new file mode 100644 index 000000000..469a65b4e --- /dev/null +++ b/tests/model/test_nemotron_nas.py @@ -0,0 +1,989 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import itertools
+import math
+import os
+import re
+import subprocess
+import sys
+import tempfile
+import unittest
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+import numpy as np
+import pytest
+import tensorrt as trt
+import torch
+import transformers
+from parameterized import parameterized
+from transformers import AutoTokenizer
+from typing_extensions import Literal
+
+import tensorrt_llm
+from tensorrt_llm import logger
+from tensorrt_llm._utils import str_dtype_to_torch
+from tensorrt_llm.builder import Builder, Engine, EngineConfig
+from tensorrt_llm.mapping import Mapping
+from tensorrt_llm.models.modeling_utils import PretrainedConfig
+from tensorrt_llm.models.nemotron_nas.config import DeciConfig, DeciLayerConfig
+from tensorrt_llm.models.nemotron_nas.convert import (
+    _ffn_mult_to_intermediate_size, load_weights_from_hf_safetensors)
+from tensorrt_llm.models.nemotron_nas.layer_config import (
+    AttentionImplementation, FFNImplementation)
+from tensorrt_llm.models.nemotron_nas.model import DeciLMForCausalLM
+from tensorrt_llm.network import Network, net_guard
+from tensorrt_llm.plugin.plugin import ContextFMHAType
+from tensorrt_llm.runtime.generation import _Runtime
+
+sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
+from utils.llm_data import llm_models_root
+from utils.util import get_project_root, unittest_name_func
+
+sys.path.append(
+    os.path.join(os.path.dirname(__file__), '../..', 'examples/nemotron_nas'))
+from calibration_utils import create_trtllm_magpie_calibration_dataset
+
+from tensorrt_llm.runtime.kv_cache_manager import GenerationSequence
+from tensorrt_llm.runtime.memory_pools.memory_pools_allocator import \
+    MemoryPoolsAllocator
+from tensorrt_llm.runtime.memory_pools.pools_kv_cache_manager import \
+    PoolsKVCacheManager
+from tensorrt_llm.runtime.model_runner import ModelRunner
+from tensorrt_llm.runtime.model_runner_cpp import ModelRunnerCpp
+
+
+@dataclass(kw_only=True, frozen=True)
+class TestParams:
+    enable_paged_kv_cache: bool
+    enable_remove_input_padding: bool
+    dtype: Literal["float16", "bfloat16"]
+
+    batch_size: int = 1
+    beam_width: int = 1
+    seq_len: int = 128
+    total_length: int = seq_len + 2
+    tokens_per_block: int = 128
+
+    @property
+    def output_len(self):
+        return self.total_length - self.seq_len
+
+    def __str__(self) -> str:
+        """tests/utils/util.py#L143 - > `str(x)`: parameterized test name"""
+        properties_without_default = (self.enable_paged_kv_cache,
+                                      self.enable_remove_input_padding,
+                                      self.dtype)
+        return "_".join((parameterized.to_safe_name(prop).lower()
+                         for prop in properties_without_default))
+
+    @property
+    def mapping(self) -> Mapping:
+        return Mapping(world_size=1, rank=0, tp_size=1)
+
+
+@dataclass
+class RuntimeHandle:
+    """Deleting `Runtime().runtime` will **definitively** deallocate the weights."""
+    runtime: _Runtime
+
+
+class
TestNemotronNas(unittest.TestCase): + + def _make_config(self, + layer_configs: List[Union[DeciLayerConfig, + Dict[str, Dict[str, Any]]]], + dtype: str = 'bfloat16', + num_attention_heads: int = 32, + num_key_value_heads: Optional[int] = None, + hidden_size: int = 4096, + intermediate_size: int = 16384, + vocab_size: int = 32128, + max_positions_embedding: int = 1024, + norm_epsilon: float = 1e-05) -> DeciConfig: + config = { + 'architecture': 'DeciLMForCausalLM', + 'num_hidden_layers': len(layer_configs), + 'num_attention_heads': num_attention_heads, + 'num_key_value_heads': num_key_value_heads, + 'dtype': dtype, + 'logits_dtype': dtype, + 'hidden_size': hidden_size, + 'intermediate_size': intermediate_size, + 'vocab_size': vocab_size, + 'position_embedding_type': 'rope_gpt_neox', + 'max_position_embeddings': max_positions_embedding, + 'hidden_act': 'silu', + 'norm_epsilon': norm_epsilon, + 'layer_configs': layer_configs + } + + config = DeciConfig.from_dict(config) + return config + + def _gen_tensorrt_llm_network(self, network: Network, + model: DeciLMForCausalLM, batch_size: int, + beam_width: int, input_len: int, + output_len: int, rank: int, + tensor_parallel: int, **opt_flags): + list(range(tensor_parallel)) + + with net_guard(network): + # Prepare + network.set_named_parameters(model.named_parameters()) + inputs = model.prepare_inputs(max_batch_size=batch_size, + max_input_len=input_len, + max_seq_len=input_len + output_len, + max_num_tokens=batch_size * input_len, + use_cache=True, + max_beam_width=beam_width) + # Forward + model(**inputs) + return network + + def _gen_tensorrt_llm_engine( + self, + rank: int, + world_size: int, + model: DeciLMForCausalLM, + model_name: str, + use_plugin: bool, + batch_size: int, + beam_width: int, + input_len: int, + output_len: int, + tokens_per_block: int, + use_refit: bool, + use_gemm: bool = False, + context_fmha_flag: ContextFMHAType = ContextFMHAType.disabled, + enable_remove_input_padding: bool = False, + enable_paged_kv_cache: bool = False, + **opt_flags) -> trt.IHostMemory: + + builder = Builder() + dtype = model.config.dtype + + with tempfile.TemporaryDirectory(): + builder_config = builder.create_builder_config( + name=model_name, + precision=dtype, + timing_cache='model.cache', + tensor_parallel=world_size, # TP only + use_refit=use_refit, + strongly_typed=True, + ) + network = builder.create_network() + network.plugin_config.to_legacy_setting() + if use_plugin: + network.plugin_config.gpt_attention_plugin = dtype + if use_gemm: + network.plugin_config.gemm_plugin = dtype + if enable_remove_input_padding: + network.plugin_config.remove_input_padding = True + if enable_paged_kv_cache: + network.plugin_config.enable_paged_kv_cache(tokens_per_block) + + network.plugin_config.set_context_fmha(context_fmha_flag) + + self._gen_tensorrt_llm_network(network=network, + model=model, + batch_size=batch_size, + beam_width=beam_width, + input_len=input_len, + output_len=output_len, + rank=rank, + tensor_parallel=world_size, + **opt_flags) + engine_buffer = builder.build_engine(network, builder_config) + return engine_buffer + + def _from_hf_model( + self, + hf_model: transformers.AutoModelForCausalLM, + params: TestParams, + *, + model_name: str = "nemotron-nas", + use_plugin: bool = True, + use_refit: bool = False, + use_gemm: bool = True, + context_fmha_flag: ContextFMHAType = ContextFMHAType.disabled, + **opt_flags) -> Tuple[RuntimeHandle, PretrainedConfig]: + model = DeciLMForCausalLM.from_hugging_face(hf_model) + logger.set_level("warning") + 
mapping = params.mapping + engine_buffer = self._gen_tensorrt_llm_engine( + rank=mapping.rank, + world_size=mapping.world_size, + model=model, + model_name=model_name, + use_plugin=use_plugin, + batch_size=params.batch_size, + beam_width=params.beam_width, + input_len=params.seq_len, + output_len=params.output_len, + use_refit=use_refit, + use_gemm=use_gemm, + context_fmha_flag=context_fmha_flag, + enable_remove_input_padding=params.enable_remove_input_padding, + tokens_per_block=params.tokens_per_block, + enable_paged_kv_cache=params.enable_paged_kv_cache, + **opt_flags) + runtime = RuntimeHandle(_Runtime(engine_buffer, mapping)) + return runtime, model.config + + def _from_fp8_quantized_engine( + self, + *, + model_dir: str, + quantize_dir: str, + dataset: Optional[str] = "cnn_dailymail", + params: TestParams) -> Tuple[RuntimeHandle, PretrainedConfig]: + root = get_project_root(__file__) + quantize_path = str(root / "examples/quantization/quantize.py") + + with tempfile.TemporaryDirectory( + prefix="transformed_magpie") as dataset_dir: + create_trtllm_magpie_calibration_dataset(dataset_dir) + quantize = [ + sys.executable, + quantize_path, + f"--model_dir={model_dir}", + f"--output_dir={quantize_dir}", + f"--calib_dataset={dataset_dir}", + "--dtype=bfloat16", + "--kv_cache_dtype=fp8", + "--qformat=fp8", + "--calib_size=512", + ] + print(f"Running quantize: {quantize}") + subprocess.run(quantize, check=True) + + engine_path = f"{quantize_dir}/engine" + build = [ + "trtllm-build", + f"--checkpoint_dir={quantize_dir}", + f"--output_dir={engine_path}", + f"--max_input_len={params.seq_len}", + f"--max_batch_size={params.batch_size}", + f"--remove_input_padding={'enable' if params.enable_remove_input_padding else 'disable'}", + f"--kv_cache_type={'paged' if params.enable_paged_kv_cache else 'continuous'}", + "--gemm_plugin=auto", + "--gpt_attention_plugin=auto", + ] + + if params.enable_paged_kv_cache: + build.append(f"--tokens_per_block={params.tokens_per_block}") + + print(f"Running trtllm-build: {build}") + subprocess.run(build, check=True) + + engine = Engine.from_dir(engine_path) + runtime = RuntimeHandle(_Runtime(engine.engine, params.mapping)) + config = EngineConfig.from_json_file(f"{engine_path}/config.json") + + return runtime, config.pretrained_config + + def test_config_to_from_dict(self) -> None: + config = self._make_config(layer_configs=[{ + "attention": { + "num_key_value_heads": 4 + }, + "ffn": {} + }, { + "attention": { + "num_key_value_heads": 2 + }, + "ffn": { + "impl": "no_op" + } + }, { + "attention": { + "impl": "no_op" + }, + "ffn": { + "intermediate_size": 8192 + } + }]) + + config2 = DeciConfig.from_dict(config.to_dict()) + self.assertListEqual(config.layer_configs, config2.layer_configs) + + def test_save_load_config(self) -> None: + config = self._make_config(layer_configs=[{ + "attention": { + "num_key_value_heads": 4 + }, + "ffn": {} + }, { + "attention": { + "num_key_value_heads": 2 + }, + "ffn": { + "impl": "no_op" + } + }, { + "attention": { + "impl": "no_op" + }, + "ffn": { + "intermediate_size": 8192 + } + }]) + + with tempfile.TemporaryDirectory( + prefix="test_save_load_checkpoint") as ckpt_dir: + config_file = f"{ckpt_dir}/config.json" + config.to_json_file(config_file) + config2 = DeciConfig.from_json_file(config_file) + + self.assertDictEqual(config.to_dict(), config2.to_dict()) + self.assertListEqual(config.layer_configs, config2.layer_configs) + + def get_loader_test_cases(): + model_root = llm_models_root(check=True) + test_models_base_path = 
Path(model_root, "nvsmall/tests") + models_path = [ + os.path.join(test_models_base_path, x) + for x in os.listdir(test_models_base_path) + ] + + params_product = [ + TestParams( + enable_paged_kv_cache=paged, + enable_remove_input_padding=padded, + dtype=dtype, + ) for paged, padded, dtype in itertools.product( + [True, False], + [True, False], + ["bfloat16", "float16"], + ) + ] + test_cases = list(itertools.product(models_path, params_product)) + + return test_cases + + @parameterized.expand(get_loader_test_cases, name_func=unittest_name_func) + def test_allclose_to_hf(self, hf_model_dir: str, params: TestParams): + hf_model = transformers.AutoModelForCausalLM.from_pretrained( + hf_model_dir, + trust_remote_code=True, + torch_dtype=tensorrt_llm._utils.str_dtype_to_torch(params.dtype), + ).cuda() + runtime, config = self._from_hf_model(hf_model, params) + self.allclose( + runtime, + config=config, + params=params, + obtain_hf_model=lambda: hf_model, + ) + + def allclose( + self, + runtime_handle: RuntimeHandle, + *, + config: PretrainedConfig, + params: TestParams, + obtain_hf_model: Callable[[], transformers.AutoModelForCausalLM], + ): + batch_size = params.batch_size + beam_width = params.beam_width + seq_len = params.seq_len + total_length = params.total_length + dtype = config.dtype + tokens_per_block = params.tokens_per_block + enable_remove_input_padding = params.enable_remove_input_padding + enable_paged_kv_cache = params.enable_paged_kv_cache + + key_value_cache_buffers = [] + head_size = config.hidden_size // config.num_attention_heads + attn_layer_idx = [ + i for i in range(config.num_hidden_layers) + if config.get_layer_config(i).attention.needs_kv_cache + ] + + if enable_paged_kv_cache: + num_blocks = batch_size * beam_width * math.ceil( + total_length / tokens_per_block) + + memory_pools_allocator = MemoryPoolsAllocator( + num_blocks=num_blocks, + tokens_per_block=tokens_per_block, + head_size=head_size) + if config.num_kv_heads_per_layer is None: + num_kv_heads = config.get_layer_config( + attn_layer_idx[0]).attention.num_key_value_heads + num_kv_heads_per_layer = MemoryPoolsAllocator.prepare_num_kv_heads_per_layer( + num_kv_heads, len(attn_layer_idx)) + else: + num_kv_heads_per_layer = config.num_kv_heads_per_layer + + memory_pools_allocator.allocate(dtype, num_kv_heads_per_layer) + max_blocks_per_seq = math.ceil(total_length / tokens_per_block) + num_blocks = batch_size * beam_width * max_blocks_per_seq + + pools_kv_cache_manager = PoolsKVCacheManager( + memory_pools_allocator.pools_metadata, + max_blocks_per_seq, + num_blocks, + tokens_per_block, + head_size, + max_attention_window_size=total_length, + beam_width=beam_width, + sink_token_len=0) + # Add sequences to the manager + for bi in range(batch_size): + generation_sequence = GenerationSequence(seq_idx=bi, + batch_idx=bi) + pools_kv_cache_manager.add_sequence(generation_sequence, + seq_len) + + # Pre allocate the kv cache for the generated tokens. 
+ pools_kv_cache_manager.step([False] * batch_size) + + else: + for layer_idx in attn_layer_idx: + layer_config = config.get_layer_config(layer_idx) + new_cache = torch.zeros(( + batch_size, + 2, + layer_config.attention.num_key_value_heads, + total_length, + head_size, + ), + dtype=str_dtype_to_torch(dtype), + device='cuda') + key_value_cache_buffers.append(new_cache) + + cache_indirections = [ + torch.full(( + batch_size, + beam_width, + total_length, + ), + 0, + dtype=torch.int32, + device='cuda'), + torch.full(( + batch_size, + beam_width, + total_length, + ), + 0, + dtype=torch.int32, + device='cuda') + ] # ping-pong buffers + + def run_engine(context, + input_ids, + context_lengths, + host_request_types, + position_ids, + last_token_ids, + cache_indirection, + host_past_key_value_lengths, + host_max_attention_window_sizes, + host_sink_token_length, + host_runtime_perf_knobs, + sequence_length=None, + host_context_lengths=None): + + ctx_buffer = { + 'input_ids': input_ids, + 'context_lengths': context_lengths, + 'host_request_types': host_request_types, + 'position_ids': position_ids, + 'last_token_ids': last_token_ids, + 'cache_indirection': cache_indirection, + 'host_past_key_value_lengths': host_past_key_value_lengths, + 'sequence_length': sequence_length, + 'host_sink_token_length': host_sink_token_length, + 'host_runtime_perf_knobs': host_runtime_perf_knobs + } + + assert host_request_types is not None + if enable_remove_input_padding: + assert host_context_lengths is not None, "host_context_lengths is required for ragged input" + ctx_buffer['host_context_lengths'] = host_context_lengths + + if enable_paged_kv_cache: + assert beam_width == 1 + # for beam_width > 1 the argument must be '1' in ctx phase and 'beam_width' in gen phase + host_kv_cache_block_offsets = pools_kv_cache_manager.get_block_offsets( + beam_width=1) + kv_cache_block_offsets = host_kv_cache_block_offsets.to('cuda') + shape = kv_cache_block_offsets.shape + target_shape = [shape[0], shape[1] * shape[2], *shape[3:]] + ctx_buffer[ + f'kv_cache_block_offsets'] = kv_cache_block_offsets.reshape( + target_shape) + ctx_buffer[ + f'host_kv_cache_block_offsets'] = host_kv_cache_block_offsets.reshape( + target_shape) + ctx_buffer[ + f'host_kv_cache_pool_pointers'] = memory_pools_allocator.get_kv_cache_pool_pointers( + ).contiguous() + ctx_buffer[ + f'host_kv_cache_pool_mapping'] = memory_pools_allocator.pool_mapping.contiguous( + ) + ctx_buffer[ + f'host_max_attention_window_sizes'] = host_max_attention_window_sizes + else: + for layer_idx, buf in zip(attn_layer_idx, + key_value_cache_buffers): + ctx_buffer[f'past_key_value_{layer_idx}'] = buf + ctx_buffer[f'present_key_value_{layer_idx}'] = buf + ctx_buffer[ + f'host_max_attention_window_sizes'] = host_max_attention_window_sizes + + ctx_shape = { + key: buffer.shape + for key, buffer in ctx_buffer.items() + } + + runtime_handle.runtime._set_shape(context, ctx_shape) + runtime_handle.runtime._set_buffer(context, ctx_buffer) + runtime_handle.runtime._run(context) + torch.cuda.synchronize() + res = ctx_buffer['logits'] + return res + + step0_ids = torch.randint(100, (batch_size, seq_len)).int().cuda() + step1_ids = torch.randint(100, (batch_size, 1)).int().cuda() + + def tllm() -> Tuple[np.ndarray, np.ndarray]: + ctx_ids = step0_ids.clone() + + ctx_context_lengths = seq_len * torch.ones( + (batch_size), dtype=torch.int32, device='cuda') + ctx_position_ids = torch.tensor(range(seq_len), + dtype=torch.int32).reshape([ + 1, seq_len + ]).expand([batch_size, + seq_len]).cuda() + 
ctx_last_token_ids = ctx_context_lengths.clone() + + if enable_remove_input_padding: + ctx_ids = ctx_ids.view([batch_size * seq_len]) + ctx_position_ids = ctx_position_ids.view([batch_size * seq_len]) + ctx_last_token_ids = torch.cumsum(ctx_last_token_ids, + dim=0).int() + + host_max_attention_window_sizes = torch.tensor([total_length] * + len(attn_layer_idx), + dtype=torch.int32) + host_sink_token_length = torch.tensor([0], dtype=torch.int32) + + host_context_lengths = ctx_context_lengths.cpu( + ) if enable_remove_input_padding else None + host_request_types = torch.tensor([0 for i in range(batch_size)], + dtype=torch.int32).cpu() + + host_past_key_value_lengths = ctx_context_lengths.detach().clone( + ).cpu() + # We need sequence_lengths start as context_lengths for step 0 (context), + # and it will be added one after each step. + sequence_length = ctx_context_lengths.detach().clone() + + perf_knob_tensor_size = 16 + ctx_runtime_perf_knobs = torch.tensor([-1] * perf_knob_tensor_size, + dtype=torch.int64) + + step0 = run_engine( + context=runtime_handle.runtime.ctx_context, + input_ids=ctx_ids, + context_lengths=ctx_context_lengths, + position_ids=ctx_position_ids, + last_token_ids=ctx_last_token_ids, + cache_indirection=cache_indirections[0], + host_past_key_value_lengths=host_past_key_value_lengths, + host_max_attention_window_sizes=host_max_attention_window_sizes, + host_sink_token_length=host_sink_token_length, + sequence_length=sequence_length, + host_context_lengths=host_context_lengths, + host_request_types=host_request_types, + host_runtime_perf_knobs=ctx_runtime_perf_knobs) + + step = 1 + gen_ids = step1_ids.clone() + + gen_context_lengths = seq_len * torch.ones( + (batch_size), dtype=torch.int32, device='cuda') + gen_position_ids = torch.ones_like(gen_ids).int().cuda() * seq_len + gen_last_token_ids = torch.zeros_like( + gen_context_lengths).int().cuda() + + if enable_remove_input_padding: + gen_ids = gen_ids.view([batch_size]) + gen_position_ids = gen_position_ids.view([batch_size]) + gen_last_token_ids = torch.ones_like( + gen_context_lengths).int().cuda() + gen_last_token_ids = torch.cumsum(gen_last_token_ids, + dim=0).int() + + host_past_key_value_lengths = torch.tensor([seq_len + step - 1] * + batch_size, + dtype=torch.int32) + host_max_attention_window_sizes = torch.tensor([seq_len + step] * + len(attn_layer_idx), + dtype=torch.int32) + host_sink_token_length = torch.tensor([0], dtype=torch.int32) + + host_context_lengths = gen_context_lengths.cpu( + ) if enable_remove_input_padding else None + host_request_types = torch.tensor([1 for i in range(batch_size)], + dtype=torch.int32).cpu() + + # For step 1, the sequence_lengths = context_lengths + 1. 
+ sequence_length = torch.add(gen_context_lengths.detach().clone(), 1) + + perf_knob_tensor_size = 16 + gen_runtime_perf_knobs = torch.tensor([-1] * perf_knob_tensor_size, + dtype=torch.int64) + + step1 = run_engine( + context=runtime_handle.runtime.context_1, + input_ids=gen_ids, + context_lengths=gen_context_lengths, + position_ids=gen_position_ids, + last_token_ids=gen_last_token_ids, + cache_indirection=cache_indirections[1], + host_past_key_value_lengths=host_past_key_value_lengths, + host_max_attention_window_sizes=host_max_attention_window_sizes, + host_sink_token_length=host_sink_token_length, + sequence_length=sequence_length, + host_context_lengths=host_context_lengths, + host_request_types=host_request_types, + host_runtime_perf_knobs=gen_runtime_perf_knobs) + + return step0, step1 + + def hf() -> Tuple[np.ndarray, np.ndarray]: + with torch.no_grad(): + hf_model = obtain_hf_model() + step0_outputs = hf_model.forward(step0_ids.clone()) + torch.cuda.synchronize() + step0 = step0_outputs.logits[:, -1, :] + step1_outputs = hf_model.forward( + step1_ids.clone(), + past_key_values=step0_outputs.past_key_values, + use_cache=True, + ) + torch.cuda.synchronize() + step1 = step1_outputs.logits[:, -1, :] + + return step0, step1 + + res_step0, res_step1 = tllm() + del runtime_handle.runtime + ref_step0, ref_step1 = hf() + np.testing.assert_allclose(ref_step0.cpu().numpy().flatten(), + res_step0.cpu().numpy().flatten(), + atol=1e-1) + np.testing.assert_allclose(ref_step1.cpu().numpy().flatten(), + res_step1.cpu().numpy().flatten(), + atol=1e-1) + + @parameterized.expand(get_loader_test_cases, name_func=unittest_name_func) + @pytest.mark.skipif( + os.environ.get("TEST_NEMOTRON_NAS_FP8_ALLCLOSE") is None, + reason="fp8 accuracy is low.") + def test_allclose_to_hf_fp8(self, hf_model_dir: str, params: TestParams): + with tempfile.TemporaryDirectory("quantize_dir") as quantize_dir: + runtime, config = self._from_fp8_quantized_engine( + model_dir=hf_model_dir, + quantize_dir=quantize_dir, + params=params) + self.allclose( + runtime, + config=config, + params=params, + obtain_hf_model=lambda: transformers.AutoModelForCausalLM. + from_pretrained( + hf_model_dir, + trust_remote_code=True, + torch_dtype=tensorrt_llm._utils.str_dtype_to_torch(params.dtype + ), + ).cuda(), + ) + + @pytest.mark.skipif( + os.environ.get("NEMOTRON_NAS_CKPT") is None + or os.environ.get("NEMOTRON_NAS_OUTPUT_DIR") is None, + reason="You must define NEMOTRON_NAS_CKPT, NEMOTRON_NAS_OUTPUT_DIR", + ) + def test_allclose_to_hf_fp8_accelerate(self): + hf_model_dir = os.environ["NEMOTRON_NAS_CKPT"] + output_dir = os.environ["NEMOTRON_NAS_OUTPUT_DIR"] + params = TestParams(enable_paged_kv_cache=True, + enable_remove_input_padding=True, + dtype="float16", + seq_len=2048) + runtime, config = self._from_fp8_quantized_engine( + model_dir=hf_model_dir, quantize_dir=str(output_dir), params=params) + self.allclose( + runtime, + config=config, + params=params, + obtain_hf_model=lambda: transformers.AutoModelForCausalLM. 
+ from_pretrained( + hf_model_dir, + trust_remote_code=True, + torch_dtype=tensorrt_llm._utils.str_dtype_to_torch(params.dtype + ), + device_map="auto", + ), + ) + + @parameterized.expand( + itertools.product(("nvidia/Llama-3_1-Nemotron-51B-Instruct", ), + (True, False), (1, 2), (1, 2), + ("auto", "float16", "bfloat16"))) + def test_convert_config_from_hf(self, ckpt_path: Optional[str], + preloaded: bool, tp_size: int, pp_size: int, + dtype: str) -> None: + hf_config = transformers.AutoConfig.from_pretrained( + ckpt_path, trust_remote_code=True) + + mapping = Mapping(world_size=(tp_size * pp_size), + rank=0, + gpus_per_node=1, + tp_size=tp_size, + pp_size=pp_size) + + config = DeciConfig.from_hugging_face( + hf_config if preloaded else ckpt_path, + dtype=dtype, + mapping=mapping, + trust_remote_code=not preloaded) + + if getattr(hf_config, "num_key_value_heads_per_layer", + None) is not None: + # verify layers for old config + for layer_idx, num_kv_heads in enumerate( + hf_config.num_key_value_heads_per_layer): + layer_config = config.get_layer_config(layer_idx) + self.assertEqual(layer_config.attention.impl, + AttentionImplementation.ATTENTION) + self.assertEqual(num_kv_heads, + layer_config.attention.num_key_value_heads) + self.assertEqual(layer_config.ffn.impl, FFNImplementation.MLP) + self.assertEqual(layer_config.ffn.intermediate_size, + config.intermediate_size) + + elif getattr(hf_config, "block_configs", None) is not None: + # verify layers for new config + for layer_idx, block_config in enumerate(hf_config.block_configs): + layer_config = config.get_layer_config(layer_idx) + if layer_config.attention.impl == AttentionImplementation.ATTENTION: + self.assertFalse(block_config.attention.no_op) + self.assertFalse(block_config.attention.replace_with_linear) + self.assertEqual( + config.num_attention_heads // + block_config.attention.n_heads_in_group, + layer_config.attention.num_key_value_heads) + elif layer_config.attention.impl == AttentionImplementation.NO_OP: + self.assertTrue(block_config.attention.no_op) + elif layer_config.attention.impl == AttentionImplementation.LINEAR: + self.assertTrue(block_config.attention.replace_with_linear) + + if layer_config.ffn.impl == FFNImplementation.MLP: + self.assertFalse(block_config.ffn.no_op) + self.assertFalse(block_config.ffn.replace_with_linear) + self.assertEqual( + _ffn_mult_to_intermediate_size( + block_config.ffn.ffn_mult, config.hidden_size), + layer_config.ffn.intermediate_size) + elif layer_config.ffn.impl == FFNImplementation.NO_OP: + self.assertTrue(block_config.ffn.no_op) + elif layer_config.ffn.impl == FFNImplementation.LINEAR: + self.assertTrue(block_config.ffn.replace_with_linear) + + # verify config is valid enough for model creation + DeciLMForCausalLM(config) + + @parameterized.expand( + itertools.product( + os.listdir( + Path(llm_models_root(check=True), "nvsmall/tests").as_posix()), + (True, False), (1, 2), (1, 2), ("auto", "float16", "bfloat16"))) + def test_convert_model_from_hf(self, model_dir: Optional[str], + preloaded: bool, tp_size: int, pp_size: int, + dtype: str) -> None: + ckpt_path = Path(llm_models_root(check=True), "nvsmall/tests", + model_dir) + + if preloaded: + hf_model_or_dir = transformers.AutoModelForCausalLM.from_pretrained( + ckpt_path, trust_remote_code=True) + else: + hf_model_or_dir = ckpt_path + + mapping = Mapping(world_size=(tp_size * pp_size), + rank=0, + gpus_per_node=1, + tp_size=tp_size, + pp_size=pp_size) + + DeciLMForCausalLM.from_hugging_face(hf_model_or_dir=hf_model_or_dir, + 
dtype=dtype, + mapping=mapping, + trust_remote_code=not preloaded) + + @parameterized.expand( + itertools.product( + os.listdir( + Path(llm_models_root(check=True), "nvsmall/tests").as_posix()), + (1, 2, 4))) + def test_weights_loader(self, model_dir: str, tp_size: int) -> None: + + ckpt_path = Path(llm_models_root(check=True), "nvsmall/tests", + model_dir) + config = DeciConfig.from_hugging_face(ckpt_path, trust_remote_code=True) + weights = load_weights_from_hf_safetensors(ckpt_path, config) + + shard_configs = [ + DeciConfig.from_hugging_face(ckpt_path, + trust_remote_code=True, + mapping=Mapping(world_size=tp_size, + tp_size=tp_size, + rank=rank)) + for rank in range(tp_size) + ] + shard_weights = [ + load_weights_from_hf_safetensors(ckpt_path, shard_config) + for shard_config in shard_configs + ] + + for name, param in weights.items(): + shards = [shard[name] for shard in shard_weights] + + if name.endswith("attention.weight"): + # linear attention + combined = torch.cat(shards, dim=0) + torch.testing.assert_close(combined, param, atol=0, rtol=0) + elif name.endswith("attention.qkv.weight"): + # proper attention + layer_idx = int( + re.match("transformer.layers.(\\d+).", name).groups()[0]) + layer_config = config.layer_configs[layer_idx] + num_kv_heads = int(layer_config.attention.num_key_value_heads) + num_kv_heads_tp = (num_kv_heads + tp_size - 1) // tp_size + dups = tp_size // num_kv_heads or 1 + q, k, v = torch.split(param, [ + config.num_attention_heads * config.head_size, + num_kv_heads * config.head_size, + num_kv_heads * config.head_size + ]) + + q_shards, k_shards, v_shards = [], [], [] + for rank, shard in enumerate(shards): + qt, kt, vt = torch.split( + shard, + [(config.num_attention_heads // tp_size) * + config.head_size, num_kv_heads_tp * config.head_size, + num_kv_heads_tp * config.head_size]) + q_shards.append(qt) + if rank % dups == 0: + k_shards.append(kt) + v_shards.append(vt) + + combined_q = torch.cat(q_shards, dim=0) + combined_k = torch.cat(k_shards, dim=0) + combined_v = torch.cat(v_shards, dim=0) + + torch.testing.assert_close(combined_q, q, atol=0, rtol=0) + torch.testing.assert_close(combined_k, k, atol=0, rtol=0) + torch.testing.assert_close(combined_v, v, atol=0, rtol=0) + + @parameterized.expand(itertools.product([True, False], + ["float16", "bfloat16"], [None], + [None]), + name_func=unittest_name_func) + def test_vgqa_model_runner_allclose(self, use_py_session, dtype, engine_dir, + hf_model_dir): + input_text = "Born in north-east France, Soyer trained as a" + tokenizer_dir = hf_model_dir + + if engine_dir is None or not Path(engine_dir).exists: + self.skipTest(f"Engine dir is either None or doesn't exist") + if hf_model_dir is None or not Path(hf_model_dir).exists: + self.skipTest( + f"Missing HF checkpoint, define a valid checkpoint path with the NEMOTRON_NAS_CKPT environment variable" + ) + + dtype = tensorrt_llm._utils.str_dtype_to_torch(dtype) + + hf_model = transformers.AutoModelForCausalLM.from_pretrained( + hf_model_dir, trust_remote_code=True, torch_dtype=dtype).cuda() + + batch_size = 1 + max_seq_len = 30 + + tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir, + padding_side="left", + truncation_side="left", + trust_remote_code=True, + use_fast=True) + batch_input_ids = [ + torch.tensor(tokenizer.encode(input_text, + add_special_tokens=True, + truncation=True), + dtype=torch.int32) + ] + + hf_batch_ids = batch_input_ids[0].unsqueeze(0).repeat(batch_size, + 1).cuda() + in_tokens = batch_input_ids[0].shape[0] + + with torch.no_grad(): + 
hf_outputs = hf_model.generate(hf_batch_ids, max_length=max_seq_len) + + torch.cuda.synchronize() + + if use_py_session: + runner = ModelRunner.from_dir(engine_dir=engine_dir, + rank=0, + debug_mode=False) + + else: + runner = ModelRunnerCpp.from_dir(engine_dir=engine_dir, + rank=0, + debug_mode=False) + + pad_token_id = tokenizer.pad_token_id + if tokenizer.pad_token_id is None: + pad_token_id = tokenizer.eos_token_id + + with torch.no_grad(): + runner_outputs = runner.generate(batch_input_ids=batch_input_ids, + max_new_tokens=max_seq_len - + in_tokens, + end_id=tokenizer.eos_token_id, + pad_id=pad_token_id, + output_sequence_lengths=True, + return_dict=False) + + torch.cuda.synchronize() + + del runner + + if not use_py_session: + np.testing.assert_allclose( + runner_outputs[0][0][:max_seq_len].cpu().numpy(), + hf_outputs[0].cpu().numpy()) + else: + np.testing.assert_allclose(runner_outputs[0].cpu().numpy(), + hf_outputs.cpu().numpy()) diff --git a/tests/model_api/test_model_quantization.py b/tests/model_api/test_model_quantization.py index 160f646d3..6cca4aaef 100644 --- a/tests/model_api/test_model_quantization.py +++ b/tests/model_api/test_model_quantization.py @@ -28,12 +28,15 @@ def test_int4_awq_quantization(): max_batch_size, max_isl, max_osl = 8, 256, 256 - hf_model_dir = llm_models_root() / "llama-models/llama-7b-hf" + hf_model_dir = str(llm_models_root() / "llama-models/llama-7b-hf") + cnn_dailymail_path = str(llm_models_root() / "datasets/cnn_dailymail") + checkpoint_dir = tempfile.TemporaryDirectory("llama-checkpoint").name quant_config = QuantConfig(QuantAlgo.W4A16_AWQ) LLaMAForCausalLM.quantize(hf_model_dir, checkpoint_dir, quant_config=quant_config, + calib_dataset=cnn_dailymail_path, calib_batches=32, calib_batch_size=32) llama = LLaMAForCausalLM.from_checkpoint(checkpoint_dir) @@ -68,12 +71,14 @@ def test_int4_awq_quantization(): def test_fp8_quantization(): max_batch_size, max_isl, max_osl = 8, 256, 256 hf_model_dir = str(llm_models_root() / "llama-models/llama-7b-hf") + cnn_dailymail_path = str(llm_models_root() / "datasets/cnn_dailymail") checkpoint_dir = tempfile.TemporaryDirectory("llama-checkpoint").name quant_config = QuantConfig(QuantAlgo.FP8) LLaMAForCausalLM.quantize(hf_model_dir, checkpoint_dir, quant_config=quant_config, + calib_dataset=cnn_dailymail_path, calib_batches=32) llama = LLaMAForCausalLM.from_checkpoint(checkpoint_dir) diff --git a/tests/test_graph_rewriter.py b/tests/test_graph_rewriter.py index 2adc8875a..2fb8656e7 100644 --- a/tests/test_graph_rewriter.py +++ b/tests/test_graph_rewriter.py @@ -455,7 +455,7 @@ def match_and_rewrite(self, layer: Layer) -> bool: new_inputs = flayer.clone_inputs() with net_guard(layer.network): - # Step 1: create new inputs and repalce the original arglist + # Step 1: create new inputs and replace the original arglist input = Tensor( name='qkv', dtype=trt.float16, diff --git a/tests/test_layer.py b/tests/test_layer.py index 8546ec725..0219331bb 100644 --- a/tests/test_layer.py +++ b/tests/test_layer.py @@ -1359,7 +1359,6 @@ def test_mamba(self, batch_size, in_seq_len, out_seq_len, d_model, d_state, stream = torch.cuda.current_stream() builder_config = builder.create_builder_config(name='mamba', - opt_level=0, precision=dtype) engine = builder.build_engine(net, builder_config) session = tensorrt_llm.runtime.Session.from_serialized_engine(engine) @@ -1695,7 +1694,6 @@ def test_mamba2(self, batch_size, in_seq_len, out_seq_len, d_model, d_state, stream = torch.cuda.current_stream() builder_config = 
         builder_config = builder.create_builder_config(name='mamba2',
-                                                        opt_level=0,
                                                         precision=dtype)
         engine = builder.build_engine(net, builder_config)
         session = tensorrt_llm.runtime.Session.from_serialized_engine(engine)
@@ -1706,7 +1704,7 @@ def test_mamba2(self, batch_size, in_seq_len, out_seq_len, d_model, d_state,
             hidden_states_ref, last_token_ids, conv_state_ref, ssm_state_ref,
             remove_padding, batch_size, seqlen_offset)
 
-        dtype_atol = {"float16": 5e-3, "float32": 5e-3, "bfloat16": 5e-2}
+        dtype_atol = {"float16": 7e-3, "float32": 5e-3, "bfloat16": 5e-2}
 
         if not remove_padding:
             # get out_mask
@@ -2045,7 +2043,6 @@ def fuse_rg_lru(recurrent_layer):
 
         stream = torch.cuda.current_stream()
         builder_config = builder.create_builder_config(name='recurrent',
-                                                        opt_level=0,
                                                         precision=dtype)
         engine = builder.build_engine(net, builder_config)
         session = tensorrt_llm.runtime.Session.from_serialized_engine(engine)
diff --git a/tests/test_model_runner_cpp.py b/tests/test_model_runner_cpp.py
new file mode 100644
index 000000000..e5bd10459
--- /dev/null
+++ b/tests/test_model_runner_cpp.py
@@ -0,0 +1,84 @@
+import typing as tp
+from pathlib import Path
+
+import torch
+from bindings.binding_test_utils import *
+from transformers import AutoTokenizer
+from utils.cpp_paths import *
+from utils.llm_data import llm_models_root
+from utils.util import skip_pre_ampere
+
+from tensorrt_llm.runtime.model_runner_cpp import ModelRunnerCpp
+
+
+@pytest.fixture
+def model_files(llm_root: Path, resource_path: Path, results_data_path: Path):
+    # Model engines and expected outputs need to be generated.
+    print(results_data_path)
+    if not results_data_path.exists():
+        model_cache = llm_models_root()
+        model_cache_arg = ["--model_cache", str(model_cache)
+                           ] if model_cache is not None else []
+        prepare_model_tests(llm_root, resource_path, "gpt", model_cache_arg)
+
+
+@skip_pre_ampere  # ContextFMHAType with fp32 acc is not supported in pre-ampere architecture
+def test_logits_post_processor(model_files, model_path):
+
+    # Define the logits post-processor callback
+    def logits_post_processor(req_id: int, logits: torch.Tensor,
+                              ids: tp.List[tp.List[int]], stream_ptr: int,
+                              client_id: tp.Optional[int]):
+        with torch.cuda.stream(torch.cuda.ExternalStream(stream_ptr)):
+            logits[:] = float("-inf")
+            logits[..., 42] = 0
+
+    # Create ModelRunnerCpp
+    logits_processor_map = {"my_logits_pp": logits_post_processor}
+    runner = ModelRunnerCpp.from_dir(model_path,
+                                     logits_processor_map=logits_processor_map)
+
+    model_root = llm_models_root(check=True)
+    hf_model_dir = Path(model_root, "gpt2")
+
+    tokenizer = AutoTokenizer.from_pretrained(hf_model_dir,
+                                              padding_side="left",
+                                              truncation_side="left",
+                                              trust_remote_code=True,
+                                              use_fast=True)
+
+    input_text = "Born in north-east France, Soyer trained as a"
+    batch_input_ids = [
+        torch.tensor(tokenizer.encode(input_text,
+                                      add_special_tokens=True,
+                                      truncation=True),
+                     dtype=torch.int32)
+    ]
+
+    pad_token_id = tokenizer.pad_token_id
+    if tokenizer.pad_token_id is None:
+        pad_token_id = tokenizer.eos_token_id
+
+    # Create the request
+    max_new_tokens = 5
+    with torch.no_grad():
+        outputs = runner.generate(batch_input_ids=batch_input_ids,
+                                  max_new_tokens=max_new_tokens,
+                                  end_id=tokenizer.eos_token_id,
+                                  pad_id=pad_token_id,
+                                  output_sequence_lengths=True,
+                                  return_dict=True,
+                                  logits_processor_names={"my_logits_pp"})
+
+    torch.cuda.synchronize()
+
+    # Get the new tokens
+    tokens = outputs['output_ids']
+    sequence_lengths = outputs['sequence_lengths']
+
+    output_begin = len(batch_input_ids[0])
+    output_end = sequence_lengths[0][0]
+
+    # check that all output tokens are 42
+    assert tokens[0][0][output_begin:output_end].tolist() == [42
+                                                              ] * max_new_tokens
diff --git a/tests/test_module.py b/tests/test_module.py
index 30e3513fe..375ff651e 100644
--- a/tests/test_module.py
+++ b/tests/test_module.py
@@ -72,6 +72,7 @@ class TestModule(unittest.TestCase):
 
     def test_module(self):
         m = Module3()
+        print(m)
         m.forward()
         self.assertEqual(4, len(list(m.named_modules())))
 
@@ -88,6 +89,7 @@ def test_module(self):
 
     def test_module_list(self):
         m = Module4()
+        print(m)
         m.forward()
         self.assertEqual(8, len(list(m.named_modules())))
 
diff --git a/tests/utils/cpp_paths.py b/tests/utils/cpp_paths.py
index 02a8abff4..7fd5cee3e 100644
--- a/tests/utils/cpp_paths.py
+++ b/tests/utils/cpp_paths.py
@@ -77,3 +77,8 @@ def results_data_path(data_path: _pl.Path) -> _pl.Path:
 @pytest.fixture(scope="module")
 def results_data_path_beam_width_2(data_path: _pl.Path) -> _pl.Path:
     return data_path / f"gpt2/beam_search_2/{get_base_model_spec().get_results_file()}"
+
+
+@pytest.fixture(scope="module")
+def results_data_path_fmhafp32acc(data_path: _pl.Path) -> _pl.Path:
+    return data_path / f"gpt2/sampling/{get_base_model_spec().enable_context_fmha_fp32_acc().get_results_file()}"
diff --git a/tests/utils/util.py b/tests/utils/util.py
index d12af5b5a..8184c9f0f 100644
--- a/tests/utils/util.py
+++ b/tests/utils/util.py
@@ -1,6 +1,7 @@
 import os
 import unittest
 from difflib import SequenceMatcher
+from pathlib import Path
 
 import pytest
 import tensorrt as trt
@@ -199,7 +200,6 @@ def create_session(builder,
                    precision="float32",
                    int8=False,
                    fp8=False,
-                   opt_level=None,
                    memory_pool_limit=None,
                    optimization_profiles=[],
                    quant_mode=QuantMode(0)):
@@ -208,14 +208,13 @@ def create_session(builder,
     Args:
         network: a tensorrt_llm.Network object
         precision: the precision of the network, choose from ["float32", "float16", "bfloat16"]
-        **kwargs: builder flags such as int8, fp8, builder_opt, etc.
+        **kwargs: builder flags such as int8, fp8, etc.
     Returns:
         session: a tensorrt_llm.runtime.Session
     """
     builder_config = builder.create_builder_config(precision=precision,
                                                    int8=int8,
                                                    fp8=fp8,
-                                                   opt_level=opt_level,
                                                    quant_mode=quant_mode)
     # Some tests require to set mem pool limit to avoid OOM
     if memory_pool_limit is not None:
@@ -279,3 +278,8 @@ def similarity_score(a, b):
 def similar(a, b, threshold=0.8):
     "similar compare a and b "
     return similarity_score(a, b) >= threshold
+
+
+def get_project_root(test_file: str) -> Path:
+    return next(p for p in Path(test_file).resolve().parents
+                if (p / 'tests').is_dir() and (p / "tensorrt_llm").is_dir())