## Latest News
+* [2024/09/29] 🌟 AI at Meta PyTorch + TensorRT v2.4 🌟 ⚡TensorRT 10.1 ⚡PyTorch 2.4 ⚡CUDA 12.4 ⚡Python 3.12
+[➡️ link](https://github.com/pytorch/TensorRT/releases/tag/v2.4.0)
+
+
+
+
+* [2024/09/17] ✨ NVIDIA TensorRT-LLM Meetup
+[➡️ link](https://drive.google.com/file/d/1RR8GqC-QbuaKuHj82rZcXb3MS20SWo6F/view?usp=share_link)
+
+* [2024/09/17] ✨ Accelerating LLM Inference at Databricks with TensorRT-LLM
+[➡️ link](https://drive.google.com/file/d/1NeSmrLaWRJAY1rxD9lJmzpB9rzr38j8j/view?usp=sharing)
+
+* [2024/09/17] ✨ TensorRT-LLM @ Baseten
+[➡️ link](https://drive.google.com/file/d/1Y7L2jqW-aRmt31mCdqhwvGMmCSOzBUjG/view?usp=share_link)
+
+* [2024/09/04] 🏎️🏎️🏎️ Best Practices for Tuning TensorRT-LLM for Optimal Serving with BentoML
+[➡️ link](https://www.bentoml.com/blog/tuning-tensor-rt-llm-for-optimal-serving-with-bentoml)
+
* [2024/08/20] 🏎️SDXL with #TensorRT Model Optimizer ⏱️⚡ 🏁 cache diffusion 🏁 quantization aware training 🏁 QLoRA 🏁 #Python 3.12
[➡️ link](https://developer.nvidia.com/blog/nvidia-tensorrt-model-optimizer-v0-15-boosts-inference-performance-and-expands-model-support/)
@@ -43,6 +61,9 @@ TensorRT-LLM
* [2024/07/02] Let the @MistralAI MoE tokens fly 📈 🚀 #Mixtral 8x7B with NVIDIA #TensorRT #LLM on #H100.
[➡️ Tech blog](https://developer.nvidia.com/blog/achieving-high-mixtral-8x7b-performance-with-nvidia-h100-tensor-core-gpus-and-tensorrt-llm?ncid=so-twit-928467)
+
+Previous News
+
* [2024/06/24] Enhanced with NVIDIA #TensorRT #LLM, @upstage.ai’s solar-10.7B-instruct is ready to power your developer projects through our API catalog 🏎️. ✨[➡️ link](https://build.nvidia.com/upstage/solar-10_7b-instruct?snippet_tab=Try )
* [2024/06/18] ICYMI: 🤩 Stable Diffusion 3 dropped last week 🎊 🏎️ Speed up your SD3 with #TensorRT INT8 Quantization [➡️ link](https://build.nvidia.com/upstage/solar-10_7b-instruct?snippet_tab=Try )
@@ -55,10 +76,6 @@ Technical Deep Dive for serious coders ✅+99% compression ✅1 set of weights
* [2024/06/04] ✨ #TensorRT and GeForce #RTX unlock ComfyUI SD superhero powers 🦸⚡ 🎥 Demo: [➡️ link](https://youtu.be/64QEVfbPHyg)
📗 DIY notebook: [➡️ link](https://console.brev.dev/launchable/deploy?userID=2x2sil999&orgID=ktj33l4xj&name=ComfyUI_TensorRT&instance=L4%40g2-standard-4%3Anvidia-l4%3A1&diskStorage=500&cloudID=GCP&baseImage=docker.io%2Fpytorch%2Fpytorch%3A2.2.0-cuda12.1-cudnn8-runtime&ports=ComfUI%3A8188&file=https%3A%2F%2Fgithub.com%2Fbrevdev%2Fnotebooks%2Fblob%2Fmain%2Ftensorrt-comfyui.ipynb&launchableID=env-2hQX3n7ae5mq3NjNZ32DfAG0tJf)
-
-Previous News
-
-
* [2024/05/28] ✨#TensorRT weight stripping for ResNet-50 ✨ ✅+99% compression
✅1 set of weights → ** GPUs\ ✅0 performance loss ✅** models…LLM, CNN, etc
👀 📚 DIY [➡️ link](https://console.brev.dev/launchable/deploy?userID=2x2sil999&orgID=ktj33l4xj&launchableID=env-2h6bym7h5GFNho3vpWQQeUYMwTM&instance=L4%40g6.xlarge&diskStorage=500&cloudID=devplane-brev-1&baseImage=nvcr.io%2Fnvidia%2Ftensorrt%3A24.05-py3&file=https%3A%2F%2Fgithub.com%2FNVIDIA%2FTensorRT%2Fblob%2Frelease%2F10.0%2Fsamples%2Fpython%2Fsample_weight_stripping%2Fnotebooks%2Fweight_stripping.ipynb&name=tensorrt_weight_stripping_resnet50)
@@ -68,10 +85,8 @@ Serverless TensorRT-LLM (LLaMA 3 8B) | Modal Docs [➡️ link](https://modal.co
* [2024/05/08] NVIDIA TensorRT Model Optimizer -- the newest member of the #TensorRT ecosystem is a library of post-training and training-in-the-loop model optimization techniques ✅quantization ✅sparsity ✅QAT [➡️ blog](https://developer.nvidia.com/blog/accelerate-generative-ai-inference-performance-with-nvidia-tensorrt-model-optimizer-now-publicly-available/)
-
* [2024/05/07] 🦙🦙🦙 24,000 tokens per second 🛫Meta Llama 3 takes off with #TensorRT #LLM 📚[➡️ link](https://blogs.nvidia.com/blog/meta-llama3-inference-acceleration/)
-
* [2024/02/06] [🚀 Speed up inference with SOTA quantization techniques in TRT-LLM](./docs/source/blogs/quantization-in-TRT-LLM.md)
* [2024/01/30] [ New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget](./docs/source/blogs/XQA-kernel.md)
* [2023/12/04] [Falcon-180B on a single H200 GPU with INT4 AWQ, and 6.7x faster Llama-70B over A100](./docs/source/blogs/Falcon180B-H200.md)
@@ -88,7 +103,7 @@ Serverless TensorRT-LLM (LLaMA 3 8B) | Modal Docs [➡️ link](https://modal.co
## TensorRT-LLM Overview
TensorRT-LLM is a library for optimizing Large Language Model (LLM) inference.
-It provides state-of-the-art optimziations, including custom attention kernels, inflight batching, paged KV caching, quantization (FP8, INT4 [AWQ](https://arxiv.org/abs/2306.00978), INT8 [SmoothQuant](https://arxiv.org/abs/2211.10438), ++) and much more, to perform inference efficiently on NVIDIA GPUs
+It provides state-of-the-art optimizations, including custom attention kernels, inflight batching, paged KV caching, quantization (FP8, INT4 [AWQ](https://arxiv.org/abs/2306.00978), INT8 [SmoothQuant](https://arxiv.org/abs/2211.10438), ++) and much more, to perform inference efficiently on NVIDIA GPUs
TensorRT-LLM provides a Python API to build LLMs into optimized
[TensorRT](https://developer.nvidia.com/tensorrt) engines.
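
The resulting engines are consumed at runtime through the C++ `executor` API that the benchmark changes below exercise. A minimal sketch of that flow, assuming a prebuilt decoder-only engine directory (the path and the token ids are placeholders, not part of this diff):

```cpp
// Minimal sketch of running a prebuilt TensorRT-LLM engine through the C++ executor API.
// The engine directory and the prompt token ids are placeholders.
#include "tensorrt_llm/executor/executor.h"

#include <filesystem>
#include <iostream>

namespace texec = tensorrt_llm::executor;

int main()
{
    texec::ExecutorConfig executorConfig;
    texec::Executor executor(
        std::filesystem::path{"/path/to/engine_dir"}, texec::ModelType::kDECODER_ONLY, executorConfig);

    // One request: prompt token ids plus the maximum number of tokens to generate.
    texec::Request request(texec::VecTokens{1, 2, 3, 4}, /*maxTokens=*/32);
    auto const requestId = executor.enqueueRequest(request);

    for (auto const& response : executor.awaitResponses(requestId))
    {
        if (!response.hasError())
        {
            auto const& beams = response.getResult().outputTokenIds;
            std::cout << "generated " << beams.front().size() << " tokens" << std::endl;
        }
    }
    return 0;
}
```
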
diff --git a/benchmarks/README.md b/benchmarks/README.md
index 00f450319..b368a6621 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -7,5 +7,6 @@ There are currently three workflows to benchmark TensorRT-LLM:
- The recommended workflow that uses TensorRT-LLM C++ API and can take advantage of the latest features of TensorRT-LLM.
* [Python benchmarks](./python)
- The Python benchmarking scripts can only benchmark the Python runtime, which does not support the latest features, such as in-flight batching.
-* [The Python benchmarking suite](./Suite.md)
- - This benchmarking suite is a current work in progress and is prone to large changes.
+* [The Python benchmarking suite](../docs/source/performance/perf-benchmarking.md)
+ - This benchmarking suite is native to TensorRT-LLM and is written in Python to reproduce and test TensorRT-LLM performance.
+ - _NOTE_: This benchmarking suite is a current work in progress and is prone to large changes.
diff --git a/benchmarks/cpp/gptManagerBenchmark.cpp b/benchmarks/cpp/gptManagerBenchmark.cpp
index 45632350c..8e5d94a12 100644
--- a/benchmarks/cpp/gptManagerBenchmark.cpp
+++ b/benchmarks/cpp/gptManagerBenchmark.cpp
@@ -145,6 +145,7 @@ struct BenchmarkParams
{
std::optional<SizeType32> maxTokensInPagedKvCache{std::nullopt};
std::optional<float> freeGpuMemoryFraction{std::nullopt};
+ std::optional<float> crossKvCacheFraction{std::nullopt};
bool enableTrtOverlap{false};
bool enableBlockReuse{false};
bool enableChunkedContext{false};
@@ -159,6 +160,8 @@ struct BenchmarkParams
std::optional<SizeType32> sinkTokenLength{std::nullopt};
bool multiBlockMode{true};
bool enableContextFMHAFP32Acc{false};
+ bool cudaGraphMode{false};
+ SizeType32 cudaGraphCacheSize{0};
// lora / peft params
std::optional<std::string> loraDir{std::nullopt};
@@ -470,7 +473,38 @@ class Recorder
mRequestBenchInfos[requestId].firstTokenSeen = true;
}
- mRequestBenchInfos[requestId].outputLength += 1;
+ mRequestBenchInfos[requestId].decodingIter += 1;
+ }
+
+ void recordToken(uint64_t requestId, std::list<NamedTensor> const& responseTensors)
+ {
+ int32_t outputLength = 1;
+ for (auto& tensor : responseTensors)
+ {
+ if (tensor.name == inference_request::kSequenceLengthTensorName)
+ {
+ // Tensor of shape nBeams, and we only need the first one
+ outputLength = *(bufferCast<int32_t>(*(tensor.tensor)));
+ break;
+ }
+ }
+
+ mRequestBenchInfos[requestId].outputLength += outputLength;
+ this->recordToken(requestId);
+ }
+
+ void recordToken(uint64_t requestId, texec::Response const& response)
+ {
+ auto outputTokenIds = response.getResult().outputTokenIds;
+
+ int32_t outputLength = 1;
+ for (auto const& beam : outputTokenIds)
+ {
+ outputLength = std::max(static_cast<int32_t>(beam.size()), outputLength);
+ }
+
+ mRequestBenchInfos[requestId].outputLength += outputLength;
+ this->recordToken(requestId);
}
void recordEnd(uint64_t requestId, std::list<NamedTensor> const& responseTensors, bool hasError)
@@ -500,7 +534,7 @@ class Recorder
}
else
{
- this->recordToken(requestId);
+ this->recordToken(requestId, responseTensors);
}
}
@@ -532,7 +566,7 @@ class Recorder
}
else
{
- this->recordToken(requestId);
+ this->recordToken(requestId, response);
}
}
}
@@ -818,11 +852,13 @@ class ExecutorServer
texec::SchedulerConfig schedulerConfig(capacitySchedulerPolicy);
texec::KvCacheConfig kvCacheConfig(benchmarkParams.enableBlockReuse, benchmarkParams.maxTokensInPagedKvCache,
benchmarkParams.maxAttentionWindowVec, benchmarkParams.sinkTokenLength,
- benchmarkParams.freeGpuMemoryFraction, benchmarkParams.kvHostCacheSize, benchmarkParams.kvOnboardBlocks);
+ benchmarkParams.freeGpuMemoryFraction, benchmarkParams.kvHostCacheSize, benchmarkParams.kvOnboardBlocks,
+ benchmarkParams.crossKvCacheFraction);
texec::PeftCacheConfig peftCacheConfig(0, benchmarkParams.loraDeviceNumModLayers, 8, 64, 4, 4, 4, 24, 8,
std::nullopt, benchmarkParams.loraHostCacheSize);
- texec::ExtendedRuntimePerfKnobConfig extendedRuntimePerfKnobConfig(
- benchmarkParams.multiBlockMode, benchmarkParams.enableContextFMHAFP32Acc);
+ texec::ExtendedRuntimePerfKnobConfig extendedRuntimePerfKnobConfig(benchmarkParams.multiBlockMode,
+ benchmarkParams.enableContextFMHAFP32Acc, benchmarkParams.cudaGraphMode,
+ benchmarkParams.cudaGraphCacheSize);
texec::ExecutorConfig executorConfig(
maxBeamWidth, schedulerConfig, kvCacheConfig, benchmarkParams.enableChunkedContext, true);
executorConfig.setGpuWeightsPercent(benchmarkParams.gpuWeightsPercent);
@@ -940,7 +976,7 @@ class ExecutorServer
{
if (!warmup && !response.hasError())
{
- mRecorder->recordToken(reqId);
+ mRecorder->recordToken(reqId, response);
}
}
}
@@ -1228,7 +1264,7 @@ class GptServer
{
if (errMsg.empty())
{
- mRecorder->recordToken(requestId);
+ mRecorder->recordToken(requestId, response_tensors);
}
}
}
@@ -1430,6 +1466,10 @@ void benchmarkGptManager(std::filesystem::path const& engineDir, TrtGptModelType
{
optionalParams.kvCacheConfig.freeGpuMemoryFraction = benchmarkParams.freeGpuMemoryFraction;
}
+ if (benchmarkParams.crossKvCacheFraction)
+ {
+ optionalParams.kvCacheConfig.crossKvCacheFraction = benchmarkParams.crossKvCacheFraction;
+ }
if (benchmarkParams.maxAttentionWindowVec)
{
optionalParams.kvCacheConfig.maxAttentionWindowVec = benchmarkParams.maxAttentionWindowVec;
@@ -1458,8 +1498,8 @@ void benchmarkGptManager(std::filesystem::path const& engineDir, TrtGptModelType
: benchmarkParams.executorLookaheadConfig.has_value() ? texec::DecodingMode::Lookahead()
: texec::DecodingMode::Auto(),
benchmarkParams.executorLookaheadConfig, benchmarkParams.medusaChoices);
- optionalParams.extendedRuntimePerfKnobConfig = texec::ExtendedRuntimePerfKnobConfig(
- benchmarkParams.multiBlockMode, benchmarkParams.enableContextFMHAFP32Acc);
+ optionalParams.extendedRuntimePerfKnobConfig = texec::ExtendedRuntimePerfKnobConfig(benchmarkParams.multiBlockMode,
+ benchmarkParams.enableContextFMHAFP32Acc, benchmarkParams.cudaGraphMode, benchmarkParams.cudaGraphCacheSize);
auto const jsonConfig = GptJsonConfig::parse(engineDir / "config.json");
auto const worldConfig = WorldConfig::mpi(jsonConfig.getGpusPerNode(), jsonConfig.getTensorParallelism(),
@@ -1874,6 +1914,8 @@ int main(int argc, char* argv[])
"random_seed", "integer random seed for exponential time delays.", cxxopts::value()->default_value("420"));
options.add_options()(
"kv_cache_free_gpu_mem_fraction", "K-V Cache Free Gpu Mem Fraction.", cxxopts::value());
+ options.add_options()(
+ "cross_kv_cache_fraction", "Cross K-V Cache Fraction (from 0.0 to 1.0).", cxxopts::value());
options.add_options()("request_rate",
"request rate in reqs/sec. Skipping this arg or negative value will trigger offline/0-delay.",
cxxopts::value<float>());
@@ -1895,7 +1937,8 @@ int main(int argc, char* argv[])
options.add_options()("return_generation_logits", "Whether to return generation logits.",
cxxopts::value()->default_value("false"));
- options.add_options()("scheduler_policy", "Choose scheduler policy between max_utilization/guaranteed_no_evict.",
+ options.add_options()("scheduler_policy",
+ "Choose scheduler policy between max_utilization/guaranteed_no_evict/static_batch.",
cxxopts::value()->default_value("guaranteed_no_evict"));
options.add_options()("first_batch_delay",
@@ -1946,6 +1989,12 @@ int main(int argc, char* argv[])
cxxopts::value()->default_value("true"));
options.add_options()(
"encoder_engine_dir", "Directory that store the engines of the encoder models.", cxxopts::value());
+ options.add_options()("cuda_graph_mode", "When enabled, inference is executed with cuda graph.",
+ cxxopts::value()->default_value("false"));
+ options.add_options()("cuda_graph_cache_size",
+ "Specify how many cuda graphs are cached in the runtime. Larger cache gives better perf, but consumes more GPU "
+ "memory.",
+ cxxopts::value()->default_value("0"));
options.add_options()("enable_context_fmha_fp32_acc", "Enable FMHA runner FP32 accumulation",
cxxopts::value()->default_value("false"));
@@ -2040,6 +2089,20 @@ int main(int argc, char* argv[])
{
benchmarkParams.freeGpuMemoryFraction = result["kv_cache_free_gpu_mem_fraction"].as<float>();
}
+ // Argument: K-V Cache Cross Attention Fraction. Only applicable to enc-dec models.
+ if (result.count("encoder_engine_dir") && result.count("decoder_engine_dir"))
+ {
+ if (result.count("cross_kv_cache_fraction"))
+ {
+ benchmarkParams.crossKvCacheFraction = result["cross_kv_cache_fraction"].as<float>();
+ }
+ else
+ {
+ benchmarkParams.crossKvCacheFraction
+ = 0.5f; // default value if not set; non enc-dec models should not have this param set at all
+ }
+ }
+
// Argument: Enable TRT overlap
benchmarkParams.enableTrtOverlap = result["enable_trt_overlap"].as<bool>();
@@ -2131,6 +2194,12 @@ int main(int argc, char* argv[])
// Argument: enable_context_fmha_fp32_acc
benchmarkParams.enableContextFMHAFP32Acc = result["enable_context_fmha_fp32_acc"].as<bool>();
+ // Argument: cuda_graph_mode
+ benchmarkParams.cudaGraphMode = result["cuda_graph_mode"].as<bool>();
+
+ // Argument: cuda_graph_cache_size
+ benchmarkParams.cudaGraphCacheSize = result["cuda_graph_cache_size"].as<SizeType32>();
+
std::optional padId;
// Argument: Padding token id
if (result.count("pad_id"))
@@ -2168,6 +2237,10 @@ int main(int argc, char* argv[])
{
capacitySchedulerPolicy = texec::CapacitySchedulerPolicy::kGUARANTEED_NO_EVICT;
}
+ else if (capacitySchedulerPolicyArg == "static_batch")
+ {
+ capacitySchedulerPolicy = texec::CapacitySchedulerPolicy::kSTATIC_BATCH;
+ }
else
{
TLLM_LOG_ERROR("Unexpected scheduler policy: " + capacitySchedulerPolicyArg);
@@ -2246,14 +2319,14 @@ int main(int argc, char* argv[])
{
texec::ModelType executorModelType;
std::optional<std::filesystem::path> decoderEngineDir = std::nullopt, encoderEngineDir = std::nullopt;
- if (result.count("encoder_engine_dir") && result.count("engine_dir"))
+ if (result.count("encoder_engine_dir") && result.count("decoder_engine_dir"))
{
TLLM_CHECK_WITH_INFO(api == "executor", "encoder-decoder only support executor api.");
TLLM_CHECK_WITH_INFO(
modelType == TrtGptModelType::InflightFusedBatching, "encoder-decoder only support inflight batching.");
executorModelType = texec::ModelType::kENCODER_DECODER;
- decoderEngineDir = result["engine_dir"].as<std::string>();
encoderEngineDir = result["encoder_engine_dir"].as<std::string>();
+ decoderEngineDir = result["decoder_engine_dir"].as<std::string>();
}
else if (result.count("engine_dir"))
{
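
Taken together, the benchmark changes above thread three new knobs into the executor configuration: `cross_kv_cache_fraction` for encoder-decoder KV-cache partitioning, plus `cuda_graph_mode` and `cuda_graph_cache_size` for CUDA-graph execution. A sketch of how those values land in the executor objects follows; the values are illustrative, and the `ExecutorConfig`/`KvCacheConfig` setter names are assumed here, while the `ExtendedRuntimePerfKnobConfig` constructor mirrors the calls in the diff.

```cpp
// Sketch of wiring the new benchmark flags into the executor configuration.
#include "tensorrt_llm/executor/executor.h"

namespace texec = tensorrt_llm::executor;

texec::ExecutorConfig makeConfigSketch()
{
    bool const multiBlockMode = true;
    bool const enableContextFMHAFP32Acc = false;
    bool const cudaGraphMode = true;                 // new: run generation steps through CUDA graphs
    texec::SizeType32 const cudaGraphCacheSize = 4;  // new: number of CUDA graphs kept cached in the runtime

    // The two CUDA-graph knobs are appended to the existing ExtendedRuntimePerfKnobConfig arguments.
    texec::ExtendedRuntimePerfKnobConfig perfKnobConfig(
        multiBlockMode, enableContextFMHAFP32Acc, cudaGraphMode, cudaGraphCacheSize);

    // For encoder-decoder engines, a fraction of the KV-cache budget is reserved for cross attention.
    texec::KvCacheConfig kvCacheConfig;
    kvCacheConfig.setCrossKvCacheFraction(0.5f); // assumed setter; mirrors the new constructor argument

    texec::ExecutorConfig executorConfig;
    executorConfig.setKvCacheConfig(kvCacheConfig);
    executorConfig.setExtendedRuntimePerfKnobConfig(perfKnobConfig);
    return executorConfig;
}
```
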
diff --git a/benchmarks/cpp/utils/prepare_real_data.py b/benchmarks/cpp/utils/prepare_real_data.py
index 5f14f6747..94383cfa2 100644
--- a/benchmarks/cpp/utils/prepare_real_data.py
+++ b/benchmarks/cpp/utils/prepare_real_data.py
@@ -231,8 +231,6 @@ def dataset(root_args, **kwargs):
}, root_args.output)
else:
print_dataset(
- task_ids,
input_ids,
output_lens,
- tokenizer=None,
)
diff --git a/benchmarks/python/gpt_benchmark.py b/benchmarks/python/gpt_benchmark.py
index 04ba2ab0f..ce06c9f9f 100644
--- a/benchmarks/python/gpt_benchmark.py
+++ b/benchmarks/python/gpt_benchmark.py
@@ -80,7 +80,7 @@ def __init__(self, args, batch_sizes, in_out_lens, gpu_weights_percents,
kv_cache_type = KVCacheType.CONTINUOUS
if hasattr(self, 'kv_cache_type'):
- kv_cache_type = self.kv_cache_type
+ kv_cache_type = KVCacheType(self.kv_cache_type)
else:
if hasattr(self, 'paged_kv_cache'):
kv_cache_type = KVCacheType.PAGED if self.paged_kv_cache == True else KVCacheType.CONTINUOUS
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 32e89ae17..125526f7e 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -316,6 +316,8 @@ endif()
get_filename_component(TRT_LLM_ROOT_DIR ${CMAKE_CURRENT_SOURCE_DIR} PATH)
set(3RDPARTY_DIR ${TRT_LLM_ROOT_DIR}/3rdparty)
+add_subdirectory(${3RDPARTY_DIR}/pybind11 ${CMAKE_CURRENT_BINARY_DIR}/pybind11)
+
include_directories(
${CUDAToolkit_INCLUDE_DIRS}
${CUDNN_ROOT_DIR}/include
@@ -323,7 +325,8 @@ include_directories(
${3RDPARTY_DIR}/cutlass/include
${3RDPARTY_DIR}/cutlass/tools/util/include
${3RDPARTY_DIR}/NVTX/include
- ${3RDPARTY_DIR}/json/include)
+ ${3RDPARTY_DIR}/json/include
+ ${3RDPARTY_DIR}/pybind11/include)
# TRT dependencies
set_ifndef(TRT_LIB_DIR ${CMAKE_BINARY_DIR})
@@ -381,7 +384,7 @@ endif()
# set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -G")
set(CMAKE_CXX_FLAGS
- "${CMAKE_CXX_FLAGS} -DBUILD_SYSTEM=cmake_oss -DENABLE_MULTI_DEVICE=${ENABLE_MULTI_DEVICE} -DENABLE_UCX=${ENABLE_UCX}"
+ "${CMAKE_CXX_FLAGS} -DBUILD_SYSTEM=cmake_oss -DENABLE_MULTI_DEVICE=${ENABLE_MULTI_DEVICE}"
)
# Fix linking issue with TRT 10, the detailed description about `--mcmodel` can
@@ -561,6 +564,7 @@ if(ENABLE_UCX)
NO_DEFAULT_PATH)
endif()
endif()
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DENABLE_UCX=${ENABLE_UCX}")
file(STRINGS "${TRT_INCLUDE_DIR}/NvInferVersion.h" VERSION_STRINGS
REGEX "#define NV_TENSORRT_.*")
diff --git a/cpp/include/tensorrt_llm/batch_manager/capacityScheduler.h b/cpp/include/tensorrt_llm/batch_manager/capacityScheduler.h
new file mode 100644
index 000000000..a08544e2a
--- /dev/null
+++ b/cpp/include/tensorrt_llm/batch_manager/capacityScheduler.h
@@ -0,0 +1,187 @@
+/*
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "common.h"
+#include "tensorrt_llm/batch_manager/llmRequest.h"
+#include "tensorrt_llm/common/algorithm.h"
+#include "tensorrt_llm/runtime/common.h"
+#include <variant>
+
+namespace tensorrt_llm::batch_manager
+{
+namespace kv_cache_manager
+{
+class KVCacheManager;
+}
+class BasePeftCacheManager;
+} // namespace tensorrt_llm::batch_manager
+
+namespace tensorrt_llm::batch_manager
+{
+
+using tensorrt_llm::runtime::SizeType32;
+
+/// @brief This scheduler takes into account the given request capacity and the KV cache capacity.
+/// Depending on the CapacitySchedulerPolicy it will schedule already started and new requests,
+/// or even pause previously started requests.
+class BaseCapacityScheduler
+{
+public:
+ explicit BaseCapacityScheduler(LlmRequestState noScheduleUntilState, LlmRequestState noScheduleAfterState)
+ : mNoScheduleUntilState(noScheduleUntilState)
+ , mNoScheduleAfterState(noScheduleAfterState)
+ {
+ }
+
+ [[nodiscard]] LlmRequestState constexpr getNoScheduleUntilState() const noexcept
+ {
+ return mNoScheduleUntilState;
+ }
+
+ [[nodiscard]] LlmRequestState constexpr getNoScheduleAfterState() const noexcept
+ {
+ return mNoScheduleAfterState;
+ }
+
+private:
+ /// The state until/after which the scheduler should not schedule requests
+ LlmRequestState mNoScheduleUntilState;
+ LlmRequestState mNoScheduleAfterState;
+};
+
+/// @brief Schedule up to maxNumRequests requests
+class MaxRequestsScheduler : public BaseCapacityScheduler
+{
+public:
+ explicit MaxRequestsScheduler(SizeType32 maxNumRequests,
+ std::shared_ptr<kv_cache_manager::KVCacheManager> kvCacheManager,
+ std::shared_ptr<kv_cache_manager::KVCacheManager> crossKvCacheManager,
+ LlmRequestState noScheduleUntilState = LlmRequestState::kCONTEXT_INIT,
+ LlmRequestState noScheduleAfterState = LlmRequestState::kGENERATION_COMPLETE);
+
+ /// @brief Takes as input a sorted list of requests and outputs the sorted lists of requests
+ /// to update for this current iteration, and a list of requests to pause
+ [[nodiscard]] std::tuple<RequestVector, RequestVector> operator()(RequestList const& activeRequests) const;
+
+private:
+ SizeType32 mMaxNumRequests;
+ std::shared_ptr<kv_cache_manager::KVCacheManager> mKvCacheManager{nullptr};
+ std::shared_ptr<kv_cache_manager::KVCacheManager> mCrossKvCacheManager{nullptr};
+};
+
+/// @brief Schedule requests using the MAX_UTILIZATION policy
+/// @details Try reserving resources to advance requests by one step,
+/// may pause previously started requests.
+class MaxUtilizationScheduler : public BaseCapacityScheduler
+{
+public:
+ MaxUtilizationScheduler(SizeType32 maxNumRequests, std::shared_ptr<kv_cache_manager::KVCacheManager> kvCacheManager,
+ std::shared_ptr<kv_cache_manager::KVCacheManager> crossKvCacheManager,
+ std::shared_ptr<BasePeftCacheManager> peftCacheManager, bool manyMicroBatches,
+ LlmRequestState noScheduleUntilState = LlmRequestState::kCONTEXT_INIT,
+ LlmRequestState noScheduleAfterState = LlmRequestState::kGENERATION_COMPLETE);
+
+ [[nodiscard]] std::tuple<RequestVector, RequestVector> operator()(RequestList const& activeRequests) const;
+
+private:
+ /// @return {fitsKvCache, fitsPeft}
+ std::pair<bool, bool> trySchedulingRequestMaxUtilization(std::shared_ptr<LlmRequest> const& req,
+ RequestVector& scheduledRequests, SizeType32& numScheduledBlocks, SizeType32& numScheduledPeftPages,
+ std::unordered_set<uint64_t>& seenTaskIds) const;
+
+ SizeType32 mMaxNumRequests;
+ std::shared_ptr<kv_cache_manager::KVCacheManager> mKvCacheManager{nullptr};
+ std::shared_ptr<kv_cache_manager::KVCacheManager> mCrossKvCacheManager{nullptr};
+ std::shared_ptr<BasePeftCacheManager> mPeftCacheManager{nullptr};
+ /// @brief Boolean that indicates if multiple micro batches might be in flight
+ bool mManyMicroBatches;
+};
+
+/// @brief Schedule requests using the GUARANTEED_NO_EVICT policy
+class GuaranteedNoEvictScheduler : public BaseCapacityScheduler
+{
+public:
+ GuaranteedNoEvictScheduler(SizeType32 maxNumRequests,
+ std::shared_ptr<kv_cache_manager::KVCacheManager> kvCacheManager,
+ std::shared_ptr<kv_cache_manager::KVCacheManager> crossKvCacheManager,
+ std::shared_ptr<BasePeftCacheManager> peftCacheManager,
+ LlmRequestState noScheduleUntilState = LlmRequestState::kCONTEXT_INIT,
+ LlmRequestState noScheduleAfterState = LlmRequestState::kGENERATION_COMPLETE);
+
+ [[nodiscard]] std::tuple<RequestVector, RequestVector> operator()(RequestList const& activeRequests) const;
+
+protected:
+ [[nodiscard]] std::tuple<RequestVector, RequestVector> forwardImpl(
+ RequestList const& activeRequests, bool staticBatchScheduling) const;
+
+private:
+ SizeType32 mMaxNumRequests;
+ std::shared_ptr<kv_cache_manager::KVCacheManager> mKvCacheManager{nullptr};
+ std::shared_ptr<kv_cache_manager::KVCacheManager> mCrossKvCacheManager{nullptr};
+ std::shared_ptr<BasePeftCacheManager> mPeftCacheManager{nullptr};
+};
+
+/// @brief Schedule requests using the STATIC_BATCH policy
+class StaticBatchScheduler : public GuaranteedNoEvictScheduler
+{
+public:
+ StaticBatchScheduler(SizeType32 maxNumRequests, std::shared_ptr<kv_cache_manager::KVCacheManager> kvCacheManager,
+ std::shared_ptr<kv_cache_manager::KVCacheManager> crossKvCacheManager,
+ std::shared_ptr<BasePeftCacheManager> peftCacheManager,
+ LlmRequestState noScheduleUntilState = LlmRequestState::kCONTEXT_INIT,
+ LlmRequestState noScheduleAfterState = LlmRequestState::kGENERATION_COMPLETE);
+
+ [[nodiscard]] std::tuple<RequestVector, RequestVector> operator()(RequestList const& activeRequests) const;
+};
+
+class CapacityScheduler : public Algorithm
+{
+public:
+ constexpr static auto name{"CapacityScheduler"};
+
+ CapacityScheduler() = default;
+
+ CapacityScheduler(SizeType32 maxNumRequests, std::shared_ptr<kv_cache_manager::KVCacheManager> kvCacheManager,
+ std::shared_ptr<kv_cache_manager::KVCacheManager> crossKvCacheManager,
+ std::shared_ptr<BasePeftCacheManager> peftCacheManager,
+ executor::CapacitySchedulerPolicy capacitySchedulerPolicy, bool manyMicroBatches = false,
+ LlmRequestState noScheduleUntilState = LlmRequestState::kCONTEXT_INIT,
+ LlmRequestState noScheduleAfterState = LlmRequestState::kGENERATION_COMPLETE);
+
+ static CapacityScheduler make(SizeType32 maxNumRequests,
+ std::shared_ptr<kv_cache_manager::KVCacheManager> kvCacheManager,
+ std::shared_ptr<kv_cache_manager::KVCacheManager> crossKvCacheManager,
+ std::shared_ptr<BasePeftCacheManager> peftCacheManager,
+ executor::CapacitySchedulerPolicy capacitySchedulerPolicy, bool manyMicroBatches = false,
+ LlmRequestState noScheduleUntilState = LlmRequestState::kCONTEXT_INIT,
+ LlmRequestState noScheduleAfterState = LlmRequestState::kGENERATION_COMPLETE)
+ {
+ return CapacityScheduler{maxNumRequests, std::move(kvCacheManager), std::move(crossKvCacheManager),
+ std::move(peftCacheManager), capacitySchedulerPolicy, manyMicroBatches, noScheduleUntilState,
+ noScheduleAfterState};
+ }
+
+ [[nodiscard]] std::tuple<RequestVector, RequestVector> operator()(RequestList const& activeRequests) const;
+
+private:
+ std::variant<std::monostate, MaxRequestsScheduler, MaxUtilizationScheduler, GuaranteedNoEvictScheduler, StaticBatchScheduler>
+ mScheduler;
+};
+
+} // namespace tensorrt_llm::batch_manager
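
A short usage sketch for the header above: the `make` factory picks the concrete scheduler held by the internal `std::variant` from the requested policy, and one call per iteration returns the requests to run and the requests to pause. The cache-manager arguments are placeholders supplied by the caller; only the interfaces declared above are used.

```cpp
// Sketch: constructing the CapacityScheduler and running one scheduling step.
#include "tensorrt_llm/batch_manager/capacityScheduler.h"

#include <memory>
#include <tuple>
#include <utility>

using namespace tensorrt_llm::batch_manager;

std::tuple<RequestVector, RequestVector> scheduleOnce(SizeType32 maxNumRequests,
    std::shared_ptr<kv_cache_manager::KVCacheManager> kvCacheManager,
    std::shared_ptr<kv_cache_manager::KVCacheManager> crossKvCacheManager,
    std::shared_ptr<BasePeftCacheManager> peftCacheManager, RequestList const& activeRequests)
{
    // The policy decides which concrete scheduler the internal std::variant holds;
    // kSTATIC_BATCH is the policy added by this change.
    auto const scheduler = CapacityScheduler::make(maxNumRequests, std::move(kvCacheManager),
        std::move(crossKvCacheManager), std::move(peftCacheManager),
        tensorrt_llm::executor::CapacitySchedulerPolicy::kSTATIC_BATCH);

    // First element: requests to advance this iteration; second element: requests to pause.
    return scheduler(activeRequests);
}
```
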
diff --git a/cpp/include/tensorrt_llm/batch_manager/common.h b/cpp/include/tensorrt_llm/batch_manager/common.h
new file mode 100644
index 000000000..6e4a76bc4
--- /dev/null
+++ b/cpp/include/tensorrt_llm/batch_manager/common.h
@@ -0,0 +1,118 @@
+/*
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "tensorrt_llm/runtime/common.h"
+#include <cstdint>
+#include <functional>
+#include <list>
+#include <memory>
+#include <unordered_set>
+#include <vector>
+
+namespace tensorrt_llm::executor
+{
+class RequestWithId;
+}
+
+namespace tensorrt_llm::batch_manager
+{
+class LlmRequest;
+
+using RequestList = std::list<std::shared_ptr<LlmRequest>>;
+using RequestIdType = std::uint64_t;
+using RequestVector = std::vector<std::shared_ptr<LlmRequest>>;
+using ReqIdsSet = std::unordered_set<RequestIdType>;
+
+class ScheduledRequests
+{
+public:
+ /// @brief context phase requests (for decoder-only models) or encoder phase requests (for encoder-decoder models
+ /// and encoder-only models)
+ RequestVector contextRequests;
+
+ /// @brief generation phase requests (for decoder-only models) or empty for others
+ RequestVector generationRequests;
+
+ ScheduledRequests() = default;
+
+ explicit ScheduledRequests(RequestVector contextRequests, RequestVector generationRequests)
+ : contextRequests{std::move(contextRequests)}
+ , generationRequests{std::move(generationRequests)}
+ {
+ }
+
+ [[nodiscard]] bool empty() const
+ {
+ return contextRequests.empty() && generationRequests.empty();
+ }
+
+ [[nodiscard]] std::size_t size() const
+ {
+ return contextRequests.size() + generationRequests.size();
+ }
+};
+
+class BatchState
+{
+public:
+ BatchState() = default;
+
+ BatchState(runtime::SizeType32 numCtxRequests, runtime::SizeType32 numGenRequests, runtime::SizeType32 numTokens,
+ runtime::SizeType32 maxKvCacheLength)
+ : mNumCtxRequests{numCtxRequests}
+ , mNumGenRequests{numGenRequests}
+ , mNumTokens{numTokens}
+ , mMaxKvCacheLength{maxKvCacheLength}
+ {
+ }
+
+ bool isAnyContext() const
+ {
+ return mNumCtxRequests > 0;
+ }
+
+ bool operator==(BatchState const& other) const
+ {
+ return mNumCtxRequests == other.mNumCtxRequests && mNumGenRequests == other.mNumGenRequests
+ && mNumTokens == other.mNumTokens && mMaxKvCacheLength == other.mMaxKvCacheLength;
+ }
+
+ size_t hash() const
+ {
+ size_t h1 = std::hash<runtime::SizeType32>{}(mNumCtxRequests);
+ size_t h2 = std::hash<runtime::SizeType32>{}(mNumGenRequests);
+ size_t h3 = std::hash<runtime::SizeType32>{}(mNumTokens);
+ size_t h4 = std::hash<runtime::SizeType32>{}(mMaxKvCacheLength);
+ return h1 ^ h2 ^ h3 ^ h4;
+ }
+
+ runtime::SizeType32 mNumCtxRequests;
+ runtime::SizeType32 mNumGenRequests;
+ runtime::SizeType32 mNumTokens;
+ runtime::SizeType32 mMaxKvCacheLength;
+};
+
+struct BatchStateHash
+{
+ size_t operator()(BatchState const& bs) const
+ {
+ return bs.hash();
+ }
+};
+
+} // namespace tensorrt_llm::batch_manager
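
`BatchState` together with `BatchStateHash` is shaped for use as an unordered-map key, for example to look up resources cached per batch shape (a CUDA-graph cache keyed this way is one plausible use, though that is an assumption here). A minimal sketch with placeholder values:

```cpp
// Sketch: BatchState + BatchStateHash as an unordered_map key.
#include "tensorrt_llm/batch_manager/common.h"

#include <string>
#include <unordered_map>

using tensorrt_llm::batch_manager::BatchState;
using tensorrt_llm::batch_manager::BatchStateHash;

int main()
{
    std::unordered_map<BatchState, std::string, BatchStateHash> cache;

    // Two batches with the same shape hash to the same bucket and compare equal via operator==.
    BatchState a{/*numCtxRequests=*/0, /*numGenRequests=*/8, /*numTokens=*/8, /*maxKvCacheLength=*/4096};
    BatchState b{0, 8, 8, 4096};

    cache[a] = "resource cached for this batch shape";
    return cache.count(b) == 1 ? 0 : 1; // b finds the entry stored under a
}
```
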
diff --git a/cpp/include/tensorrt_llm/batch_manager/evictionPolicy.h b/cpp/include/tensorrt_llm/batch_manager/evictionPolicy.h
new file mode 100644
index 000000000..a7326eee7
--- /dev/null
+++ b/cpp/include/tensorrt_llm/batch_manager/evictionPolicy.h
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "tensorrt_llm/batch_manager/kvCacheManager.h"
+
+#include
+
+using namespace tensorrt_llm::batch_manager::kv_cache_manager;
+
+namespace tensorrt_llm::batch_manager::eviction_policy
+{
+
+class BaseEvictionPolicy
+{
+public:
+ virtual ~BaseEvictionPolicy() = default;
+
+ virtual void initialize(
+ std::vector<BlockPtr>& mAllBlocksById, SizeType32 numPrimaryBlocks, SizeType32 numSecondaryBlocks)
+ = 0;
+
+ // Get a free block from the primary memory pool
+ virtual BlockPtr getFreePrimaryBlock() = 0;
+ // Get a free block from the secondary memory pool
+ virtual BlockPtr getFreeSecondaryBlock() = 0;
+ // Release a block. Prioritize the block for eviction if toFront=true
+ virtual void releaseBlock(BlockPtr block, bool toFront = false) = 0;
+ // Get the amount of free blocks in the primary memory pool
+ virtual SizeType32 getNumFreePrimaryBlocks() = 0;
+ // Get the amount of free blocks in the secondary memory pool
+ virtual SizeType32 getNumFreeSecondaryBlocks() = 0;
+ // Claim a free block. Called when the cache manager allocates or reuses a new block
+ virtual void claimBlock(KVCacheBlock block) = 0;
+};
+
+class LRUEvictionPolicy : public BaseEvictionPolicy
+{
+public:
+ void initialize(
+ std::vector<BlockPtr>& mAllBlocksById, SizeType32 numPrimaryBlocks, SizeType32 numSecondaryBlocks) override;
+ BlockPtr getFreePrimaryBlock() override;
+ BlockPtr getFreeSecondaryBlock() override;
+ void releaseBlock(BlockPtr block, bool toFront = false) override;
+ SizeType32 getNumFreePrimaryBlocks() override;
+ SizeType32 getNumFreeSecondaryBlocks() override;
+
+ void claimBlock(KVCacheBlock block);
+
+private:
+ FreeBlocksQueue mFreePrimaryBlocks;
+ FreeBlocksQueue mFreeSecondaryBlocks;
+
+ std::vector<std::optional<FreeBlocksQueue::iterator>> mFreeBlockIterators;
+
+ SizeType32 mFreePrimaryBlocksSize;
+ SizeType32 mFreeSecondaryBlocksSize;
+};
+
+} // namespace tensorrt_llm::batch_manager::eviction_policy
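
The intended call protocol against the interface above, as a sketch: take a free block from the desired memory tier, claim it when it goes into use, and release it once the sequence no longer needs it, optionally marking it as the preferred eviction candidate. The helper names are placeholders; `LRUEvictionPolicy` would be the concrete policy behind the reference.

```cpp
// Sketch: how a block manager is expected to drive BaseEvictionPolicy.
#include "tensorrt_llm/batch_manager/evictionPolicy.h"

#include <utility>

using namespace tensorrt_llm::batch_manager::eviction_policy;

BlockPtr allocatePrimaryBlock(BaseEvictionPolicy& policy)
{
    // The policy decides which free primary (GPU) block is least likely to be reused.
    auto block = policy.getFreePrimaryBlock();

    // Claiming tells the policy the block is in use and no longer an eviction candidate.
    policy.claimBlock(*block);
    return block;
}

void releaseWhenDone(BaseEvictionPolicy& policy, BlockPtr block, bool evictSoon)
{
    // toFront = true puts the block at the front of the eviction order.
    policy.releaseBlock(std::move(block), /*toFront=*/evictSoon);
}
```
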
diff --git a/cpp/include/tensorrt_llm/batch_manager/kvCacheConfig.h b/cpp/include/tensorrt_llm/batch_manager/kvCacheConfig.h
index 0aa80adfe..b7295650a 100644
--- a/cpp/include/tensorrt_llm/batch_manager/kvCacheConfig.h
+++ b/cpp/include/tensorrt_llm/batch_manager/kvCacheConfig.h
@@ -41,7 +41,8 @@ class KvCacheConfig
std::optional<std::vector<SizeType32>> maxAttentionWindowVec = std::nullopt,
std::optional<SizeType32> sinkTokenLength = std::nullopt,
std::optional<float> freeGpuMemoryFraction = std::nullopt, bool enableBlockReuse = false, bool useUvm = false,
- std::optional<size_t> hostCacheSize = std::nullopt, bool onboardBlocks = true)
+ std::optional<size_t> hostCacheSize = std::nullopt, bool onboardBlocks = true,
+ std::optional<float> crossKvCacheFraction = std::nullopt)
: maxTokens{maxTokens}
, maxAttentionWindowVec{maxAttentionWindowVec}
, sinkTokenLength{sinkTokenLength}
@@ -50,6 +51,7 @@ class KvCacheConfig
, useUvm(useUvm)
, hostCacheSize(hostCacheSize)
, onboardBlocks(onboardBlocks)
+ , crossKvCacheFraction{crossKvCacheFraction}
{
}
@@ -57,7 +59,7 @@ class KvCacheConfig
: KvCacheConfig(kvCacheConfig.getMaxTokens(), kvCacheConfig.getMaxAttentionWindowVec(),
kvCacheConfig.getSinkTokenLength(), kvCacheConfig.getFreeGpuMemoryFraction(),
kvCacheConfig.getEnableBlockReuse(), false, kvCacheConfig.getHostCacheSize(),
- kvCacheConfig.getOnboardBlocks())
+ kvCacheConfig.getOnboardBlocks(), kvCacheConfig.getCrossKvCacheFraction())
{
}
@@ -66,7 +68,8 @@ class KvCacheConfig
return maxTokens == other.maxTokens && maxAttentionWindowVec == other.maxAttentionWindowVec
&& sinkTokenLength == other.sinkTokenLength && freeGpuMemoryFraction == other.freeGpuMemoryFraction
&& enableBlockReuse == other.enableBlockReuse && useUvm == other.useUvm
- && hostCacheSize == other.hostCacheSize && onboardBlocks == other.onboardBlocks;
+ && hostCacheSize == other.hostCacheSize && onboardBlocks == other.onboardBlocks
+ && crossKvCacheFraction == other.crossKvCacheFraction;
}
friend std::ostream& operator<<(std::ostream& os, KvCacheConfig const& self);
@@ -80,5 +83,7 @@ class KvCacheConfig
bool useUvm;
std::optional<size_t> hostCacheSize;
bool onboardBlocks;
+ // Cross attention will use crossKvCacheFraction of the KV cache and self attention will use the rest.
+ std::optional<float> crossKvCacheFraction;
};
} // namespace tensorrt_llm::batch_manager::kv_cache_manager
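
A small sketch of how an encoder-decoder setup might fill in the new field. The values are illustrative; fields left untouched keep their defaults, and the public members are assigned directly, just as the benchmark does with `optionalParams.kvCacheConfig` earlier in this diff.

```cpp
// Sketch: splitting the KV-cache budget between self and cross attention.
#include "tensorrt_llm/batch_manager/kvCacheConfig.h"

using tensorrt_llm::batch_manager::kv_cache_manager::KvCacheConfig;

KvCacheConfig makeEncDecKvCacheConfig()
{
    KvCacheConfig config;
    config.freeGpuMemoryFraction = 0.9f; // use up to 90% of free GPU memory for the KV cache
    config.enableBlockReuse = false;
    // 50% of that budget goes to the cross-attention cache, the rest to self attention.
    config.crossKvCacheFraction = 0.5f;
    return config;
}
```
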
diff --git a/cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h b/cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h
index 38b49bd23..cc7aa9374 100644
--- a/cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h
+++ b/cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h
@@ -22,6 +22,7 @@
#include "tensorrt_llm/runtime/bufferManager.h"
#include "tensorrt_llm/runtime/common.h"
#include "tensorrt_llm/runtime/cudaStream.h"
+#include "tensorrt_llm/runtime/iBuffer.h"
#include "tensorrt_llm/runtime/iTensor.h"
#include "tensorrt_llm/runtime/modelConfig.h"
#include "tensorrt_llm/runtime/worldConfig.h"
@@ -29,13 +30,18 @@
#include
#include
-#include
+#include
#include
#include
#include
#include
#include
+namespace tensorrt_llm::batch_manager::eviction_policy
+{
+class BaseEvictionPolicy;
+}
+
namespace tensorrt_llm::batch_manager::kv_cache_manager
{
@@ -124,6 +130,8 @@ class KVCacheBlock
[[nodiscard]] IdType getBlockId() const;
+ [[nodiscard]] NextBlockMap getNextBlocks() const;
+
[[nodiscard]] kernels::KVCacheIndex::UnderlyingType getMemoryPoolBlockIndex() const;
[[nodiscard]] bool isPrimary() const;
@@ -144,22 +152,12 @@ class KVCacheBlock
[[nodiscard]] VecUniqueTokens const& getUniqueTokens() const;
- void setFreeBlockIterator(FreeBlocksQueue::iterator freeBlockIterator);
-
- void resetFreeBlockIterator();
-
- [[nodiscard]] std::optional const& getFreeBlockIterator() const;
-
void setPrevBlock(BlockPtr prevBlock);
void addNextBlock(BlockKey const& blockKey, BlockPtr block);
void removeNextBlock(BlockKey const& blockKey);
- static std::shared_ptr findBestGPUBlockToFree(std::shared_ptr searchStart);
-
- static std::shared_ptr findLeafBlock(std::shared_ptr searchStart);
-
[[nodiscard]] BlockPtr findMatchingBlock(BlockKey const& blockKey) const;
//! \brief Free block from previous block if present.
@@ -203,14 +201,21 @@ class GenerationRequest
{
public:
using SizeType32 = tensorrt_llm::runtime::SizeType32;
- using SharedPtr = std::shared_ptr;
- explicit GenerationRequest(SizeType32 seqSlotIdx, SizeType32 numTokens, SizeType32 beamWidth)
- : mSeqSlotIdx(seqSlotIdx)
+ explicit GenerationRequest(LlmRequest::RequestIdType requestId, SizeType32 numTokens, SizeType32 beamWidth,
+ SizeType32 maxBlocks, SizeType32 numPools = 1)
+ : mRequestId(requestId)
, mNumTokens(numTokens)
, mBeamWidth(beamWidth)
, mCacheBlockIds(beamWidth)
+ , mCacheBlockIndices{
+ runtime::BufferManager::cpu(runtime::ITensor::makeShape({numPools, beamWidth, 2, maxBlocks}),
+ runtime::TRTDataType<tensorrt_llm::kernels::KVCacheIndex>::value)}
{
+ auto cacheBlockIdsRange = runtime::BufferRange<tensorrt_llm::kernels::KVCacheIndex>(*mCacheBlockIndices);
+ std::fill(cacheBlockIdsRange.begin(), cacheBlockIdsRange.end(),
+ tensorrt_llm::kernels::KVCacheIndex{
+ std::numeric_limits<tensorrt_llm::kernels::KVCacheIndex::UnderlyingType>::max()});
}
void addNewTokens(SizeType32 n)
@@ -225,9 +230,9 @@ class GenerationRequest
mNumTokens -= n;
}
- [[nodiscard]] SizeType32 getSequenceSlotIdx() const
+ [[nodiscard]] LlmRequest::RequestIdType getRequestId() const
{
- return mSeqSlotIdx;
+ return mRequestId;
}
[[nodiscard]] SizeType32 getNumTokens() const
@@ -245,6 +250,16 @@ class GenerationRequest
return mCacheBlockIds;
}
+ [[nodiscard]] runtime::ITensor& getCacheBlockIndices()
+ {
+ return *mCacheBlockIndices;
+ }
+
+ [[nodiscard]] runtime::ITensor const& getCacheBlockIndices() const
+ {
+ return *mCacheBlockIndices;
+ }
+
void addCacheBlock(SizeType32 beamIdx, KVCacheBlock::IdType blockId)
{
mCacheBlockIds.at(beamIdx).push_back(blockId);
@@ -272,37 +287,64 @@ class GenerationRequest
}
private:
- // Slot id of the sequence
- SizeType32 mSeqSlotIdx;
+ // Request id of the sequence
+ LlmRequest::RequestIdType mRequestId;
// Current number of generated tokens
SizeType32 mNumTokens;
// Number of beams
SizeType32 mBeamWidth;
- // List of blocks allocated for each beam of the sequence
+ // List of block ids allocated for each beam of the sequence
std::vector<std::vector<KVCacheBlock::IdType>> mCacheBlockIds;
+ // Tensor of block indices allocated for each beam of the sequence
+ runtime::ITensor::SharedPtr mCacheBlockIndices;
};
-// BlockManager manages overall metadata of KVCacheBlocks in a layer of the
-// network. Layers are expected to be symmetric, so the metadata can be
-// reused for all layers of the network.
-// The array of cache blocks for a layer is called a pool.
-// Each pool has shape [max_blocks, 2, num_heads, tokens_per_block, head_size].
-// Size per block and number of blocks per pool are pre-determined and set in
-// constructor. These should not be changed after.
-// Block shape is [2, num_heads, tokens_per_block, head_size].
+// attach metadata to a pool pointer
+class KVCacheBlockPool
+{
+public:
+ SizeType32 numKvHeads;
+ SizeType32 numLayers;
+ SizeType32 blockSize;
+
+ // Memory pools. Primary is fast memory, secondary is slower memory used for offloading.
+ runtime::ITensor::SharedPtr primaryPtr;
+ runtime::ITensor::SharedPtr secondaryPtr;
+
+ KVCacheBlockPool(SizeType32 numKvHeads, SizeType32 numLayers, SizeType32 blockSize,
+ runtime::ITensor::SharedPtr primaryPtr = nullptr, runtime::ITensor::SharedPtr secondaryPtr = nullptr)
+ : numKvHeads(numKvHeads)
+ , numLayers(numLayers)
+ , blockSize(blockSize)
+ , primaryPtr(std::move(primaryPtr))
+ , secondaryPtr(std::move(secondaryPtr))
+ {
+ }
+};
+
+// The BlockManager manages the metadata of KVCacheBlocks.
+// It manages multiple arrays of cache blocks called pools.
+// Layers with the same number of kv heads are grouped under the same pool.
+// Each pool has shape [max_blocks, num_layers, 2, num_kv_heads, tokens_per_block, head_size], where num_layers refers
+// to the number of layers with the same num_kv_heads that share that pool.
+// The metadata of KVCacheBlocks is shared between layers, so each block spans all of the managed pools - an allocated
+// block matches some chunk of memory in each pool. The shape of the chunk in every pool is [2, num_kv_heads,
+// tokens_per_block, head_size]. The size per block and number of blocks are pre-determined and set in the constructor.
// BlockManager maintains a list of free blocks at any time.
// Alloc pops off the block at the front, and Free pushes it back to the vector.
-// BlockManager maintains a vector of lists of seqSlotIdx to allocated blocks
+// BlockManager maintains a vector of lists of request ids to allocated blocks
// per sequence. This can be used to Free all blocks belonging to a sequence.
class BlockManager
{
public:
using SizeType32 = tensorrt_llm::runtime::SizeType32;
using CacheType = tensorrt_llm::batch_manager::kv_cache_manager::CacheType;
+ using BaseEvictionPolicy = tensorrt_llm::batch_manager::eviction_policy::BaseEvictionPolicy;
- explicit BlockManager(SizeType32 numLayers, SizeType32 numKvHeads, SizeType32 sizePerHead,
+ explicit BlockManager(std::vector<SizeType32> const& numKvHeadsPerLayer, SizeType32 sizePerHead,
SizeType32 tokensPerBlock, SizeType32 blocksInPrimaryPool, SizeType32 blocksInSecondaryPool,
- std::shared_ptr<runtime::CudaStream> stream, bool onboardBlocks, CacheType cacheType = CacheType::kSELF);
+ SizeType32 maxNumSequences, std::shared_ptr<runtime::CudaStream> stream, bool onboardBlocks,
+ CacheType cacheType = CacheType::kSELF);
~BlockManager();
@@ -317,10 +359,6 @@ class BlockManager
//! \brief Assign blocks for new sequence. Does not try to reuse blocks.
void addSequence(GenerationRequest& sequence, SizeType32 numBlocks, SizeType32 unsharedBlockIdx);
- //! \brief Release block, which puts it back onto free blocks queue.
- //! \details Block appended by default, will be put at front if toFront is true.
- void releaseBlock(std::shared_ptr block, bool toFront = false);
-
//! \brief Allocate new block for each beam of the sequence.
//! \details Might free cached blocks if no free blocks are available.
void allocateBlock(GenerationRequest& sequence, bool shareAmongBeams = false);
@@ -336,10 +374,7 @@ class BlockManager
//! \brief Release last block in the sequence
void releaseLastBlock(GenerationRequest& sequence);
- [[nodiscard]] SizeType32 getNumFreeBlocks() const noexcept
- {
- return mFreePrimaryBlocks.size();
- }
+ [[nodiscard]] SizeType32 getNumFreeBlocks() const noexcept;
[[nodiscard]] SizeType32 getNumAllocTotalBlocks() const
{
@@ -381,21 +416,26 @@ class BlockManager
return mTokensPerBlock;
}
- //! \brief Get size of one K/V cache block in one layer.
- //! @details Volume of [numKvHeads, tokensPerBlock, sizePerHead]
- [[nodiscard]] SizeType32 getBlockSize() const
+ //! \brief Get size of one K/V cache block in one layer for the specified pool.
+ //! @details Volume of [numKvHeads, tokensPerBlock, sizePerHead] in the specified pool.
+ [[nodiscard]] SizeType32 getBlockSize(SizeType32 poolIdx) const
{
- return mBlockSize;
+ return mPools.at(poolIdx).blockSize;
}
- [[nodiscard]] runtime::ITensor::SharedPtr getPrimaryPool() const noexcept
+ [[nodiscard]] SizeType32 getNumPools() const noexcept
{
- return mPrimaryPool;
+ return mPools.size();
}
- [[nodiscard]] runtime::ITensor::SharedPtr getSecondaryPool() const noexcept
+ [[nodiscard]] runtime::ITensor::SharedPtr getPrimaryPool(SizeType32 poolIdx) const
{
- return mSecondaryPool;
+ return mPools.at(poolIdx).primaryPtr;
+ }
+
+ [[nodiscard]] runtime::ITensor::SharedPtr getSecondaryPool(SizeType32 poolIdx) const
+ {
+ return mPools.at(poolIdx).secondaryPtr;
}
[[nodiscard]] SizeType32 getNumLayers() const
@@ -403,10 +443,32 @@ class BlockManager
return mNumLayers;
}
+ [[nodiscard]] SizeType32 getNumPrimaryBlocks() const
+ {
+ return mNumPrimaryBlocks;
+ }
+
+ [[nodiscard]] SizeType32 getNumSecondaryBlocks() const
+ {
+ return mNumSecondaryBlocks;
+ }
+
+ [[nodiscard]] CacheType getCacheType() const
+ {
+ return mCacheType;
+ }
+
+ [[nodiscard]] SizeType32 getLayerPoolIdx(SizeType32 layerIdx) const
+ {
+ return mLayerToPool.at(layerIdx);
+ }
+
//! \brief Get index in pool to K or V block.
//! \param blockId the blockId as returned by getBlockId()
//! \param fieldIdx either 0 (K) or 1 (V),
- [[nodiscard]] kernels::KVCacheIndex getKOrVBlockIndex(KVCacheBlock::IdType blockId, SizeType32 fieldIdx) const;
+ //! \param poolIdx the index of the pool for which the index is calculated (each pool has different strides)
+ [[nodiscard]] kernels::KVCacheIndex getKOrVBlockIndex(
+ KVCacheBlock::IdType blockId, SizeType32 fieldIdx, SizeType32 poolIdx) const;
//! \brief Bring offloaded block from secondary to primary memory.
//! \details Does nothing if block is already in primary memory.
@@ -417,6 +479,11 @@ class BlockManager
BlockKey findNewContextBlock(
VecUniqueTokens const& uniqueTokens, std::shared_ptr<LlmRequest> const& llmRequest) const;
+ [[nodiscard]] runtime::BufferManager const& getBufferManager() const
+ {
+ return mBufferManager;
+ }
+
private:
//! \brief Add single block to beam of sequence and mAllocatedBlocksPerSeq.
void addBlockToBeam(BlockPtr& block, GenerationRequest& sequence, SizeType32 beamIdx);
@@ -436,22 +503,15 @@ class BlockManager
SizeType32 loadOrAllocateBlocks(
std::list<BlockKey> const& blockKeys, SizeType32 numContextBlocks, GenerationRequest& sequence);
- //! \brief Find best primary block to free.
- //! \details The best primary block to free is the primary block that appears first in the queue and have no primary
- //! block descendants
- [[nodiscard]] std::shared_ptr findBestGPUBlockToFree();
-
//! \brief Find block least likely to be reused, free it if necessary and return.
[[nodiscard]] BlockPtr getFreeBlock();
- //! \brief Claim block if it is in free blocks list.
- void claimBlock(KVCacheBlock& block);
-
//! \brief Free block from previous block and claim it from free blocks list.
void claimLeafBlock(KVCacheBlock& block);
//! \brief Compute pointer to raw KV block (K & V, all layers).
- [[nodiscard]] runtime::ITensor::SharedPtr computeBlockPointer(std::shared_ptr<KVCacheBlock> block) const;
+ [[nodiscard]] runtime::ITensor::SharedPtr computeBlockPointer(
+ std::shared_ptr<KVCacheBlock> block, SizeType32 poolIdx) const;
//! \brief Copy content of src block to dst.
void copyBlock(BlockPtr src, BlockPtr dst);
@@ -460,23 +520,24 @@ class BlockManager
// Number of blocks in pools
SizeType32 mNumPrimaryBlocks;
SizeType32 mNumSecondaryBlocks;
- // List of free blocks. Blocks are either backed by fast primary memory or slow secondary memory,
- // we maintain separate queues for these.
- FreeBlocksQueue mFreePrimaryBlocks;
- FreeBlocksQueue mFreeSecondaryBlocks;
+
// List of allocated blocks for each sequences
- std::vector> mAllocatedBlocksPerSeq;
- // Memory pools. Primary is fast memory, secondary is slower memory used for offloading.
- runtime::ITensor::SharedPtr mPrimaryPool;
- runtime::ITensor::SharedPtr mSecondaryPool;
+ std::unordered_map<LlmRequest::RequestIdType, std::vector<BlockPtr>> mAllocatedBlocksPerSeq;
+
+ // Pool per unique numKvHeads in the model
+ std::vector mPools;
+ // Matching of model layers to their pools
+ std::vector mLayerToPool;
+
// Whether offloaded blocks should be onboarded before reuse.
bool mOnboardBlocks;
// Buffer manager
runtime::BufferManager mBufferManager;
+
+ // Size of a single KV head
+ SizeType32 mSizePerHead;
// Number of layers
SizeType32 mNumLayers;
- // Volume of [numKvHeads, tokensPerBlock, sizePerHead]
- SizeType32 mBlockSize;
// Used to keep track of number of free blocks during scheduling
SizeType32 mSchedulingNumFreeBlocks;
// Number of tokens per one block
@@ -489,6 +550,8 @@ class BlockManager
std::size_t mAllocTotalBlocks, mAllocNewBlocks, mReusedBlocks;
// KV cache type (self or cross)
CacheType mCacheType;
+ // Eviction Policy
+ std::shared_ptr<BaseEvictionPolicy> mEvictionPolicy;
private:
friend class KVCacheManager;
@@ -497,17 +560,24 @@ class BlockManager
class KVCacheManager
{
public:
+ friend class KVCacheManagerBindings;
+
using SizeType32 = tensorrt_llm::runtime::SizeType32;
- using SequencesPtr = GenerationRequest::SharedPtr;
using CudaStreamPtr = std::shared_ptr<runtime::CudaStream>;
using CacheType = tensorrt_llm::batch_manager::kv_cache_manager::CacheType;
- KVCacheManager(SizeType32 numLayers, SizeType32 numKvHeads, SizeType32 sizePerHead, SizeType32 tokensPerBlock,
+ KVCacheManager(std::vector<SizeType32> const& numKvHeadsPerLayer, SizeType32 sizePerHead, SizeType32 tokensPerBlock,
SizeType32 blocksInPrimaryPool, SizeType32 blocksInSecondaryPool, SizeType32 maxNumSequences,
SizeType32 maxBeamWidth, SizeType32 maxAttentionWindow, SizeType32 sinkTokenLength, bool useOneMoreBlock,
CudaStreamPtr stream, bool enableBlockReuse = false, bool onboardBlocks = true,
CacheType cacheType = CacheType::kSELF);
+ KVCacheManager(SizeType32 numLayers, SizeType32 numKvHeads, SizeType32 sizePerHead, SizeType32 tokensPerBlock,
+ SizeType32 blocksInPrimaryPool, SizeType32 blocksInSecondaryPool, SizeType32 maxNumSequences,
+ SizeType32 maxBeamWidth, SizeType32 maxAttentionWindow, SizeType32 sinkTokenLength, bool useOneMoreBlock,
+ CudaStreamPtr stream, bool enableBlockReuse = true, bool onboardBlocks = true,
+ CacheType cacheType = CacheType::kSELF);
+
void allocatePools(nvinfer1::DataType dtype, bool useUvm = false);
void startScheduling();
@@ -583,10 +653,10 @@ class KVCacheManager
/// @return The number of blocks
[[nodiscard]] SizeType32 getRemainingBlocksToCompletion(LlmRequest const& req) const;
- void addContextTokens(SizeType32 seqSlotIdx, SizeType32 numTokens);
+ void addContextTokens(LlmRequest::RequestIdType requestId, SizeType32 numTokens);
- /// @brief Increase size for request at seqSlotIdx. Allocate new KV cache block(s) if needed.
- void addToken(SizeType32 seqSlotIdx);
+ /// @brief Increase size for request with requestId. Allocate new KV cache block(s) if needed.
+ void addToken(LlmRequest::RequestIdType requestId);
/// @brief Add new request to the KV cache manager.
/// @param inputLength Input length for which KV cache need to be allocated.
@@ -594,34 +664,40 @@ class KVCacheManager
/// @param llmRequest Optional request to use for KV cache lookup.
/// @details If llmRequest is supplied and KV cache reuse is enabled, try to recover KV cache blocks for
/// inputLength - 1 tokens and populate prepopulatedPromptLen.
- void addSequence(SizeType32 seqSlotIdx, SizeType32 inputLength, SizeType32 beamWidth,
+ void addSequence(LlmRequest::RequestIdType requestId, SizeType32 inputLength, SizeType32 beamWidth,
std::shared_ptr<LlmRequest> const& llmRequest = nullptr);
- void removeSequence(SizeType32 seqSlotIdx, std::shared_ptr<LlmRequest> const& llmRequest = nullptr);
+ void removeSequence(LlmRequest::RequestIdType requestId, std::shared_ptr<LlmRequest> const& llmRequest = nullptr);
- void schedulingRemoveSequence(SizeType32 seqSlotIdx);
+ void schedulingRemoveSequence(LlmRequest::RequestIdType requestId);
- [[nodiscard]] runtime::ITensor::UniquePtr getBlockPoolPointers() const;
+ [[nodiscard]] runtime::ITensor::SharedPtr getBlockPoolPointers() const
+ {
+ return mBlockPoolPointers;
+ }
+
+ [[nodiscard]] runtime::ITensor::SharedPtr getLayerToPoolMapping() const
+ {
+ return mLayerToPoolMapping;
+ }
void getBlockOffsetsOfBatch(
runtime::ITensor& output, SizeType32 firstBatchSlotIdx, SizeType32 batchSize, SizeType32 beamWidth) const;
//! @return maxBlockCount of all beams
SizeType32 copyBlockOffsets(
- runtime::ITensor& output, SizeType32 outputSlotOffset, SizeType32 seqSlotIdx, SizeType32 beamWidth) const;
-
- // Volume of [2, numKvHeads, tokensPerBlock, sizePerHead]
- [[nodiscard]] static SizeType32 constexpr calculatePageSize(tensorrt_llm::runtime::ModelConfig const& modelConfig)
- {
- return 2 * modelConfig.getNbKvHeads() * modelConfig.getTokensPerBlock() * modelConfig.getSizePerHead();
- }
+ runtime::ITensor& output, SizeType32 outputSlotOffset, LlmRequest::RequestIdType requestId) const;
- // numLayers * 2 * numKvHeads * sizePerHead
- [[nodiscard]] static SizeType32 constexpr calculateCacheSizePerToken(
+ // Sum of numLayers * 2 * numKvHeads * sizePerHead for each pool
+ [[nodiscard]] static SizeType32 calculateCacheSizePerToken(
tensorrt_llm::runtime::ModelConfig const& modelConfig, tensorrt_llm::runtime::WorldConfig const& worldConfig)
{
- return modelConfig.getNbAttentionLayers(worldConfig.getPipelineParallelism()) * 2 * modelConfig.getNbKvHeads()
- * modelConfig.getSizePerHead();
+ // NOTE: We expect the initialization of modelConfig to have already taken the tp size into account and do not
+ // address it here
+ // consider only local layers for the calculation
+ return modelConfig.getSumLocalKvHeads(
+ worldConfig.getPipelineParallelism(), worldConfig.getPipelineParallelRank())
+ * 2 * modelConfig.getSizePerHead();
}
[[nodiscard]] static std::tuple const calculateMaxNumBlocks(KvCacheConfig const& config,
@@ -633,14 +709,14 @@ class KVCacheManager
return mEnableBlockReuse;
}
- void removeToken(SizeType32 seqSlotIdx);
- void rewindKVCache(SizeType32 seqSlotIdx, SizeType32 rewindLengths);
+ void removeToken(LlmRequest::RequestIdType requestId);
+ void rewindKVCache(LlmRequest::RequestIdType requestId, SizeType32 rewindLengths);
- [[nodiscard]] GenerationRequest const& getSequence(SizeType32 seqSlotIdx) const;
+ [[nodiscard]] GenerationRequest const& getSequence(LlmRequest::RequestIdType requestId) const;
[[nodiscard]] bool isCrossKv() const
{
- return mCacheType == CacheType::kCROSS;
+ return mBlockManager.getCacheType() == CacheType::kCROSS;
}
//! \brief Find first new block that must be allocated for context phase and return it's concatenated token vector.
@@ -650,7 +726,7 @@ class KVCacheManager
//! \brief Store full context blocks contributed by llmRequest.
//! \details These blocks become reusable from next step.
- void storeContextBlocks(SizeType32 seqSlotIdx, std::shared_ptr const& llmRequest);
+ void storeContextBlocks(std::shared_ptr<LlmRequest> const& llmRequest);
[[nodiscard]] static SizeType32 getSinkBubbleLength(SizeType32 sinkTokenLen, SizeType32 tokensPerBlock);
@@ -658,14 +734,13 @@ class KVCacheManager
SizeType32 tokensPerBlock, SizeType32 maxBeamWidth, SizeType32 sinkTokenLen, bool useOneMoreBlock);
private:
- void setOffsets(kernels::KVCacheIndex* offsetsPtr, nvinfer1::Dims const& offsetsShape, SizeType32 seqSlotIdx,
- SizeType32 beamIdx, SizeType32 blockIdx, KVCacheBlock::IdType blockId) const;
+ void setOffsets(kernels::KVCacheIndex* offsetsPtr, nvinfer1::Dims const& offsetsShape, SizeType32 beamIdx,
+ SizeType32 blockIdx, KVCacheBlock::IdType blockId) const;
- void resetBlockOffsets(SizeType32 seqSlotIdx, SizeType32 beamWidth);
- void cacheBlockOffsets(GenerationRequest const& seq, SizeType32 seqSlotIdx);
- void cacheNewBlockOffsets(GenerationRequest const& seq, SizeType32 seqSlotIdx);
- void updateNewBlockPointer(GenerationRequest const& seq, SizeType32 seqSlotIdx, SizeType32 blockIdx);
- void updateToken(SizeType32 seqSlotIdx, bool addToken);
+ void cacheBlockOffsets(GenerationRequest& seq);
+ void cacheNewBlockOffsets(GenerationRequest& seq);
+ void updateNewBlockPointer(GenerationRequest& seq, SizeType32 blockIdx);
+ void updateToken(GenerationRequest& sequence, bool addToken);
private:
// Maximum number of sequences
@@ -685,14 +760,13 @@ class KVCacheManager
SizeType32 mSinkBlockTokenLength;
// Block manager
BlockManager mBlockManager;
- // List of all sequences
- std::vector mSequences;
- // buffer for block indices for all managed sequences
- runtime::ITensor::SharedPtr mSequenceBlockIndices;
+ // Map of all sequences
+ std::unordered_map<LlmRequest::RequestIdType, GenerationRequest> mSequences;
// Whether to cache KV pages for reuse
bool mEnableBlockReuse;
- // KV cache type (self or cross)
- CacheType mCacheType;
+ // buffers for static tensors, will be created after allocating pools
+ runtime::ITensor::SharedPtr mBlockPoolPointers;
+ runtime::ITensor::SharedPtr mLayerToPoolMapping;
};
} // namespace tensorrt_llm::batch_manager::kv_cache_manager
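
To make the pool layout described in the BlockManager comments concrete, here is a standalone sketch (explanatory only, not the library's code) of grouping layers by KV-head count into pools and of the per-layer block size of a pool:

```cpp
// Sketch: layers with the same number of KV heads share one pool (cf. mPools / mLayerToPool).
#include <cstdint>
#include <unordered_map>
#include <vector>

using SizeType32 = std::int32_t;

struct PoolLayout
{
    std::vector<SizeType32> poolNumKvHeads; // one entry per pool
    std::vector<SizeType32> layerToPool;    // pool index for every layer
};

PoolLayout groupLayersIntoPools(std::vector<SizeType32> const& numKvHeadsPerLayer)
{
    PoolLayout layout;
    std::unordered_map<SizeType32, SizeType32> headsToPool;
    for (auto numKvHeads : numKvHeadsPerLayer)
    {
        // Each distinct KV-head count gets its own pool; every layer records its pool index.
        auto [it, inserted]
            = headsToPool.try_emplace(numKvHeads, static_cast<SizeType32>(layout.poolNumKvHeads.size()));
        if (inserted)
        {
            layout.poolNumKvHeads.push_back(numKvHeads);
        }
        layout.layerToPool.push_back(it->second);
    }
    return layout;
}

// Per-layer block size of a pool: volume of [numKvHeads, tokensPerBlock, sizePerHead].
SizeType32 blockSize(SizeType32 numKvHeads, SizeType32 tokensPerBlock, SizeType32 sizePerHead)
{
    return numKvHeads * tokensPerBlock * sizePerHead;
}
```
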
diff --git a/cpp/include/tensorrt_llm/batch_manager/kvCacheUtils.h b/cpp/include/tensorrt_llm/batch_manager/kvCacheUtils.h
index 81b91e24a..69ca1963b 100644
--- a/cpp/include/tensorrt_llm/batch_manager/kvCacheUtils.h
+++ b/cpp/include/tensorrt_llm/batch_manager/kvCacheUtils.h
@@ -65,6 +65,11 @@ class BlockIterator
return ret;
}
+ operator runtime::ITensor::SharedPtr()
+ {
+ return mCurrent;
+ }
+
[[nodiscard]] bool operator==(BlockIterator const& other) const
{
return mIdx == other.mIdx && mPool.get() == other.mPool.get();
@@ -91,9 +96,9 @@ class BlockIterator
};
[[nodiscard]] BlockIterator getBlockBeginIt(
- KVCacheManager const& cacheManager, LlmRequest const& request, SizeType32 beam);
+ KVCacheManager const& cacheManager, LlmRequest const& request, SizeType32 beam, SizeType32 poolIdx);
[[nodiscard]] BlockIterator getBlockEndIt(
- KVCacheManager const& cacheManager, LlmRequest const& request, SizeType32 beam);
+ KVCacheManager const& cacheManager, LlmRequest const& request, SizeType32 beam, SizeType32 poolIdx);
} // namespace tensorrt_llm::batch_manager::kv_cache_manager
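
With the pool index added to `getBlockBeginIt`/`getBlockEndIt`, callers walk a request's blocks pool by pool. A sketch using only what the excerpt above declares (the `ITensor::SharedPtr` conversion, `operator==`, and the iterator increment); `numPools` is supplied by the caller here because the lookup for it is not shown in this header:

```cpp
// Sketch: collecting the KV-cache block views of one request, per pool.
#include "tensorrt_llm/batch_manager/kvCacheManager.h"
#include "tensorrt_llm/batch_manager/kvCacheUtils.h"

#include <vector>

namespace kvcm = tensorrt_llm::batch_manager::kv_cache_manager;
using tensorrt_llm::runtime::SizeType32;

std::vector<tensorrt_llm::runtime::ITensor::SharedPtr> collectBlocks(kvcm::KVCacheManager const& cacheManager,
    tensorrt_llm::batch_manager::LlmRequest const& request, SizeType32 beam, SizeType32 numPools)
{
    std::vector<tensorrt_llm::runtime::ITensor::SharedPtr> blocks;
    for (SizeType32 poolIdx = 0; poolIdx < numPools; ++poolIdx)
    {
        auto it = kvcm::getBlockBeginIt(cacheManager, request, beam, poolIdx);
        auto const end = kvcm::getBlockEndIt(cacheManager, request, beam, poolIdx);
        while (!(it == end))
        {
            blocks.push_back(it); // uses the new ITensor::SharedPtr conversion operator
            it++;
        }
    }
    return blocks;
}
```
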
diff --git a/cpp/include/tensorrt_llm/batch_manager/llmRequest.h b/cpp/include/tensorrt_llm/batch_manager/llmRequest.h
index 0124592e8..475970b7b 100644
--- a/cpp/include/tensorrt_llm/batch_manager/llmRequest.h
+++ b/cpp/include/tensorrt_llm/batch_manager/llmRequest.h
@@ -26,6 +26,7 @@
#include "tensorrt_llm/runtime/samplingConfig.h"
#include
+#include
#include
#include
#include
@@ -39,24 +40,22 @@ namespace tensorrt_llm::batch_manager
* @brief The state of the request.
*
* Enum order must follow chronological order for state dependency check, @see hasReachedState().
- *
- * @todo(rkobus): refactor
*/
-enum LlmRequestState_t
+enum class LlmRequestState : int32_t
{
- REQUEST_STATE_UNKNOWN = 0, ///< Unknown state
- REQUEST_STATE_ENCODER_INIT = 1, ///< Encoder phase starts (for encoder-decoder models)
- REQUEST_STATE_CONTEXT_INIT = 2, ///< Context phase starts
- REQUEST_STATE_GENERATION_IN_PROGRESS = 3, ///< Generation phase is in progress
- REQUEST_STATE_GENERATION_TO_COMPLETE = 4, ///< Generation phase is to be completed
- REQUEST_STATE_GENERATION_COMPLETE = 5, ///< Generation phase completed
- REQUEST_STATE_DISAGG_GENERATION_INIT = 6, ///< For disaggregated serving only:
- /// new Generation request arrived at generation model
- REQUEST_STATE_DISAGG_CONTEXT_TRANS_IN_PROGRESS = 7, ///< For disaggregated serving only:
- /// Waiting context-only request transmitting the kv cache
- REQUEST_STATE_DISAGG_CONTEXT_COMPLETE = 8, ///< Context-only request finished kv cache transmission.
- REQUEST_STATE_DISAGG_GENERATION_TRANS_IN_PROGRESS
- = 9, ///< For disaggregated serving only: transmitting the kv cache
+ kUNKNOWN = 0, ///< Unknown state
+ kENCODER_INIT = 1, ///< Encoder phase starts (for encoder-decoder models)
+ kCONTEXT_INIT = 2, ///< Context phase starts
+ kGENERATION_IN_PROGRESS = 3, ///< Generation phase is in progress
+ kGENERATION_TO_COMPLETE = 4, ///< Generation phase is to be completed
+ kGENERATION_COMPLETE = 5, ///< Generation phase completed
+ kDISAGG_GENERATION_INIT = 6, ///< For disaggregated serving only:
+ /// new Generation request arrived at generation model
+ kDISAGG_CONTEXT_TRANS_IN_PROGRESS = 7, ///< For disaggregated serving only:
+ /// Waiting context-only request transmitting the kv cache
+ kDISAGG_CONTEXT_COMPLETE = 8, ///< Context-only request finished kv cache transmission.
+ kDISAGG_GENERATION_TRANS_IN_PROGRESS = 9, ///< For disaggregated serving only: transmitting the kv cache
+ kWAITING_TO_SEND_LOGITS = 10, ///< Generation phase completed, logits not sent yet
};
enum LlmRequestType
@@ -114,7 +113,7 @@ class GenericLlmRequest
, mPromptLen(inputTokens->size())
, mMaxNewTokens(maxNewTokens)
, mSamplingConfig(samplingConfig)
- , mState(REQUEST_STATE_CONTEXT_INIT)
+ , mState(LlmRequestState::kCONTEXT_INIT)
, mEndId(endId)
, mPadId(padId)
, mLogitsPostProcessor(logitsPostProcessor)
@@ -134,8 +133,7 @@ class GenericLlmRequest
, mLoraWeights(std::move(loraWeights))
, mLoraConfig(std::move(loraConfig))
, mLookaheadConfig(std::move(lookaheadConfig))
- , mContextChunkSize(std::nullopt)
- , mContextCurrentPosition(0)
+ , mContextChunkSize{mPromptLen}
, mLogProbs(samplingConfig.beamWidth)
, mCumLogProbs(samplingConfig.beamWidth)
, mDraftTokens(draftTokens.value_or(std::make_shared()))
@@ -159,7 +157,7 @@ class GenericLlmRequest
{
if (mEncoderTokens.has_value() || encoderInputFeatures.has_value())
{
- mState = REQUEST_STATE_ENCODER_INIT;
+ mState = LlmRequestState::kENCODER_INIT;
}
initialize(*inputTokens, returnLogProbs);
@@ -170,7 +168,7 @@ class GenericLlmRequest
, mPromptLen(req.getInputTokenIds().size())
, mMaxNewTokens(req.getMaxTokens())
, mSamplingConfig(req.getSamplingConfig(), req.getExternalDraftTokensConfig())
- , mState(REQUEST_STATE_CONTEXT_INIT)
+ , mState(LlmRequestState::kCONTEXT_INIT)
, mEndId(req.getEndId())
, mPadId(req.getPadId())
, mClientId(req.getClientId())
@@ -188,8 +186,7 @@ class GenericLlmRequest
, mLoraWeights(std::nullopt)
, mLoraConfig(std::nullopt)
, mLookaheadConfig(std::nullopt)
- , mContextChunkSize(std::nullopt)
- , mContextCurrentPosition(0)
+ , mContextChunkSize{mPromptLen}
, mLogProbs(mSamplingConfig.beamWidth)
, mCumLogProbs(mSamplingConfig.beamWidth)
, mDraftTokens(std::make_shared())
@@ -212,7 +209,7 @@ class GenericLlmRequest
{
if (req.getRequestType() == executor::RequestType::REQUEST_TYPE_GENERATION_ONLY)
{
- mState = REQUEST_STATE_DISAGG_GENERATION_INIT;
+ mState = LlmRequestState::kDISAGG_GENERATION_INIT;
}
if (mIsStreaming && mSamplingConfig.beamWidth > 1 && !mReturnAllGeneratedTokens)
{
@@ -236,7 +233,7 @@ class GenericLlmRequest
if (req.getEncoderInputTokenIds().has_value() || req.getEncoderInputFeatures().has_value())
{
- mState = REQUEST_STATE_ENCODER_INIT;
+ mState = LlmRequestState::kENCODER_INIT;
if (req.getEncoderInputTokenIds().has_value())
{
mEncoderTokens = std::make_shared(req.getEncoderInputTokenIds().value());
@@ -394,6 +391,15 @@ class GenericLlmRequest
mMaxNewTokens = maxNewTokens;
}
+ if (mNumReturnSequences > 1 && mSamplingConfig.beamWidth > 1)
+ {
+ TLLM_THROW(
+ "Using mNumReturnSequences (%d) > 1 with beam search is currently disabled, since TensorRT-LLM returns "
+ "a total of mNumReturnSequences x beamWidth beams, rather than limiting the number of returned beams "
+ "to mNumReturnSequences. This restriction will be removed once the issue is resolved.",
+ mNumReturnSequences);
+ }
+
TLLM_CHECK_WITH_INFO(mSamplingConfig.validate(), "Incorrect sampling config");
// validate extra ids when enabling kv cache reuse with prompt table
@@ -402,7 +408,8 @@ class GenericLlmRequest
TLLM_CHECK_WITH_INFO(mInputTokenExtraIds.has_value() && mInputTokenExtraIds.value(),
"Input token extra ids must be provided when enabling kv cache reuse with prompt table");
TLLM_CHECK_WITH_INFO(mInputTokenExtraIds.value()->size() == static_cast(mOrigPromptLen),
- "inputTokenExtraIds vector size must be the same as input token vector size.");
+ "inputTokenExtraIds vector size (%lu) must be the same as input token vector size (%lu).",
+ mInputTokenExtraIds.value()->size(), static_cast(mOrigPromptLen));
}
}
@@ -413,7 +420,7 @@ class GenericLlmRequest
/// @brief Get the params of the context
/// @return The params of the context
- std::optional const& getContextPhaseParams() const noexcept
+ [[nodiscard]] std::optional const& getContextPhaseParams() const noexcept
{
return mContextPhaseParams;
}
@@ -425,10 +432,10 @@ class GenericLlmRequest
/// @brief Get the state params of the context
/// @return The state params of the context
- executor::ContextPhaseState const& getContextPhaseState() const
+ [[nodiscard]] executor::DataTransceiverState const& getDataTransceiverState() const
{
TLLM_CHECK(mContextPhaseParams.has_value());
- return *static_cast(mContextPhaseParams.value().getState());
+ return *static_cast(mContextPhaseParams.value().getState());
}
/// @brief Get total number of tokens for this req (prompt + generated)
@@ -661,6 +668,11 @@ class GenericLlmRequest
return mSequenceIndex > 0;
}
+ [[nodiscard]] RequestIdType getParentRequestId() const
+ {
+ return mParentRequestId;
+ }
+
/// @brief Return a vector of the last-generated tokens of shape [num_beams]
[[nodiscard]] VecTokens const& getLastTokens()
{
@@ -715,10 +727,10 @@ class GenericLlmRequest
}
// for enc-dec models, pause means saving generated tokens to prompt but need to re-do encoder phase
- mState = mEncoderTokens.has_value() || mEncoderInputFeatures ? REQUEST_STATE_ENCODER_INIT
- : REQUEST_STATE_CONTEXT_INIT;
+ mState = mEncoderTokens.has_value() || mEncoderInputFeatures ? LlmRequestState::kENCODER_INIT
+ : LlmRequestState::kCONTEXT_INIT;
mContextCurrentPosition = 0;
- mContextChunkSize = std::nullopt;
+ mContextChunkSize = mPromptLen;
mSeqSlot.reset();
}
@@ -860,9 +872,9 @@ class GenericLlmRequest
return mOrigPromptLen;
}
- void setPrepopulatedPromptLen(SizeType32 prepopulatedPromptLen)
+ [[nodiscard]] SizeType32 getPromptLen() const
{
- mPrepopulatedPromptLen = prepopulatedPromptLen;
+ return mPromptLen;
}
[[nodiscard]] SizeType32 getPrepopulatedPromptLen() const
@@ -870,6 +882,37 @@ class GenericLlmRequest
return mPrepopulatedPromptLen;
}
+ void setPrepopulatedPromptLen(SizeType32 prepopulatedPromptLen, SizeType32 kvTokensPerBlock)
+ {
+ auto const promptLen = getPromptLen();
+ TLLM_CHECK(prepopulatedPromptLen < promptLen);
+ mPrepopulatedPromptLen = prepopulatedPromptLen;
+
+ if (prepopulatedPromptLen > 0)
+ {
+ // Currently, the runtime process is to apply for cache first and then determine prepopulation.
+ // Use the prepopulated length to advance the context position and decrease chunk size if necessary.
+ auto chunkSize = getContextChunkSize();
+ if (prepopulatedPromptLen + chunkSize < promptLen)
+ {
+ // make sure to end at block boundary after current chunk
+ auto const flooredEndPosition
+ = (prepopulatedPromptLen + chunkSize) / kvTokensPerBlock * kvTokensPerBlock;
+ chunkSize = flooredEndPosition - prepopulatedPromptLen;
+ TLLM_CHECK(chunkSize <= getContextChunkSize());
+ }
+ setContextCurrentPosition(prepopulatedPromptLen);
+ setContextChunkSize(chunkSize);
+
+ if (!isLastContextChunk())
+ {
+ TLLM_CHECK_WITH_INFO((getContextCurrentPosition() + getContextChunkSize()) % kvTokensPerBlock == 0,
+ "To prevent cache fragmentation, the context position after current chunk should be divisible "
+ "by the number of tokens per block, except for the last chunk.");
+ }
+ }
+ }
+
void setDraftTokens(std::shared_ptr const& draftTokens)
{
mDraftTokens = draftTokens;
@@ -1100,44 +1143,49 @@ class GenericLlmRequest
mGenerationLogitsFragments.clear();
}
- [[nodiscard]] bool hasReachedState(LlmRequestState_t state) const noexcept
+ [[nodiscard]] bool hasReachedState(LlmRequestState state) const noexcept
{
return mState >= state;
}
[[nodiscard]] bool isEncoderInitState() const noexcept
{
- return mState == REQUEST_STATE_ENCODER_INIT;
+ return mState == LlmRequestState::kENCODER_INIT;
}
[[nodiscard]] bool isContextInitState() const noexcept
{
- return mState == REQUEST_STATE_CONTEXT_INIT;
+ return mState == LlmRequestState::kCONTEXT_INIT;
}
[[nodiscard]] bool isGenerationInProgressState() const noexcept
{
- return mState == REQUEST_STATE_GENERATION_IN_PROGRESS || mState == REQUEST_STATE_GENERATION_TO_COMPLETE;
+ return mState == LlmRequestState::kGENERATION_IN_PROGRESS || mState == LlmRequestState::kGENERATION_TO_COMPLETE;
}
[[nodiscard]] bool isGenerationCompleteState() const noexcept
{
- return mState == REQUEST_STATE_GENERATION_COMPLETE;
+ return mState == LlmRequestState::kGENERATION_COMPLETE;
}
[[nodiscard]] bool isDisaggGenerationInitState() const noexcept
{
- return mState == REQUEST_STATE_DISAGG_GENERATION_INIT;
+ return mState == LlmRequestState::kDISAGG_GENERATION_INIT;
}
[[nodiscard]] bool isDisaggContextTransmissionState() const noexcept
{
- return mState == REQUEST_STATE_DISAGG_CONTEXT_TRANS_IN_PROGRESS;
+ return mState == LlmRequestState::kDISAGG_CONTEXT_TRANS_IN_PROGRESS;
}
[[nodiscard]] bool isDisaggContextCompleteState() const noexcept
{
- return mState == REQUEST_STATE_DISAGG_CONTEXT_COMPLETE;
+ return mState == LlmRequestState::kDISAGG_CONTEXT_COMPLETE;
+ }
+
+ [[nodiscard]] bool isCompleteWaitingToSendLogits() const noexcept
+ {
+ return mState == LlmRequestState::kWAITING_TO_SEND_LOGITS;
}
/// To determine whether the context is unchunked. When a context is chunked into only a part, it
@@ -1152,6 +1200,11 @@ class GenericLlmRequest
return mLlmRequestType == LlmRequestType::LLMREQUEST_TYPE_CONTEXT_ONLY;
}
+ [[nodiscard]] bool isGenerationOnlyRequest() const noexcept
+ {
+ return mLlmRequestType == LlmRequestType::LLMREQUEST_TYPE_GENERATION_ONLY;
+ }
+
void setContextCurrentPosition(SizeType32 contextCurrentPosition)
{
mContextCurrentPosition = contextCurrentPosition;
@@ -1170,12 +1223,11 @@ class GenericLlmRequest
return mPromptLen - getContextCurrentPosition();
}
- /// To retrieve the context chunk size, throw an exception when the context is not chunked.
[[nodiscard]] SizeType32 getContextChunkSize() const
{
- TLLM_CHECK_WITH_INFO(
- isContextInitState() && mContextChunkSize, "The current request is not in context chunking state.");
- return mContextChunkSize.value();
+ TLLM_CHECK_WITH_INFO(isContextInitState() || isDisaggGenerationInitState(),
+ "getContextChunkSize is only possible during the context phase.");
+ return mContextChunkSize;
}
/// To set the context chunk size, throw an exception when the chunk size is negative. If the chunk
@@ -1183,45 +1235,34 @@ class GenericLlmRequest
/// remaining length.
void setContextChunkSize(SizeType32 size)
{
- TLLM_CHECK_WITH_INFO(isContextInitState(), "Chunking is only possible during the context phase.");
+ TLLM_CHECK_WITH_INFO(isContextInitState(), "setContextChunkSize is only possible during the context phase.");
TLLM_CHECK_WITH_INFO(size >= 0, "The chunk size of context (%d) can't be negative.", size);
mContextChunkSize = std::min(size, getContextRemainingLength());
}
/// Determines whether the current position is only one chunk away from the end of the context.
- /// It will return true when the context is not chunked.
[[nodiscard]] bool isLastContextChunk() const noexcept
{
- return isFullContextRequest()
- || (isContextInitState() && getContextCurrentPosition() + getContextChunkSize() == mPromptLen);
+ return isDisaggGenerationInitState() || getContextCurrentPosition() + getContextChunkSize() == mPromptLen;
}
- /// Returns whether the position is at the beginning of the context. It will return true when the
- /// context is not chunked.
+ /// Returns whether the position is at the beginning of the context.
[[nodiscard]] bool isFirstContextChunk() const noexcept
{
- return isFullContextRequest() || getContextCurrentPosition() == 0;
- }
-
- [[nodiscard]] executor::PriorityType priority() const noexcept
- {
- return mPriority;
+ return getContextCurrentPosition() == 0;
}
/// Move the cursor forward one chunk. When not chunked, move forward to the end of the context.
void moveToNextContextChunk()
{
TLLM_CHECK_WITH_INFO(isContextInitState(), "Chunking is only possible during the context phase.");
- if (mContextChunkSize)
- {
- mContextCurrentPosition += getContextChunkSize();
- setContextChunkSize(0);
- }
- else
- {
- TLLM_CHECK_WITH_INFO(mContextCurrentPosition == 0, "Full context out of bounds.");
- mContextCurrentPosition = mPromptLen;
- }
+ mContextCurrentPosition += getContextChunkSize();
+ setContextChunkSize(0);
+ }
+
+ [[nodiscard]] executor::PriorityType priority() const noexcept
+ {
+ return mPriority;
}
/// Increment the counter of decoding iterations.
@@ -1241,20 +1282,24 @@ class GenericLlmRequest
return static_cast(getMaxNumGeneratedTokens()) / mDecodingIter;
}
+ [[nodiscard]] bool isFinished() const noexcept
+ {
+ return isGenerationCompleteState() || isDisaggContextTransmissionState() || isCompleteWaitingToSendLogits();
+ }
+
/// @brief Create a Response from the current state of the request
/// @return An optional Response
- std::optional createResponse()
+ std::optional createResponse(bool useFastLogits = false, int32_t mpiWorldRank = 0)
{
TLLM_CHECK(!isDisaggContextCompleteState());
- if (isGenerationCompleteState() || (mIsStreaming && isGenerationInProgressState())
- || isDisaggContextTransmissionState())
+ if (isFinished() || (mIsStreaming && mState == LlmRequestState::kGENERATION_IN_PROGRESS))
{
TLLM_LOG_DEBUG("Creating response for request %lu", mRequestId);
executor::Result result;
result.sequenceIndex = mSequenceIndex;
- result.isSequenceFinal = isGenerationCompleteState() || isDisaggContextTransmissionState();
+ result.isSequenceFinal = isFinished();
mSequenceFinalVec->at(mSequenceIndex) = result.isSequenceFinal;
result.isFinal = std::all_of(mSequenceFinalVec->begin(), mSequenceFinalVec->end(),
@@ -1273,7 +1318,7 @@ class GenericLlmRequest
}
// TODO: fill the rank ids
result.contextPhaseParams = executor::ContextPhaseParams{
- std::move(firstGenTokens), mContextPhaseParams.value().releaseState()};
+ std::move(firstGenTokens), mRequestId, mContextPhaseParams.value().releaseState()};
}
auto const calculateNbTokensOut = [this](SizeType32 maxNbTokens)
@@ -1292,8 +1337,7 @@ class GenericLlmRequest
auto const startTokenPos = maxNbTokens - maxNbTokensOut;
- auto const shouldSendResponse = isGenerationCompleteState()
- || (mIsStreaming && maxNbTokens > getMaxSentTokenLen()) || isDisaggContextTransmissionState();
+ auto const shouldSendResponse = isFinished() || (mIsStreaming && maxNbTokens > getMaxSentTokenLen());
if (!shouldSendResponse)
{
@@ -1333,6 +1377,11 @@ class GenericLlmRequest
= runtime::ITensor::slice(getGenerationLogitsHost(), startGenTokenPos, maxNbTokensOut);
result.generationLogits = executor::detail::ofITensor(generationLogitsHostCurrentStep);
}
+ else if (useFastLogits)
+ {
+ result.specDecFastLogitsInfo
+ = executor::SpeculativeDecodingFastLogitsInfo{mRequestId, mpiWorldRank};
+ }
else
{
result.generationLogits = executor::detail::ofITensor(getGenerationLogitsHost());
@@ -1351,7 +1400,7 @@ class GenericLlmRequest
setMaxSentTokenLen(maxNbTokens);
auto requestId = isChild() ? mParentRequestId : mRequestId;
- auto response = executor::Response(requestId, std::move(result));
+ auto response = executor::Response(requestId, std::move(result), mClientId);
return response;
}
@@ -1372,12 +1421,29 @@ class GenericLlmRequest
mDecodingIter = iter;
}
+ void setKvCacheTransferStart(std::chrono::time_point const& time)
+ {
+ mKvCacheTransferStart = time;
+ }
+
+ void setKvCacheTransferEnd(std::chrono::time_point const& time)
+ {
+ mKvCacheTransferEnd = time;
+ }
+
+ [[nodiscard]] double getKvCacheTransferTimeMS() const
+ {
+ // get max with 0 in case this function is called while end time is not recorded
+ return std::max(
+ 0.0, std::chrono::duration(mKvCacheTransferEnd - mKvCacheTransferStart).count());
+ }
+
RequestIdType mRequestId;
SizeType32 mPromptLen;
SizeType32 mMaxNewTokens;
// Tokens [beam_size, mPromptLen + getMaxNumGeneratedTokens()]
runtime::SamplingConfig mSamplingConfig;
- LlmRequestState_t mState;
+ LlmRequestState mState;
std::optional mEndId;
std::optional mPadId;
std::optional mSeqSlot;
@@ -1425,8 +1491,8 @@ class GenericLlmRequest
// To enable chunked context, the FHMA paged kv-cache also needs to be enabled. Except for the last one,
// the size of the context chunk needs to be an integer multiple of the kv-cache block size. The meaning
// of null value is that the context is not chunked.
- std::optional mContextChunkSize;
- SizeType32 mContextCurrentPosition;
+ SizeType32 mContextChunkSize{0};
+ SizeType32 mContextCurrentPosition{0};
std::vector mLogProbs; // [beamSize, seqLen]
VecLogProbs mCumLogProbs; // [beamSize]
@@ -1476,6 +1542,9 @@ class GenericLlmRequest
RequestIdType mParentRequestId;
std::shared_ptr> mSequenceFinalVec; // Indicators whether each sibling completes generation.
+ std::chrono::time_point mKvCacheTransferStart;
+ std::chrono::time_point mKvCacheTransferEnd;
+
private:
void initialize(VecTokens const& inputTokens, bool outputLogProbs)
{
@@ -1490,8 +1559,8 @@ class GenericLlmRequest
{
if (mInputTokenExtraIds.value()->size() != inputTokens.size())
{
- std::string errStr = "inputTokenExtraIds vector size must be the same as input token vector size.";
- TLLM_THROW(errStr);
+ TLLM_THROW("inputTokenExtraIds vector size (%lu) must be the same as input token vector size (%lu).",
+ mInputTokenExtraIds.value()->size(), inputTokens.size());
}
VecTokenExtraIds tokenExtraIds = *mInputTokenExtraIds.value();
for (std::size_t i = 0; i < inputTokens.size(); ++i)
@@ -1575,6 +1644,8 @@ class GenericLlmRequest
class LlmRequest : public GenericLlmRequest
{
+ friend class LlmRequestBindings;
+
public:
using Base = GenericLlmRequest;
using TensorPtr = Base::TensorPtr;
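
The new `setPrepopulatedPromptLen(prepopulatedPromptLen, kvTokensPerBlock)` advances the context position past tokens recovered from reused KV blocks and then shrinks the current chunk so it still ends on a block boundary. Below is a small, self-contained walk-through of that flooring arithmetic; the numbers are illustrative only.

```cpp
#include <cassert>
#include <cstdio>

int main()
{
    int const kvTokensPerBlock = 64;
    int const promptLen = 300;
    int const prepopulatedPromptLen = 96; // tokens recovered from reused KV cache blocks
    int chunkSize = 128;                  // chunk size chosen by the scheduler

    if (prepopulatedPromptLen + chunkSize < promptLen)
    {
        // Floor the end position of the current chunk to a block boundary,
        // exactly as in the diff: (pre + chunk) / block * block - pre.
        int const flooredEndPosition = (prepopulatedPromptLen + chunkSize) / kvTokensPerBlock * kvTokensPerBlock;
        chunkSize = flooredEndPosition - prepopulatedPromptLen;
    }

    // 96 + 128 = 224 -> floored to 192, so the chunk shrinks from 128 to 96 tokens
    // and the chunk still ends at a multiple of 64, avoiding cache fragmentation.
    assert((prepopulatedPromptLen + chunkSize) % kvTokensPerBlock == 0);
    std::printf("adjusted chunk size: %d\n", chunkSize);
    return 0;
}
```
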
diff --git a/cpp/include/tensorrt_llm/batch_manager/microBatchScheduler.h b/cpp/include/tensorrt_llm/batch_manager/microBatchScheduler.h
new file mode 100644
index 000000000..2e932ba23
--- /dev/null
+++ b/cpp/include/tensorrt_llm/batch_manager/microBatchScheduler.h
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "common.h"
+#include "tensorrt_llm/batch_manager/llmRequest.h"
+#include "tensorrt_llm/common/algorithm.h"
+#include "tensorrt_llm/runtime/common.h"
+
+namespace tensorrt_llm::batch_manager
+{
+
+namespace batch_scheduler
+{
+
+struct ContextChunkingConfig
+{
+ ContextChunkingConfig() = default;
+
+ executor::ContextChunkingPolicy chunkingPolicy;
+ /// The minimum size, also known as the chunk unit size. It generally
+ /// needs to be equal to the size of the kv cache block or its integer
+ /// multiples (except for the last context chunk) to avoid fragmentation.
+ /// When set to null, it indicates that the context chunk is disabled.
+ tensorrt_llm::runtime::SizeType32 chunkUnitSize;
+};
+
+} // namespace batch_scheduler
+
+/// @brief This scheduler takes into account the desired batch size and limits of the TRT engine to schedule requests.
+class MicroBatchScheduler : Algorithm
+{
+public:
+ constexpr static auto name{"MicroBatchScheduler"};
+
+ using SizeType32 = tensorrt_llm::runtime::SizeType32;
+ using ContextChunkingPolicy = tensorrt_llm::executor::ContextChunkingPolicy;
+
+ MicroBatchScheduler() = default;
+
+ explicit MicroBatchScheduler(SizeType32 maxBatchSize, std::optional maxNumTokens = std::nullopt,
+ std::optional ctxChunkConfig = std::nullopt,
+ std::optional maxContextLength = std::nullopt,
+ LlmRequestState noScheduleUntilState = LlmRequestState::kCONTEXT_INIT,
+ LlmRequestState noScheduleAfterState = LlmRequestState::kGENERATION_COMPLETE);
+
+ static MicroBatchScheduler make(SizeType32 maxBatchSize, std::optional maxNumTokens = std::nullopt,
+ std::optional ctxChunkConfig = std::nullopt,
+ std::optional maxContextLength = std::nullopt,
+ LlmRequestState noScheduleUntilState = LlmRequestState::kCONTEXT_INIT,
+ LlmRequestState noScheduleAfterState = LlmRequestState::kGENERATION_COMPLETE)
+ {
+ return MicroBatchScheduler{
+ maxBatchSize, maxNumTokens, ctxChunkConfig, maxContextLength, noScheduleUntilState, noScheduleAfterState};
+ }
+
+ std::tuple operator()(
+ RequestVector const& activeRequests, ReqIdsSet const& inflightReqIds);
+
+ static void setCtxRequestsChunkSize(RequestVector const& contextsToBeChunked, ContextChunkingPolicy ctxChunkPolicy,
+ std::optional ctxTokensCapacity, SizeType32 chunkUnitSize,
+ std::optional const& maxContextLength);
+
+private:
+ template
+ static void setCtxRequestsChunkSize(RequestVector const& contextsToBeChunked,
+ std::optional ctxTokensCapacity, SizeType32 chunkUnitSize,
+ std::optional const& maxContextLength);
+
+ /// After the chunk sizes have been determined, this function will discard
+ /// any draft tokens that don't fit.
+ static void fitDraftTokens(RequestVector const& contextsToBeChunked, std::optional ctxTokensCapacity,
+ SizeType32 chunkUnitSize, std::optional const& maxContextLength);
+
+ /// The maximum number of requests returned by scheduleRequests
+ SizeType32 mMaxBatchSize;
+
+ /// The maximum number of tokens to include in a batch
+ std::optional mMaxNumTokens;
+
+ /// The maximum length of the context. If the context exceeds this length,
+ /// it must be chunked, otherwise it cannot be processed. Therefore, it
+ /// needs to be set together with the chunk unit size to make sense.
+ /// When set to null, it indicates that context length is unlimited.
+ std::optional mMaxContextLength;
+
+ std::optional mCtxChunkConfig;
+
+ /// The state until/after which the scheduler should not schedule requests
+ LlmRequestState mNoScheduleUntilState;
+ LlmRequestState mNoScheduleAfterState;
+};
+
+} // namespace tensorrt_llm::batch_manager
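
A hypothetical usage sketch of the new `MicroBatchScheduler` follows. Two things are assumptions not guaranteed by the excerpt above: the enumerator `executor::ContextChunkingPolicy::kFIRST_COME_FIRST_SERVED`, and that `operator()` returns the scheduled context and generation request vectors (its tuple element types are elided in this excerpt); the batch and token limits are placeholders.

```cpp
#include "tensorrt_llm/batch_manager/microBatchScheduler.h"

namespace tb = tensorrt_llm::batch_manager;

void scheduleOneIteration(tb::RequestVector const& activeRequests, tb::ReqIdsSet const& inflightReqIds)
{
    tb::batch_scheduler::ContextChunkingConfig chunkingConfig;
    chunkingConfig.chunkingPolicy = tensorrt_llm::executor::ContextChunkingPolicy::kFIRST_COME_FIRST_SERVED;
    chunkingConfig.chunkUnitSize = 64; // typically the KV cache block size or a multiple of it

    // At most 8 requests and 8192 total tokens per micro batch; contexts longer
    // than 4096 tokens must be chunked.
    auto scheduler = tb::MicroBatchScheduler::make(
        /*maxBatchSize=*/8, /*maxNumTokens=*/8192, chunkingConfig, /*maxContextLength=*/4096);

    auto [contextRequests, generationRequests] = scheduler(activeRequests, inflightReqIds);
    // Hand the two micro batches to the context and generation execution paths.
}
```
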
diff --git a/cpp/include/tensorrt_llm/batch_manager/peftCacheManager.h b/cpp/include/tensorrt_llm/batch_manager/peftCacheManager.h
index 65808134b..f86e76b4b 100644
--- a/cpp/include/tensorrt_llm/batch_manager/peftCacheManager.h
+++ b/cpp/include/tensorrt_llm/batch_manager/peftCacheManager.h
@@ -51,6 +51,8 @@ class PeftTaskNotCachedException : public runtime::LoraExpectedException
class BasePeftCacheManager
{
public:
+ friend class BasePeftCacheManagerBindings;
+
using LlmRequestPtr = std::shared_ptr;
using RequestVector = std::vector;
using PeftTable = std::map>>;
diff --git a/cpp/include/tensorrt_llm/batch_manager/trtGptModelOptionalParams.h b/cpp/include/tensorrt_llm/batch_manager/trtGptModelOptionalParams.h
index fc61fd581..4a430d8c1 100644
--- a/cpp/include/tensorrt_llm/batch_manager/trtGptModelOptionalParams.h
+++ b/cpp/include/tensorrt_llm/batch_manager/trtGptModelOptionalParams.h
@@ -46,7 +46,9 @@ class TrtGptModelOptionalParams
executor::SchedulerConfig const& schedulerConfig = executor::SchedulerConfig{},
executor::ExtendedRuntimePerfKnobConfig const& extendedRuntimePerfKnobConfig
= executor::ExtendedRuntimePerfKnobConfig{},
- std::optional debugConfig = std::nullopt, uint64_t maxSeqIdleMicroseconds = 180000000)
+ std::optional debugConfig = std::nullopt, uint64_t maxSeqIdleMicroseconds = 180000000,
+ std::optional specDecConfig = std::nullopt,
+ bool isLeaderInOrchMode = false)
: kvCacheConfig{kvCacheConfig}
, enableTrtOverlap{enableTrtOverlap}
, deviceIds(deviceIds)
@@ -62,10 +64,12 @@ class TrtGptModelOptionalParams
, extendedRuntimePerfKnobConfig(extendedRuntimePerfKnobConfig)
, debugConfig{std::move(debugConfig)}
, maxSeqIdleMicroseconds{maxSeqIdleMicroseconds}
+ , speculativeDecodingConfig{std::move(specDecConfig)}
+ , isLeaderInOrchMode{isLeaderInOrchMode}
{
}
- explicit TrtGptModelOptionalParams(executor::ExecutorConfig const& executorConfig)
+ explicit TrtGptModelOptionalParams(executor::ExecutorConfig const& executorConfig, bool isLeaderInOrchMode)
: TrtGptModelOptionalParams(KvCacheConfig(executorConfig.getKvCacheConfig()), false,
executorConfig.getParallelConfig().value_or(executor::ParallelConfig()).getDeviceIds(),
executorConfig.getNormalizeLogProbs(), executorConfig.getEnableChunkedContext(),
@@ -74,16 +78,7 @@ class TrtGptModelOptionalParams
executorConfig.getGpuWeightsPercent(), executorConfig.getMaxBeamWidth(), executorConfig.getMaxBatchSize(),
executorConfig.getMaxNumTokens(), executorConfig.getSchedulerConfig(),
executorConfig.getExtendedRuntimePerfKnobConfig(), executorConfig.getDebugConfig(),
- executorConfig.getMaxSeqIdleMicroseconds())
- {
- }
-
- // Copy constructor
- TrtGptModelOptionalParams(TrtGptModelOptionalParams const& other)
- : TrtGptModelOptionalParams(other.kvCacheConfig, other.enableTrtOverlap, other.deviceIds,
- other.normalizeLogProbs, other.enableChunkedContext, other.peftCacheManagerConfig, other.decodingConfig,
- other.gpuWeightsPercent, other.maxBeamWidth, other.maxBatchSize, other.maxNumTokens, other.schedulerConfig,
- other.extendedRuntimePerfKnobConfig, other.debugConfig, other.maxSeqIdleMicroseconds)
+ executorConfig.getMaxSeqIdleMicroseconds(), executorConfig.getSpecDecConfig(), isLeaderInOrchMode)
{
}
@@ -103,6 +98,8 @@ class TrtGptModelOptionalParams
&& extendedRuntimePerfKnobConfig == other.extendedRuntimePerfKnobConfig //
&& debugConfig == other.debugConfig //
&& maxSeqIdleMicroseconds == other.maxSeqIdleMicroseconds //
+ && speculativeDecodingConfig == other.speculativeDecodingConfig //
+ && isLeaderInOrchMode == other.isLeaderInOrchMode //
;
}
@@ -126,6 +123,9 @@ class TrtGptModelOptionalParams
std::optional debugConfig;
// Sequence is considered idle if not updated for this amount of time.
uint64_t maxSeqIdleMicroseconds;
+ std::optional speculativeDecodingConfig;
+ // This rank is the leader worker in orchestrator mode
+ bool isLeaderInOrchMode;
};
} // namespace tensorrt_llm::batch_manager
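
With the explicit copy constructor removed and the `ExecutorConfig` constructor now taking `isLeaderInOrchMode`, the optional params are built once from the executor configuration and carry the speculative-decoding config along. A brief sketch of that construction path, with the leader flag supplied by whatever orchestration logic the caller uses:

```cpp
#include "tensorrt_llm/batch_manager/trtGptModelOptionalParams.h"
#include "tensorrt_llm/executor/executor.h"

namespace tb = tensorrt_llm::batch_manager;
namespace exec = tensorrt_llm::executor;

tb::TrtGptModelOptionalParams makeOptionalParams(exec::ExecutorConfig const& executorConfig, bool isOrchestratorLeader)
{
    // The new second argument replaces the copy constructor removed in the diff:
    // orchestration-related state is fixed at construction time.
    return tb::TrtGptModelOptionalParams(executorConfig, isOrchestratorLeader);
}
```
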
diff --git a/cpp/include/tensorrt_llm/common/algorithm.h b/cpp/include/tensorrt_llm/common/algorithm.h
new file mode 100644
index 000000000..9363504f7
--- /dev/null
+++ b/cpp/include/tensorrt_llm/common/algorithm.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+namespace tensorrt_llm
+{
+
+// Base class for algorithms
+struct Algorithm
+{
+ Algorithm() = default;
+ Algorithm(Algorithm&&) = default;
+ Algorithm& operator=(Algorithm&&) = default;
+ Algorithm(Algorithm const&) = delete;
+ Algorithm& operator=(Algorithm const&) = delete;
+};
+
+} // namespace tensorrt_llm
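
The new `Algorithm` base makes every derived algorithm movable but not copyable, matching the construct-once style of `MicroBatchScheduler` above. A minimal sketch; the base is reproduced from the header so the snippet stands alone.

```cpp
#include <type_traits>

namespace tensorrt_llm
{
// Reproduced from cpp/include/tensorrt_llm/common/algorithm.h above.
struct Algorithm
{
    Algorithm() = default;
    Algorithm(Algorithm&&) = default;
    Algorithm& operator=(Algorithm&&) = default;
    Algorithm(Algorithm const&) = delete;
    Algorithm& operator=(Algorithm const&) = delete;
};
} // namespace tensorrt_llm

struct MyAlgorithm : tensorrt_llm::Algorithm
{
    constexpr static auto name{"MyAlgorithm"};

    int operator()(int x) const
    {
        return x + 1; // stand-in for a real scheduling/assignment pass
    }
};

// Deriving from Algorithm deletes the copy operations but keeps move available.
static_assert(std::is_move_constructible_v<MyAlgorithm>);
static_assert(!std::is_copy_constructible_v<MyAlgorithm>);
```
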
diff --git a/cpp/include/tensorrt_llm/common/cudaUtils.h b/cpp/include/tensorrt_llm/common/cudaUtils.h
index 71657c0bb..023f97d87 100644
--- a/cpp/include/tensorrt_llm/common/cudaUtils.h
+++ b/cpp/include/tensorrt_llm/common/cudaUtils.h
@@ -161,7 +161,7 @@ inline std::optional isCudaLaunchBlocking()
return result;
}
-inline void syncAndCheck(char const* const file, int const line)
+inline bool doCheckError()
{
auto const cudaLaunchBlocking = isCudaLaunchBlocking();
#ifndef NDEBUG
@@ -170,7 +170,12 @@ inline void syncAndCheck(char const* const file, int const line)
bool const checkError = cudaLaunchBlocking.value_or(false);
#endif
- if (checkError)
+ return checkError;
+}
+
+inline void syncAndCheck(char const* const file, int const line)
+{
+ if (doCheckError())
{
check(cudaGetLastError(), "cudaGetLastError", file, line);
check(cudaDeviceSynchronize(), "cudaDeviceSynchronize", file, line);
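
Splitting `doCheckError()` out of `syncAndCheck()` lets other debug-only checks reuse the same policy (honor `CUDA_LAUNCH_BLOCKING` at runtime, always check in debug builds). A small sketch of that reuse, assuming these helpers live in `tensorrt_llm::common` as the header's location suggests:

```cpp
#include <cuda_runtime_api.h>

#include "tensorrt_llm/common/cudaUtils.h"

inline void checkLastKernelIfRequested(char const* file, int line)
{
    namespace tc = tensorrt_llm::common;
    // Same policy and same checks as syncAndCheck(), but callable from any
    // site that wants an optional synchronization point.
    if (tc::doCheckError())
    {
        tc::check(cudaGetLastError(), "cudaGetLastError", file, line);
        tc::check(cudaDeviceSynchronize(), "cudaDeviceSynchronize", file, line);
    }
}
```
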
diff --git a/cpp/include/tensorrt_llm/common/mpiUtils.h b/cpp/include/tensorrt_llm/common/mpiUtils.h
index edf3da004..d5801f36c 100644
--- a/cpp/include/tensorrt_llm/common/mpiUtils.h
+++ b/cpp/include/tensorrt_llm/common/mpiUtils.h
@@ -99,7 +99,6 @@ struct MpiTypeConverter
};
template <>
-
struct MpiTypeConverter
{
@@ -380,9 +379,14 @@ class MpiComm
void allreduce(void const* sendbuf, void* recvbuf, int count, MpiType dtype, MpiOp op) const;
void allgather(void const* sendbuf, void* recvbuf, int count, MpiType dtype) const;
+
+ void allgatherv(void const* sendbuf, int sendcount, MpiType sendtype, void* recvbuf,
+ std::vector const& recvcounts, std::vector const& displs, MpiType recvtype) const;
+
void barrier() const;
void mprobe(int source, int tag, MPI_Message* msg, MPI_Status* status) const;
+ bool improbe(int source, int tag, MPI_Message* msg, MPI_Status* status) const;
//! \brief Returns if a message with the specified source and tag is available
bool iprobe(int source, int tag, MPI_Status* status) const;
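
The new `MpiComm::allgatherv` wrapper exposes a variable-length all-gather, where `recvcounts` and `displs` describe how many elements each rank contributes and where its chunk lands in the gathered buffer. A plain-MPI illustration of those semantics (using the raw MPI C API directly rather than the wrapper):

```cpp
#include <mpi.h>

#include <vector>

int main(int argc, char** argv)
{
    MPI_Init(&argc, &argv);

    int rank = 0;
    int worldSize = 1;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &worldSize);

    // Rank r contributes r + 1 integers.
    std::vector<int> sendBuf(rank + 1, rank);

    // Every rank needs the counts and displacements of all ranks.
    std::vector<int> recvCounts(worldSize);
    std::vector<int> displs(worldSize);
    int total = 0;
    for (int r = 0; r < worldSize; ++r)
    {
        recvCounts[r] = r + 1;
        displs[r] = total;
        total += recvCounts[r];
    }

    std::vector<int> recvBuf(total);
    MPI_Allgatherv(sendBuf.data(), static_cast<int>(sendBuf.size()), MPI_INT, recvBuf.data(), recvCounts.data(),
        displs.data(), MPI_INT, MPI_COMM_WORLD);

    MPI_Finalize();
    return 0;
}
```
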
diff --git a/cpp/include/tensorrt_llm/executor/executor.h b/cpp/include/tensorrt_llm/executor/executor.h
index a96c24d43..e6e5e1e0e 100644
--- a/cpp/include/tensorrt_llm/executor/executor.h
+++ b/cpp/include/tensorrt_llm/executor/executor.h
@@ -43,7 +43,7 @@ char const* version() noexcept;
class Model;
class Serialization;
-class ContextPhaseState;
+class DataTransceiverState;
/// @brief Sampling configuration
class SamplingConfig
@@ -186,11 +186,13 @@ class ExternalDraftTokensConfig
{
public:
explicit ExternalDraftTokensConfig(VecTokens tokens, std::optional logits = std::nullopt,
- std::optional const& acceptanceThreshold = std::nullopt);
+ std::optional const& acceptanceThreshold = std::nullopt,
+ std::optional const& fastLogits = std::nullopt);
[[nodiscard]] VecTokens getTokens() const;
[[nodiscard]] std::optional getLogits() const;
[[nodiscard]] std::optional getAcceptanceThreshold() const;
+ [[nodiscard]] std::optional getFastLogits() const;
private:
friend class Serialization;
@@ -200,6 +202,8 @@ class ExternalDraftTokensConfig
std::optional mLogits;
/// @brief The acceptance threshold. Must be > 0.f and <= 1.f
std::optional mAcceptanceThreshold;
+ /// @brief Use direct transfer for draft logits
+ std::optional mFastLogits;
};
/// @brief Configuration for prompt tuning
@@ -283,8 +287,10 @@ struct LookaheadDecodingConfig
class ContextPhaseParams
{
public:
- explicit ContextPhaseParams(VecTokens firstGenTokens);
- ContextPhaseParams(VecTokens firstGenTokens, void* state);
+ using RequestIdType = std::uint64_t;
+
+ explicit ContextPhaseParams(VecTokens firstGenTokens, RequestIdType reqId);
+ ContextPhaseParams(VecTokens firstGenTokens, RequestIdType reqId, void* state);
ContextPhaseParams(ContextPhaseParams const&);
ContextPhaseParams(ContextPhaseParams&&);
@@ -295,6 +301,8 @@ class ContextPhaseParams
[[nodiscard]] VecTokens const& getFirstGenTokens() const& noexcept;
[[nodiscard]] VecTokens popFirstGenTokens() && noexcept;
+ [[nodiscard]] RequestIdType getReqId() const noexcept;
+
[[nodiscard]] void const* getState() const noexcept;
[[nodiscard]] void* getState() noexcept;
[[nodiscard]] void* releaseState() noexcept;
@@ -304,6 +312,9 @@ class ContextPhaseParams
static void deleter(void const* data);
using StatePtr = std::unique_ptr;
+ /// @brief This request corresponds to the request ID in the context phase.
+ RequestIdType mReqId{0};
+
/// @brief The first tokens generated by context executor
VecTokens mFirstGenTokens;
@@ -311,6 +322,18 @@ class ContextPhaseParams
StatePtr mState{nullptr, deleter};
};
+/// @brief Configuration for speculative decoding (both draft and target models)
+class SpeculativeDecodingConfig
+{
+public:
+ explicit SpeculativeDecodingConfig(bool fastLogits);
+
+ bool operator==(SpeculativeDecodingConfig const& other) const;
+
+ /// @brief Send logits tensor directly from draft to target model.
+ bool fastLogits;
+};
+
/// @brief A class that holds information about the request
class Request
{
@@ -430,6 +453,16 @@ class Request
std::unique_ptr mImpl;
};
+/// @brief Struct that holds the logits information when using direct transfer
+struct SpeculativeDecodingFastLogitsInfo
+{
+ /// @brief Draft request id
+ uint64_t draftRequestId;
+
+ /// @brief MPI world rank of the draft model leader
+ int32_t draftParticipantId;
+};
+
/// @brief Struct that holds the generation result
struct Result
{
@@ -448,11 +481,14 @@ struct Result
/// @brief The context logits. Size [promptLen, vocabSizePadded]
std::optional contextLogits;
- /// @brief The context logits. Size [beamSize, maxNewTokens, vocabSizePadded] (non-streaming)
+ /// @brief The generation logits. Size [beamSize, maxNewTokens, vocabSizePadded] (non-streaming)
/// or [maxNewTokens, beamSize, vocabSizePadded] (streaming and allGeneratedTokens)
/// or [1, beamSize, vocabSizePadded] (streaming and non-allGeneratedTokens)
std::optional generationLogits;
+ /// @brief Logits information for direct transfer when using fast logits
+ std::optional specDecFastLogitsInfo;
+
/// @brief The encoder output. Size [encoderLen, hiddenSize]
std::optional encoderOutput;
@@ -477,8 +513,8 @@ struct Result
class Response
{
public:
- Response(IdType requestId, std::string errorMsg);
- Response(IdType requestId, Result Result);
+ Response(IdType requestId, std::string errorMsg, std::optional clientId = std::nullopt);
+ Response(IdType requestId, Result Result, std::optional clientId = std::nullopt);
~Response();
Response(Response const& other);
@@ -489,6 +525,9 @@ class Response
/// @brief Get the id of the request for which this response was generated
[[nodiscard]] IdType getRequestId() const;
+ /// @brief Get the client id of the request for which this response was generated
+ [[nodiscard]] std::optional getClientId() const;
+
/// @brief Indicates if this response has an error or not
[[nodiscard]] bool hasError() const;
@@ -538,13 +577,15 @@ class KvCacheConfig
std::optional> const& maxAttentionWindowVec = std::nullopt,
std::optional const& sinkTokenLength = std::nullopt,
std::optional const& freeGpuMemoryFraction = std::nullopt,
- std::optional const& hostCacheSize = std::nullopt, bool onboardBlocks = true);
+ std::optional const& hostCacheSize = std::nullopt, bool onboardBlocks = true,
+ std::optional const& crossKvCacheFraction = std::nullopt);
[[nodiscard]] bool getEnableBlockReuse() const;
[[nodiscard]] std::optional getMaxTokens() const;
[[nodiscard]] std::optional> getMaxAttentionWindowVec() const;
[[nodiscard]] std::optional getSinkTokenLength() const;
[[nodiscard]] std::optional getFreeGpuMemoryFraction() const;
+ [[nodiscard]] std::optional getCrossKvCacheFraction() const;
[[nodiscard]] std::optional getHostCacheSize() const;
[[nodiscard]] bool getOnboardBlocks() const;
@@ -553,6 +594,7 @@ class KvCacheConfig
void setMaxAttentionWindowVec(std::vector maxAttentionWindowVec);
void setSinkTokenLength(SizeType32 sinkTokenLength);
void setFreeGpuMemoryFraction(FloatType freeGpuMemoryFraction);
+ void setCrossKvCacheFraction(FloatType crossKvCacheFraction);
void setHostCacheSize(size_t hostCacheSize);
void setOnboardBlocks(bool onboardBlocks);
@@ -581,6 +623,12 @@ class KvCacheConfig
/// allocated.
std::optional mFreeGpuMemoryFraction;
+ /// @brief The fraction of the KV Cache memory should be reserved for cross attention
+ /// If set to p, self attention will use 1-p of KV Cache memory and cross attention
+ /// will use p of KV Cache memory. Default is 50%.
+ /// Should only be set when using encoder-decoder model.
+ std::optional mCrossKvCacheFraction;
+
/// @brief Size of secondary memory pool in bytes. Default is 0.
/// Having a secondary memory pool increases KV cache block reuse potential.
std::optional mHostCacheSize;
@@ -593,18 +641,24 @@ class KvCacheConfig
class ExtendedRuntimePerfKnobConfig
{
public:
- explicit ExtendedRuntimePerfKnobConfig(bool multiBlockMode = true, bool enableContextFMHAFP32Acc = false);
+ explicit ExtendedRuntimePerfKnobConfig(bool multiBlockMode = true, bool enableContextFMHAFP32Acc = false,
+ bool cudaGraphMode = false, SizeType32 cudaGraphCacheSize = 0);
bool operator==(ExtendedRuntimePerfKnobConfig const& other) const
{
- return mMultiBlockMode == other.mMultiBlockMode && mEnableContextFMHAFP32Acc == other.mEnableContextFMHAFP32Acc;
+ return mMultiBlockMode == other.mMultiBlockMode && mEnableContextFMHAFP32Acc == other.mEnableContextFMHAFP32Acc
+ && mCudaGraphMode == other.mCudaGraphMode && mCudaGraphCacheSize == other.mCudaGraphCacheSize;
}
[[nodiscard]] bool getMultiBlockMode() const;
[[nodiscard]] bool getEnableContextFMHAFP32Acc() const;
+ [[nodiscard]] bool getCudaGraphMode() const;
+ [[nodiscard]] SizeType32 getCudaGraphCacheSize() const;
void setMultiBlockMode(bool multiBlockMode);
void setEnableContextFMHAFP32Acc(bool enableContextFMHAFP32Acc);
+ void setCudaGraphMode(bool cudaGraphMode);
+ void setCudaGraphCacheSize(SizeType32 cacheSize);
private:
friend class Serialization;
@@ -614,6 +668,13 @@ class ExtendedRuntimePerfKnobConfig
/// @brief If enable FMHA runner FP32 accumulation.
bool mEnableContextFMHAFP32Acc;
+
+ /// @brief Control if enable cuda graph.
+ bool mCudaGraphMode;
+
+ /// @brief Number of cuda graphs to be cached in the runtime.
+ /// The larger the cache, the better the perf, but more GPU memory is consumed.
+ SizeType32 mCudaGraphCacheSize;
};
/// @brief Configuration class for debugging output
@@ -622,27 +683,33 @@ class DebugConfig
using StringVec = std::vector;
public:
- explicit DebugConfig(bool dumpInputTensors = false, bool dumpOuputTensors = false, StringVec debugTensorNames = {});
+ explicit DebugConfig(bool debugInputTensors = false, bool debugOutputTensors = false,
+ StringVec debugTensorNames = {}, SizeType32 debugTensorsMaxIterations = 0);
bool operator==(DebugConfig const& other) const;
- [[nodiscard]] bool getDumpInputTensors() const;
- [[nodiscard]] bool getDumpOutputTensors() const;
+ [[nodiscard]] bool getDebugInputTensors() const;
+ [[nodiscard]] bool getDebugOutputTensors() const;
[[nodiscard]] StringVec const& getDebugTensorNames() const;
+ [[nodiscard]] SizeType32 getDebugTensorsMaxIterations() const;
- void setDumpInputTensors(bool dumpInputTensors);
- void setDumpOuputTensors(bool dumpOuputTensors);
+ void setDebugInputTensors(bool debugInputTensors);
+ void setDebugOutputTensors(bool debugOutputTensors);
void setDebugTensorNames(StringVec const& debugTensorNames);
+ void setDebugTensorsMaxIterations(SizeType32 debugTensorsMaxIterations);
private:
friend class Serialization;
- /// @brief If true, dump all input tensors.
- bool mDumpInputTensors;
- /// @brief If true, dump all output tensors.
- bool mDumpOuputTensors;
- /// @brief If not empty, only dump tensors in this list.
+ /// @brief If true, debug all input tensors.
+ bool mDebugInputTensors;
+ /// @brief If true, debug all output tensors.
+ bool mDebugOutputTensors;
+ /// @brief If not empty, only debug tensors in this list.
StringVec mDebugTensorNames;
+ /// @brief If > 0, provide debug tensors for at most debugTensorsMaxIterations past iterations,
+ /// else dump them to files.
+ SizeType32 mDebugTensorsMaxIterations;
};
SizeType32 const kDefaultIterStatsMaxIterations = 1000;
@@ -847,7 +914,8 @@ class ExecutorConfig
std::optional maxQueueSize = std::nullopt,
ExtendedRuntimePerfKnobConfig const& extendedRuntimePerfKnobConfig = ExtendedRuntimePerfKnobConfig(),
std::optional debugConfig = std::nullopt, SizeType32 recvPollPeriodMs = 0,
- uint64_t maxSeqIdleMicroseconds = 180000000);
+ uint64_t maxSeqIdleMicroseconds = 180000000,
+ std::optional specDecConfig = std::nullopt);
[[nodiscard]] SizeType32 getMaxBeamWidth() const;
[[nodiscard]] SchedulerConfig getSchedulerConfig() const;
@@ -869,6 +937,7 @@ class ExecutorConfig
[[nodiscard]] std::optional getDebugConfig() const;
[[nodiscard]] SizeType32 getRecvPollPeriodMs() const;
[[nodiscard]] uint64_t getMaxSeqIdleMicroseconds() const;
+ [[nodiscard]] std::optional getSpecDecConfig() const;
void setMaxBeamWidth(SizeType32 maxBeamWidth);
void setMaxBatchSize(SizeType32 maxBatchSize);
@@ -890,6 +959,7 @@ class ExecutorConfig
void setDebugConfig(DebugConfig const& debugConfig);
void setRecvPollPeriodMs(SizeType32 const& recvPollPeriodMs);
void setMaxSeqIdleMicroseconds(uint64_t maxNumTokens);
+ void setSpecDecConfig(SpeculativeDecodingConfig const& specDecConfig);
private:
friend class Serialization;
@@ -952,6 +1022,9 @@ class ExecutorConfig
/// @brief The maximum time in microseconds a scheduled request can remain idle before getting terminated. Default
/// is 3 minutes.
uint64_t mMaxSeqIdleMicroseconds;
+
+ /// @brief The speculative decoding configuration
+ std::optional mSpeculativeDecodingConfig;
};
/// @brief The executor is responsible for receiving new requests and sending responses, and running the inference
@@ -1032,23 +1105,31 @@ class Executor
/// @param id The request id for which to cancel the response
void cancelRequest(IdType requestId);
- /// @brief Signals the server to shutdown
- /// This call is blocking. Only returns when all requests have terminated or timeout has been reached
+ /// @brief Signals the server to shutdown.
+ /// @details This call is blocking. Only returns when all requests have terminated or timeout has been reached
void shutdown();
- /// @brief Returns the per-iterations statistics computed since last call to getLatestIterationStats
- /// Contains at most iterStatsMaxIterations iterations
+ /// @brief Returns the per-iterations statistics computed since last call to getLatestIterationStats.
+ /// Contains at most iterStatsMaxIterations iterations.
/// @return Iteration stats
std::deque getLatestIterationStats();
- /// @brief Returns the request stats of each iteration computed since last call to getLatestRequestStats
- /// Contains at most requestStatsMaxIterations iterations
+ /// @brief Returns the request stats of each iteration computed since last call to getLatestRequestStats.
+ /// Contains at most requestStatsMaxIterations iterations.
/// @return Request stats grouped by iterations
std::deque getLatestRequestStats();
+ /// @brief Returns the debug tensors of each iteration computed since last call to getLatestDebugTensors.
+ /// Contains at most debugTensorsMaxIterations iterations.
+ /// @return Request debug tensors grouped by iterations
+ std::deque getLatestDebugTensors();
+
/// @brief Indicates if the current process is allowed to enqueueRequests
[[nodiscard]] bool canEnqueueRequests() const;
+ /// @brief Indicates if the current process participates in this executor instance
+ [[nodiscard]] bool isParticipant() const;
+
private:
class Impl;
std::unique_ptr mImpl;
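
A hedged sketch of wiring up the options added above: the speculative-decoding fast-logits path, the CUDA graph knobs, and in-memory debug tensors. Only constructors and setters shown in this excerpt are used, except `setExtendedRuntimePerfKnobConfig`, which is assumed to exist; all values are placeholders.

```cpp
#include "tensorrt_llm/executor/executor.h"

namespace exec = tensorrt_llm::executor;

exec::ExecutorConfig makeExecutorConfig()
{
    exec::ExecutorConfig config;

    // Draft and target models exchange draft logits directly instead of
    // copying them through the response path.
    config.setSpecDecConfig(exec::SpeculativeDecodingConfig(/*fastLogits=*/true));

    // Enable CUDA graphs and keep up to 8 captured graphs cached in the runtime.
    exec::ExtendedRuntimePerfKnobConfig perfKnobs;
    perfKnobs.setCudaGraphMode(true);
    perfKnobs.setCudaGraphCacheSize(8);
    config.setExtendedRuntimePerfKnobConfig(perfKnobs); // setter not shown in this excerpt; assumed to exist

    // Keep debug tensors for the last 3 iterations in memory instead of dumping them to files.
    exec::DebugConfig debugConfig;
    debugConfig.setDebugTensorsMaxIterations(3);
    config.setDebugConfig(debugConfig);

    return config;
}
```
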
diff --git a/cpp/include/tensorrt_llm/executor/serialization.h b/cpp/include/tensorrt_llm/executor/serialization.h
index 11d22c3f0..28aba9dc1 100644
--- a/cpp/include/tensorrt_llm/executor/serialization.h
+++ b/cpp/include/tensorrt_llm/executor/serialization.h
@@ -75,10 +75,10 @@ class Serialization
static void serialize(kv_cache::CacheState const& state, std::ostream& os);
[[nodiscard]] static size_t serializedSize(kv_cache::CacheState const& state);
- // ContextPhaseState
- [[nodiscard]] static ContextPhaseState deserializeContextPhaseState(std::istream& is);
- static void serialize(ContextPhaseState const& contextPhaseState, std::ostream& os);
- [[nodiscard]] static size_t serializedSize(ContextPhaseState const& contextPhaseState);
+ // DataTransceiverState
+ [[nodiscard]] static DataTransceiverState deserializeDataTransceiverState(std::istream& is);
+ static void serialize(DataTransceiverState const& dataTransceiverState, std::ostream& os);
+ [[nodiscard]] static size_t serializedSize(DataTransceiverState const& dataTransceiverState);
// ContextPhaseParams
[[nodiscard]] static ContextPhaseParams deserializeContextPhaseParams(std::istream& is);
@@ -95,6 +95,11 @@ class Serialization
static void serialize(Tensor const& tensor, std::ostream& os);
[[nodiscard]] static size_t serializedSize(Tensor const& tensor);
+ // SpeculativeDecodingFastLogitsInfo
+ [[nodiscard]] static SpeculativeDecodingFastLogitsInfo deserializeSpecDecFastLogitsInfo(std::istream& is);
+ static void serialize(SpeculativeDecodingFastLogitsInfo const& info, std::ostream& os);
+ [[nodiscard]] static size_t serializedSize(SpeculativeDecodingFastLogitsInfo const& info);
+
// Result
[[nodiscard]] static Result deserializeResult(std::istream& is);
static void serialize(Result const& result, std::ostream& os);
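
A hypothetical round trip through the new `SpeculativeDecodingFastLogitsInfo` serialization entry points; the struct and the static methods come from the headers above, while the stream handling is ordinary iostreams.

```cpp
#include <sstream>

#include "tensorrt_llm/executor/executor.h"
#include "tensorrt_llm/executor/serialization.h"

namespace exec = tensorrt_llm::executor;

exec::SpeculativeDecodingFastLogitsInfo roundTrip(exec::SpeculativeDecodingFastLogitsInfo const& info)
{
    // Serialize into an in-memory buffer...
    std::ostringstream os;
    exec::Serialization::serialize(info, os);

    // ...and reconstruct the struct from the same bytes.
    std::istringstream is(os.str());
    return exec::Serialization::deserializeSpecDecFastLogitsInfo(is);
}
```
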
diff --git a/cpp/include/tensorrt_llm/executor/types.h b/cpp/include/tensorrt_llm/executor/types.h
index e07c539a9..5a8525caf 100644
--- a/cpp/include/tensorrt_llm/executor/types.h
+++ b/cpp/include/tensorrt_llm/executor/types.h
@@ -18,6 +18,7 @@
#include
#include
+#include